n02_ningbo.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json,time
  4. from setting import proxies
  5. # from urllib import parse
  6. # from pymongo import MongoClient
  7. # myclient = MongoClient("mongodb://127.0.0.1:27017/")
  8. # myco2 = myclient['shuiwu06']['02_nb']
  9. # myco2_b = myclient['shuiwu06']['02_nb']
  10. from lxml import etree
  11. from mongo_cho import myco2,myco2_b
  12. r = requests.session()
  13. r.keep_alive = False
  14. #########见30的
  15. # http://ningbo.chinatax.gov.cn/col/col6300/index.html
  16. def r1(year,mon,day):
  17. dict1 = {}
  18. url = 'http://ningbo.chinatax.gov.cn/art/{year}/{mon}/{day}/art_6166_7114.html'.format(year=year,mon=mon,day=mon)
  19. headers = {
  20. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  21. }
  22. html = requests.get(url=url,headers=headers,proxies=proxies)
  23. selector = etree.HTML(html)
  24. a = selector.xpath('//div[@id="zoom"]//table//tr')
  25. for i in a:
  26. k1 = i.xpath('td[1]')
  27. if k1:
  28. k2 = k1[0].xpath('string(.)').strip()
  29. k3 = k2.replace('\r','').replace('\t','').replace('\n','').replace(' ','')
  30. # print(k2)
  31. v1 = i.xpath('td[2]')
  32. if v1:
  33. v2 = v1[0].xpath('string(.)').strip()
  34. v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  35. # print(v3)
  36. else:
  37. v3 = ''
  38. dict1[k3] = v3
  39. dict1['url'] = url
  40. dict1['date'] = str(year)+'/'+str('04')
  41. print(dict1)
  42. myco2_b.insert_one(dict1)
  43. for i in range(1,30):
  44. r1('2023','4',i)