# n21_jiangxi.py
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re,time
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco21,myco21_b,r_myco15
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. @retry(3)
  13. def r2(dt,uid):
  14. url = 'http://jiangxi.chinatax.gov.cn/taxmap/front/getdetail.do'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
  17. }
  18. params = {
  19. "iid":uid
  20. }
  21. response = r.get(url=url,params=params,headers=headers,proxies=proxies)
  22. response.encoding = 'UTF-8'
  23. html = response.text
  24. # print(html)
  25. selector = etree.HTML(html)
  26. a = selector.xpath('//table[@class="xxTable"]//tr')
  27. dict1 = {}
  28. for i in a:
  29. k1 = i.xpath('th/text()')
  30. if k1:
  31. str1 = ''
  32. for kk1 in k1:
  33. str1 += kk1
  34. v1 =i.xpath('td/text()')
  35. str2 = ''
  36. if v1:
  37. for vv1 in v1:
  38. str2 += vv1
  39. dict1[str1] = str2
  40. dict1['date'] = dt
  41. dict1['uid'] = uid
  42. # print(dict1)
  43. return dict1
  44. # r2()
  45. @retry(3)
  46. def r1(pg):
  47. dt = '2021/05'
  48. url = 'http://jiangxi.chinatax.gov.cn/taxmap/front/result2.do'
  49. headers = {
  50. "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
  51. }
  52. params = {
  53. "region":"",
  54. "nature": "",
  55. "year": "",
  56. "pageno": pg,
  57. "_": int(round(time.time() * 1000)),
  58. }
  59. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  60. html = response.text
  61. # print(html)
  62. a = re.findall('getDetail\((.*?)\)',html)
  63. list1 = []
  64. list2 = []
  65. for uid in a:
  66. if uid != 'iid':
  67. print(uid)
  68. utf = r_myco15.sismember('n21', uid) ##更改
  69. if not utf:
  70. rsd = r2(dt, uid)
  71. list1.append(rsd)
  72. list2.append(uid)
  73. else:
  74. print('已存在,>>>n21')
  75. if list1:
  76. myco21.insert_many(list1)
  77. print('已存入原始库')
  78. if list2:
  79. myco21_b.insert_many(list1)
  80. print('已存入备份原始库')
  81. for mis in list2:
  82. r_myco15.sadd('n21', mis) ##更改
  83. # pg = '1'
  84. def runs():
  85. for pg in range(1,3):
  86. print(pg,'===============================')
  87. r1(pg)
  88. runs()