#!/usr/bin/env python
# coding:utf-8
# n04_sz.py
  3. import requests,json,re
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco4,r_myco15,myco4_b
  8. r = requests.session()
  9. r.keep_alive = False
  10. import urllib3
  11. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  12. from rety import retry
  13. @retry(3)
  14. def r1_d(url,dt):
  15. # url = 'https://shenzhen.chinatax.gov.cn/mhsofpro/otherproject/wgtg/data.jsp?tags=ps_18756&fh=true'
  16. headers = {
  17. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  18. }
  19. response = r.get(url=url,headers=headers,verify=False,proxies=proxies)
  20. html = response.text
  21. # print(html)
  22. selector = etree.HTML(html)
  23. a = selector.xpath('//tr')
  24. dict1 = {}
  25. for i in a:
  26. # print(i.xpath('td[1]/text()'))
  27. # print(i.xpath('td[2]/text()'))
  28. try:
  29. k1 = i.xpath('td[1]')
  30. text = k1[0].xpath('string(.)').strip()
  31. # text
  32. v1 = i.xpath('td[2]')
  33. text1 = v1[0].xpath('string(.)').strip()
  34. v2 = text1.replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '')
  35. dict1[text] = v2
  36. except:
  37. pass
  38. dict1['url'] = url
  39. dict1['date'] = dt
  40. print(dict1)
  41. return dict1
# r1_d()
  43. @retry(3)
  44. def r1(ny,cpg,tpg,dt):
  45. url = 'https://shenzhen.chinatax.gov.cn/mhsofpro/otherproject/page/page.jsp?type=w_date&msg={}'.format(ny)
  46. headers = {
  47. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  48. }
  49. data = {
  50. "curPage":cpg,
  51. "totalPages": tpg,
  52. "pageNum": "1",
  53. }
  54. response = r.post(url=url,headers=headers,verify=False,data=data,proxies=proxies)
  55. html = response.text
  56. selector = etree.HTML(html)
  57. rpg = re.findall(r'\/(.*?)页',html)
  58. tpg = rpg[1]
  59. # print(rpg)
  60. a = selector.xpath('//input[@id="button2"]/@onclick')
  61. list1 = []
  62. list2 = []
  63. for i in a:
  64. # print(i)
  65. aa = re.findall("'(.*?)'",i)[0]
  66. url1 = 'https://shenzhen.chinatax.gov.cn'+aa
  67. print(url1,dt)
  68. utf = r_myco15.sismember('n04', url1)
  69. if not utf:
  70. rsd = r1_d(url1, dt)
  71. list1.append(rsd)
  72. list2.append(url1)
  73. else:
  74. print('已存在,>>>n04')
  75. pass
  76. if list1:
  77. myco4.insert_many(list1)
  78. if list2:
  79. myco4_b.insert_many(list1)
  80. for mis in list2:
  81. r_myco15.sadd('n04', mis)
  82. # myco4.insert_many(list1)
  83. return int(tpg)
  84. def runs(ny1='2021',ny2='9'):
  85. ny=str(ny2)+'_'+ str(ny1)
  86. # cpg='1'
  87. dt=str(ny1)+'/'+str(ny2)+'/1'
  88. tpg = r1(ny, '1', '1', dt)
  89. if tpg >1:
  90. for i in range(2,tpg+1):
  91. print(i,'页============')
  92. r1(ny,i,tpg,dt)
  93. runs()