# n08_fj.py (2.0 KB)
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json,re
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco8,myco8_b,r_myco15
  8. from rety import retry
  9. r = requests.session()
  10. r.keep_alive = False
  11. @retry(3)
  12. def r1(ny1,ny2,pg):
  13. url = 'http://fujian.chinatax.gov.cn/was5/web/search'
  14. headers = {
  15. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  16. }
  17. params = {
  18. "channelid":"291316",
  19. "templet": "zdaj.jsp",
  20. "sortfield": "-datefor",
  21. "classsql": "datefor={ny1}\-{ny2}".format(ny1=ny1,ny2=ny2),
  22. "r": "0.31052286956801844",
  23. "prepage": "8",
  24. "page": pg,
  25. }
  26. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  27. html = response.text
  28. tpg = re.findall('"pagenum":"(.*?)"',html)[0]
  29. print(tpg,'===========')
  30. # a = json.loads(html)
  31. print(html)
  32. html1 = html.replace(" ","").replace("\r","").replace("\n","").replace("\t","")
  33. res1 = re.findall('"docs":\[(.*?)\]',html1)
  34. res2 = res1[0]
  35. res3 = re.findall('\{(.*?)\}',res2)
  36. list1 = []
  37. listurl = []
  38. for i in res3:
  39. i1 = "{" + i + "}"
  40. i2 = json.loads(i1)
  41. print(i2)
  42. url1 = i2['url']
  43. utf = r_myco15.sismember('n08', url1) ##更改
  44. if not utf:
  45. listurl.append(url1)
  46. list1.append(i2)
  47. else:
  48. print('已存在,>>>n08')
  49. pass
  50. list2 = list1[:-1]
  51. # print(list2)
  52. # if list2:
  53. # myco8.insert_many(list2)
  54. # if listurl:
  55. # myco8_b.insert_many(list2)
  56. # for mis in listurl:
  57. # r_myco15.sadd('n08', mis) ##更改
  58. return tpg
  59. def runs(ny1,ny2):
  60. rpg = r1(ny1,ny2,pg=1)
  61. # print(rpg)
  62. # print(type(rpg))
  63. if rpg == '0':
  64. print('122')
  65. return 'er1'
  66. else:
  67. for pg2 in range(2,int(rpg)+1):
  68. r1(ny1, ny2, pg2)
  69. for pg2 in range(3,7):
  70. r1('2023', '10', pg2)