n17_qinhai.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco17,r_myco15,myco17_b
  8. from rety import retry
  9. r = requests.session()
  10. r.keep_alive = False
  11. # http://qinghai.chinatax.gov.cn/web/zdsswfsxaj/zdaj.shtml
  12. @retry(3)
  13. def r1_d(url):
  14. # url = 'http://qinghai.chinatax.gov.cn/web/2020nd/202007/e4856c576fa04e059eff6762dc47bf0c.shtml'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  17. }
  18. response = r.get(url=url, headers=headers, proxies=proxies)
  19. # html = response.text
  20. # print(html)
  21. html = response.text
  22. selector = etree.HTML(html)
  23. dt = selector.xpath('//*[@id="page-newContent"]/div[2]/div/div[1]/div/div[1]/div/span[1]/text()')
  24. dt1 = dt[0].replace('发布时间:','').replace('\r','').replace('\n','').replace(' ','').replace('-','/')
  25. dt2 = dt1[:-5]
  26. print(dt2)
  27. a = selector.xpath('//tr')
  28. dict1 = {}
  29. for i in a:
  30. k1 = i.xpath('td[1]')
  31. if k1:
  32. k2 = k1[0].xpath('string(.)').strip()
  33. k3 = k2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  34. # print(k2)
  35. v1 = i.xpath('td[2]')
  36. if v1:
  37. v2 = v1[0].xpath('string(.)').strip()
  38. v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  39. # print(v3)
  40. else:
  41. v3 = ''
  42. dict1[k3] = v3
  43. dict1['url'] = url
  44. # dt=''
  45. dict1['date'] = dt2
  46. print(dict1)
  47. return dict1
  48. # r1_d(url)
  49. @retry(3)
  50. def r1(ny,pg):
  51. if pg ==1:
  52. url = 'http://qinghai.chinatax.gov.cn/web/{}nd/iframe.shtml'.format(ny)
  53. else:
  54. url = 'http://qinghai.chinatax.gov.cn/web/{ny}nd/iframe_{pg}.shtml'.format(ny=ny,pg=pg)
  55. # url = 'http://qinghai.chinatax.gov.cn/web/2021nd/iframe.shtml'
  56. headers = {
  57. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  58. }
  59. response = r.get(url=url, headers=headers, proxies=proxies)
  60. html = response.text
  61. # print(html)
  62. if "404 Not Found" in html:
  63. print('zz')
  64. return 'zz'
  65. selector = etree.HTML(html)
  66. a = selector.xpath('//a/@href')
  67. list1 = []
  68. list2 = []
  69. for i in a:
  70. print(i)
  71. url1 = 'http://qinghai.chinatax.gov.cn' + i
  72. utf = r_myco15.sismember('n17', url1) ##更改
  73. if not utf:
  74. rsd = r1_d(url1)
  75. list1.append(rsd)
  76. list2.append(url1)
  77. else:
  78. print('已存在,>>>n17')
  79. if list1:
  80. myco17.insert_many(list1)
  81. print('已存入原始库')
  82. if list2:
  83. myco17_b.insert_many(list1)
  84. print('已存入备份原始库')
  85. for mis in list2:
  86. r_myco15.sadd('n17', mis) ##更改
  87. # if list1:
  88. # myco17.insert_many(list1)
  89. # print('1')
  90. # r1(pg=1)
  91. def runs(ny):
  92. for pg in range(1,100):
  93. print(pg, '===========')
  94. tf = r1(ny,pg)
  95. if tf == "zz":
  96. break
  97. runs(2021)
  98. for pg in range(54,55):
  99. print(pg,'===========')
  100. r1(2021,pg)