n03_beijing.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. from rety import retry
  5. import requests,json
  6. from setting import proxies
  7. from urllib import parse
  8. from lxml import etree
  9. from mongo_cho import myco3,r_myco15,myco3_b
  # Shared HTTP session reused by every request helper in this module.
  r = requests.Session()
  # NOTE(review): current requests releases do not appear to read a
  # `keep_alive` attribute on Session, so this assignment may have no effect.
  # Confirm; if keep-alive must really be disabled, a `Connection: close`
  # header is the usual way.
  r.keep_alive = False
  12. #
  @retry(3)
  def r1_d(cid, ny):
      """Fetch one case's detail page and return its fields as a dict.

      Posts the case id and period to the ``wwidquery`` endpoint, then reads
      every table row matched by the XPath below as a (label, value) pair taken
      from the first text node of its first and second cells. Spaces, CR, LF
      and tab characters are removed from both label and value.

      Args:
          cid: Case id, sent as the ``id`` form field and stored under ``'uid'``.
          ny: Period string, sent as the ``ndjd`` form field. It is also stored
              under ``'date'`` after replacing ``'年度'`` with ``'/'`` and
              removing ``'月'`` and spaces.

      Returns:
          dict of cleaned labels to cleaned values, plus ``'uid'`` and
          ``'date'``.
      """
      url = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/wwidquery'
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      data = {
          "id": cid,
          "dq": "",
          "ajlx": "",
          "ndjd": ny,
          "bz": "ndjd",
          "dqy": "2",
          "ymdx": "",
          "nsrmc": "",
          "nsrsbh": "",
          "zcdz": "",
          "zzjgdm": "",
          "fddbrmc": "",
          "fddbrsfzhm": "",
          "cwfzrmc": "",
          "cwfzrsfzhm": "",
          "orgCode": "11100000000",
      }
      response = r.post(url=url, headers=headers, data=data, proxies=proxies)
      selector = etree.HTML(response.text)
      # NOTE(review): the `tbody` steps only match if the served HTML contains
      # explicit <tbody> tags; browsers insert them, lxml's parser does not.
      # Confirm against a real response.
      rows = selector.xpath('/html/body/table/tbody/tr/td/table/tbody//tr')

      # One translation pass removes space, CR, LF and tab characters.
      strip_ws = str.maketrans('', '', ' \r\n\t')

      record = {}
      for row in rows:
          try:
              label = row.xpath('td[1]/text()')[0]
              value = row.xpath('td[2]/text()')[0]
          except IndexError:
              # The row has no text in its first or second cell: skip it.
              # Only IndexError is expected here, so nothing else (including
              # KeyboardInterrupt) is swallowed any more.
              continue
          record[label.translate(strip_ws)] = value.translate(strip_ws)

      record['uid'] = cid
      record['date'] = ny.replace('年度', '/').replace('月', '').replace(' ', '')
      return record
  53. # r3_d()
  @retry(3)
  def r1(ny, dqy):
      """Scrape one result-list page, store unseen cases, return the page count.

      Posts the period and page number to the ``wwquery`` endpoint. For every
      row whose fifth cell holds an ``<input onclick=...>``, the first
      single-quoted token of the ``onclick`` value is taken as the case id.
      Ids that are not yet members of the ``'n03'`` set in ``r_myco15`` are
      fetched with ``r1_d``. The fetched records are then passed to
      ``insert_many`` on both ``myco3`` and ``myco3_b``, and the ids are added
      to the ``'n03'`` set.

      Args:
          ny: Period string sent as the ``ndjd`` form field.
          dqy: Page number sent as the ``dqy`` form field.

      Returns:
          int: The text found between ``'果'`` and ``'页'`` in the response,
          with spaces removed and converted with ``int``; 0 when the pattern
          does not occur.

      Raises:
          ValueError: If the captured page-count text is not a valid integer.
          IndexError: If an ``onclick`` value contains no single-quoted token.
      """
      url = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/wwquery'
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      data = {
          "id": '',
          "dq": "",
          "ajlx": "",
          "ndjd": ny,
          "bz": "ndjd",
          "dqy": dqy,
          "ymdx": "",
          "nsrmc": "",
          "nsrsbh": "",
          "zcdz": "",
          "zzjgdm": "",
          "fddbrmc": "",
          "fddbrsfzhm": "",
          "cwfzrmc": "",
          "cwfzrsfzhm": "",
          "orgCode": "11100000000",
      }
      response = r.post(url=url, headers=headers, data=data, proxies=proxies)
      html = response.text

      # Total page count as shown on the page; stays 0 when absent.
      tpg = 0
      rpg = re.findall(r'果(.*?)页', html)
      if rpg:
          tpg = rpg[0].replace(' ', '')
          print(tpg)

      selector = etree.HTML(html)
      # NOTE(review): the `tbody` steps only match if the served HTML contains
      # explicit <tbody> tags. Confirm against a real response.
      rows = selector.xpath('/html/body/table/tbody/tr/td/table[2]/tbody//tr')

      new_records = []
      new_ids = []
      for row in rows:
          onclick = row.xpath('td[5]/input/@onclick')
          if not onclick:
              continue
          case_id = re.findall(r"'(.*?)'", onclick[0])[0]
          print(case_id)
          if r_myco15.sismember('n03', case_id):
              print('已存在,>>>n03')
              continue
          new_records.append(r1_d(case_id, ny))
          new_ids.append(case_id)

      # new_records and new_ids always have the same length, so the two
      # emptiness checks below succeed or fail together.
      if new_records:
          myco3.insert_many(new_records)
      if new_ids:
          myco3_b.insert_many(new_records)
          # Ids are marked as seen only after both inserts were issued.
          for case_id in new_ids:
              r_myco15.sadd('n03', case_id)
      return int(tpg)
  @retry(3)
  def get_ny():
      """Return the last month token listed on the period-selection page.

      Downloads ``ndjd.jsp`` and collects every capture of the
      ``cx('2021年度<token> 月')`` pattern, removing spaces from each token.
      The year 2021 is hard-coded in the pattern. The last token is printed
      and returned.

      Raises:
          IndexError: If the pattern does not occur in the response.
      """
      endpoint = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/ndjd.jsp?orgCode=11100000000'
      request_headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
      }
      page = r.get(url=endpoint, headers=request_headers, proxies=proxies).text
      months = [
          token.replace(' ', '')
          for token in re.findall(r"cx\('2021年度(.*?) 月'\)", page)
      ]
      latest = months[-1]
      print(latest, '>>>from n03_ah***')
      return latest
  134. # get_ny()
  def runs(year, month):
      """Scrape every result page for the given year and month.

      Builds the period string ``<year>年度<month> 月``, scrapes page 1 with
      ``r1`` and, using the page count it returns, scrapes pages 2..count.

      Args:
          year: Year used in the period string.
          month: Month used in the period string.
      """
      # NOTE(review): both suffixes are identical as the file stands, so the
      # length check currently has no effect. Possibly the single-digit case
      # originally used different spacing -- confirm against the live site.
      month_suffix = " 月" if len(str(month)) == 1 else " 月"
      period = str(year) + "年度" + str(month) + month_suffix

      first_page = 1
      print(year, month, first_page, '页=================')
      total_pages = r1(period, first_page)

      # The range is empty when total_pages <= 1, so no extra guard is needed.
      for page_no in range(2, total_pages + 1):
          print(year, month, page_no, '页=================')
          r1(period, page_no)
  # Script entry: scrape January 2024 whenever this module is executed or imported.
  runs(year=2024, month=1)