n03_bj.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. from rety import retry
  5. import requests,json
  6. from setting import proxies
  7. from urllib import parse
  8. from lxml import etree
  9. from mongo_cho import myco3,r_myco15,myco3_b
  10. r = requests.session()
  11. r.keep_alive = False
  12. @retry(3)
  13. def r1_d(cid,ny):
  14. url = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/wwidquery'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  17. }
  18. data = {
  19. "id":cid,
  20. "dq": "",
  21. "ajlx": "",
  22. "ndjd": ny,
  23. "bz": "ndjd",
  24. "dqy": "2",
  25. "ymdx": "",
  26. "nsrmc": "",
  27. "nsrsbh": "",
  28. "zcdz": "",
  29. "zzjgdm": "",
  30. "fddbrmc": "",
  31. "fddbrsfzhm": "",
  32. "cwfzrmc": "",
  33. "cwfzrsfzhm": "",
  34. "orgCode": "11100000000",
  35. }
  36. response = r.post(url=url,headers=headers,data=data,proxies=proxies)
  37. html = response.text
  38. selector = etree.HTML(html)
  39. a = selector.xpath('/html/body/table/tbody/tr/td/table/tbody//tr')
  40. dict1 = {}
  41. for i in a:
  42. try:
  43. k1 = i.xpath('td[1]/text()')[0].replace(' ','').replace('\r','').replace('\n','').replace('\t','')
  44. v1 = i.xpath('td[2]/text()')[0].replace(' ','').replace('\r','').replace('\n','').replace('\t','')
  45. dict1[k1] = v1
  46. except:
  47. pass
  48. dict1['uid'] = cid
  49. dict1['date'] = ny.replace('年度','/').replace('月','').replace(' ','')
  50. # print(dict1)
  51. return dict1
  52. # r3_d()
  53. @retry(3)
  54. def r1(ny,dqy):
  55. url = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/wwquery'
  56. headers = {
  57. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  58. }
  59. data1 = {
  60. "orgCode":"11100000000",
  61. "bz": "ndjd",
  62. "ndjd": ny,
  63. }
  64. data = {
  65. "id": '',
  66. "dq": "",
  67. "ajlx": "",
  68. "ndjd": ny,
  69. "bz": "ndjd",
  70. "dqy": dqy,
  71. "ymdx": "",
  72. "nsrmc": "",
  73. "nsrsbh": "",
  74. "zcdz": "",
  75. "zzjgdm": "",
  76. "fddbrmc": "",
  77. "fddbrsfzhm": "",
  78. "cwfzrmc": "",
  79. "cwfzrsfzhm": "",
  80. "orgCode": "11100000000",
  81. }
  82. response = r.post(url=url,headers=headers,data=data,proxies=proxies)
  83. html = response.text
  84. rpg = re.findall(r'果(.*?)页',html)
  85. tpg = 0
  86. if rpg:
  87. tpg = rpg[0].replace(' ','')
  88. print(tpg)
  89. selector = etree.HTML(html)
  90. a = selector.xpath('/html/body/table/tbody/tr/td/table[2]/tbody//tr')
  91. list1 = []
  92. list2 = []
  93. for i in a:
  94. rst = i.xpath('td[5]/input/@onclick')
  95. if rst:
  96. codt = rst[0]
  97. cd1 = re.findall(r"'(.*?)'",codt)[0]
  98. print(cd1)
  99. utf = r_myco15.sismember('n03', cd1)
  100. if not utf:
  101. rsd = r1_d(cd1, ny)
  102. list1.append(rsd)
  103. list2.append(cd1)
  104. else:
  105. print('已存在,>>>n03')
  106. pass
  107. if list1:
  108. myco3.insert_many(list1)
  109. if list2:
  110. myco3_b.insert_many(list1)
  111. for mis in list2:
  112. r_myco15.sadd('n03', mis)
  113. # myco3.insert_many(list1)
  114. return int(tpg)
  115. @retry(3)
  116. def get_ny():
  117. url ='http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/ndjd.jsp?orgCode=11100000000'
  118. headers = {
  119. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  120. }
  121. response = r.get(url=url, headers=headers, proxies=proxies)
  122. html = response.text
  123. # print(html)
  124. a = re.findall(r"cx\('2021年度(.*?) 月'\)",html)
  125. list1 = []
  126. for i in a:
  127. # print(i)
  128. ny = i.replace(' ','')
  129. list1.append(ny)
  130. eny = list1[-1]
  131. print(eny,'>>>from n03_ah***')
  132. return eny
  133. # get_ny()
  134. def runs(ny1,ny2):
  135. if len(str(ny2)) == 1:
  136. ny = str(ny1) + "年度" + str(ny2) + " 月"
  137. else:
  138. ny = str(ny1) + "年度" + str(ny2) + " 月"
  139. for dqy in range(1,2):
  140. print(ny1,ny2,dqy,'页=================')
  141. tpg = r1(ny,dqy)
  142. if tpg >1:
  143. for i in range(2,tpg+1):
  144. print(ny1,ny2,i,'页=================')
  145. r1(ny, i)
  146. runs(2021,12)