123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json,re
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco4,r_myco15,myco4_b
- r = requests.session()
- r.keep_alive = False
- import urllib3
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
- from rety import retry
@retry(3)
def r1_d(url: str, dt: str) -> dict:
    """Fetch one detail page and turn its two-column table rows into a dict.

    The page is requested with the module-level session ``r`` (TLS
    verification disabled, routed through ``proxies``).  Every ``<tr>`` in the
    response is inspected: the text of its first ``<td>`` becomes a key and
    the text of its second ``<td>`` becomes the value, with spaces, carriage
    returns, tabs and newlines removed from the value.

    Args:
        url: Address of the detail page to download.
        dt: Date label stored unchanged under the ``'date'`` key.

    Returns:
        The scraped key/value pairs plus ``'url'`` and ``'date'`` entries.
        These two entries are written last, so they overwrite any scraped row
        whose key happens to be ``'url'`` or ``'date'``.

    Raises:
        Whatever the HTTP request or the HTML parser raises.  The function is
        wrapped in ``@retry(3)``; the decorator comes from the project's
        ``rety`` module and its exact behaviour is not visible here.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # NOTE(review): no timeout is passed, so a stalled connection can block
    # this call indefinitely -- consider adding one.
    response = r.get(url=url, headers=headers, verify=False, proxies=proxies)
    selector = etree.HTML(response.text)

    # One translation table removes the same four characters the value
    # clean-up has always removed: space, CR, tab and LF.
    strip_table = str.maketrans('', '', ' \r\t\n')

    record = {}
    for row in selector.xpath('//tr'):
        try:
            key_cell = row.xpath('td[1]')[0]
            value_cell = row.xpath('td[2]')[0]
        except IndexError:
            # Header/spacer rows with fewer than two <td> cells carry no
            # key/value pair; skip them.  Any other error is a real problem
            # and is allowed to propagate instead of being silently dropped.
            continue
        key = key_cell.xpath('string(.)').strip()
        value = value_cell.xpath('string(.)').strip()
        record[key] = value.translate(strip_table)

    record['url'] = url
    record['date'] = dt
    print(record)
    return record
- # r1_d()
@retry(3)
def r1(ny, cpg, tpg, dt):
    """Scrape one listing page and store every detail record not seen before.

    A POST is sent to the listing endpoint with ``ny`` embedded in the query
    string and ``cpg`` / ``tpg`` sent as the ``curPage`` / ``totalPages`` form
    fields.  Each ``onclick`` attribute of the ``button2`` inputs yields a
    detail-page path; paths already present in the Redis set ``'n04'`` are
    skipped, the others are fetched through ``r1_d``.

    Args:
        ny: Value substituted into the ``msg`` query parameter.
        cpg: Page number to request.
        tpg: Total page count reported to the server.
        dt: Date label forwarded to ``r1_d`` for every detail page.

    Returns:
        The total number of pages as an int, parsed from the response body
        (second match of the ``/...页`` pattern).

    Raises:
        IndexError: If the response has fewer than two ``/...页`` matches or
            an ``onclick`` attribute contains no single-quoted string.
        ValueError: If the captured page count is not an integer.
    """
    url = 'https://shenzhen.chinatax.gov.cn/mhsofpro/otherproject/page/page.jsp?type=w_date&msg={}'.format(ny)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    form = {
        "curPage": cpg,
        "totalPages": tpg,
        "pageNum": "1",
    }
    response = r.post(url=url, headers=headers, verify=False, data=form, proxies=proxies)
    page_html = response.text
    tree = etree.HTML(page_html)

    # The second "/N页" occurrence in the page carries the total page count.
    page_markers = re.findall(r'\/(.*?)页', page_html)
    total_pages = page_markers[1]

    onclick_handlers = tree.xpath('//input[@id="button2"]/@onclick')
    records = []
    new_urls = []
    for handler in onclick_handlers:
        # The first single-quoted string inside the handler is the detail path.
        detail_path = re.findall("'(.*?)'", handler)[0]
        detail_url = 'https://shenzhen.chinatax.gov.cn' + detail_path
        print(detail_url, dt)
        if r_myco15.sismember('n04', detail_url):
            print('已存在,>>>n04')
            continue
        records.append(r1_d(detail_url, dt))
        new_urls.append(detail_url)

    if records:
        myco4.insert_many(records)
    if new_urls:
        # NOTE(review): the same record list goes into both collections;
        # presumably myco4_b is a backup/mirror -- confirm.
        myco4_b.insert_many(records)
        # URLs are marked as seen only after both inserts have succeeded.
        for seen_url in new_urls:
            r_myco15.sadd('n04', seen_url)
    return int(total_pages)
def runs(ny1='2021', ny2='9'):
    """Crawl every listing page for one month.

    The first page is requested to learn the total page count; pages 2 up to
    that count are then requested in order.

    Args:
        ny1: Year component (default ``'2021'``).
        ny2: Month component (default ``'9'``).
    """
    month_key = '{}_{}'.format(ny2, ny1)      # e.g. '9_2021'
    date_label = '{}/{}/1'.format(ny1, ny2)   # e.g. '2021/9/1'

    total_pages = r1(month_key, '1', '1', date_label)
    if not total_pages > 1:
        return

    for page in range(2, total_pages + 1):
        print(page, '页============')
        r1(month_key, page, total_pages, date_label)


# Runs the crawl with the default year/month as soon as the module is loaded.
runs()
|