12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco20,r_myco15,myco20_b
- from rety import retry
# Module-wide HTTP session shared by every request made in this script.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented attribute of
# requests.Session, so this assignment may have no effect on connection
# reuse -- confirm before relying on it.
r.keep_alive = False
@retry(3)
def r1(ny1, ny2, pg):
    """Fetch one result page for a year/month and store the unseen records.

    Posts a form query to the ``hardcasegetdatanew`` endpoint, tags every
    returned record with a ``date`` field, and inserts the records whose
    ``id`` is not yet a member of the ``n20`` set on ``r_myco15`` into both
    ``myco20`` and ``myco20_b``. The ids of the inserted records are then
    added to the ``n20`` set so later runs skip them.

    Args:
        ny1: First part of the query period (the script passes a year such
            as ``'2022'``).
        ny2: Second part of the query period (the script passes a month such
            as ``'11'``); it is left-padded with zeros to two characters.
        pg: Page number to request.

    Returns:
        The ``hardCasePage.totalPages`` value of the response, as an ``int``.

    Errors raised by the request, JSON decoding or missing response keys are
    not caught here; they propagate to the ``retry`` decorator, whose
    behaviour is defined outside this block.
    """
    url = 'http://hunan.chinatax.gov.cn/hardcasegetdatanew'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }

    # Pad the month instead of always prefixing '0': the unconditional prefix
    # produced '2022011' / '2022/011' for two-digit months such as '11'.
    # NOTE(review): assumes the server expects YYYYMM -- confirm.
    month = str(ny2).zfill(2)
    type_value = str(ny1) + month
    dt = str(ny1) + '/' + month

    data = {
        "type": "3",
        "type_value": type_value,
        "case_type": "1",
        "page": pg,
        "limit": "10",
        "is_search": "0",
        "taxpayerName": "",
        "taxpayerNumber": "",
        "organizationalCode": "",
        "place": "",
        "legalName": "",
        "legalIdCard": "",
        "financeName": "",
        "financeIdCard": "",
        "personName": "",
        "personIdCard": "",
        # NOTE(review): hard-coded CSRF token; presumably has to be refreshed
        # when the server rotates it -- verify.
        "_csrf": "fe7aeeb7-63a9-4770-9f35-84869a82d042",
    }

    # A timeout keeps a stalled proxy/server from blocking the crawl forever;
    # the resulting exception is handed to the retry decorator.
    response = r.post(
        url=url,
        headers=headers,
        data=data,
        proxies=proxies,
        timeout=30,
    )
    rsd = response.json()
    rsl = rsd['data']
    rpg = rsd['hardCasePage']['totalPages']

    new_records = []
    new_ids = []
    for record in rsl or []:
        record['date'] = dt
        print(record)
        record_id = record['id']
        # The original marks the 'n20' set name with a "change" note:
        # presumably it must be adjusted when this script is retargeted.
        if r_myco15.sismember('n20', record_id):
            print('已存在,>>>n20')
            continue
        new_records.append(record)
        new_ids.append(record_id)

    if new_records:
        myco20.insert_many(new_records)
        print('已存入原始库')
        myco20_b.insert_many(new_records)
        print('已存入备份原始库')
        # Mark ids as seen only after both inserts have gone through.
        for record_id in new_ids:
            r_myco15.sadd('n20', record_id)

    return int(rpg)
def runs(ny1, ny2):
    """Crawl every result page for the given period.

    Requests page 1 through ``r1`` and treats its return value as the total
    page count, then requests pages 2 through that count in order. Nothing
    further is requested when the count is 1 or less, because the range
    below is then empty.

    Args:
        ny1: Forwarded unchanged to ``r1`` as its first argument.
        ny2: Forwarded unchanged to ``r1`` as its second argument.
    """
    total_pages = r1(ny1, ny2, pg=1)
    for page in range(2, total_pages + 1):
        print(page, '====')
        r1(ny1, ny2, page)
# Script entry: crawl all pages for the hard-coded period.
runs(ny1='2022', ny2='11')
|