123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco29,r_myco15,myco29_b
- from rety import retry
# Shared HTTP session used by every request in this module.
r = requests.session()
# NOTE: a requests Session has no ``keep_alive`` switch; assigning this
# attribute is inert.  It is kept only so any code that reads it still
# finds it.  Connection reuse is actually turned off by the
# ``Connection: close`` default header set below.
r.keep_alive = False
r.headers["Connection"] = "close"
@retry(3)
def r1_d(cmpname, url):
    """Download one notice detail page and parse it into a flat record.

    The page is fetched through the module-level session ``r`` using the
    configured ``proxies`` and decoded as UTF-8.

    Args:
        cmpname: Taxpayer name supplied by the caller; stored unchanged
            under the ``纳税人名称`` key.
        url: Address of the detail page; stored under the ``url`` key.

    Returns:
        ``{}`` when the page text contains ``附件下载`` or ``市局频道``
        (such pages are not parsed).  Otherwise a dict holding, in this
        insertion order: ``纳税人名称``, ``注册地址``, ``主要违法事实``,
        ``违法案件性质``, ``组织机构代码``, ``法人信息``, ``纳税人识别号``,
        ``url`` and ``date``.  Field values are the raw text slices and
        are not stripped.

    Raises:
        IndexError: If the ``#zoom`` block, one of the field labels, or
            the colon in the publish-time span is missing.  The error is
            not handled here, so it reaches the ``retry`` decorator.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # A timeout keeps a stalled proxy/server from blocking the crawl
    # forever; without one ``requests`` waits indefinitely.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
    response.encoding = 'UTF-8'
    html = response.text

    # Pages carrying either marker are skipped without parsing.
    if "附件下载" in html:
        return {}
    if "市局频道" in html:
        return {}

    selector = etree.HTML(html)

    # Publish date: take the span text up to the first space, keep what
    # follows the first colon, and turn '-' into '/'.  If several spans
    # match, the last one wins; with no match the date stays ''.
    dt = ''
    spans = selector.xpath(
        '//div[@class="main"]//div[@class="main_content"]//span/text()'
    )
    for span_text in spans:
        if '发布时间' in span_text:
            first_token = span_text.split(' ')[0]
            dt = first_token.split(':')[1].replace('-', '/')
    print(dt)

    record = {'纳税人名称': cmpname}
    content_nodes = selector.xpath('//div[@id="zoom"]')
    remaining = content_nodes[0].xpath('string(.)').strip()

    # The labels are peeled off from the END of the text backwards: each
    # step stores what follows the label and continues with what precedes
    # it.  The tuple order is therefore the reverse of the on-page order.
    field_names = (
        '注册地址',
        '主要违法事实',
        '违法案件性质',
        '组织机构代码',
        '法人信息',
        '纳税人识别号',
    )
    for field_name in field_names:
        pieces = remaining.split(field_name + ':')
        record[field_name] = pieces[1]
        remaining = pieces[0]

    record['url'] = url
    record['date'] = dt
    print(record)
    return record
- # r1_d('')
@retry(3)
def r1(pg):
    """Scan one search-result page by link and parse notices not yet seen.

    Every ``href`` found under a table cell that contains ``"art"`` is
    turned into an absolute URL and checked against the Redis set
    ``n29``; URLs not in the set are fetched with ``r1_d``.

    This function only extracts hrefs, so no taxpayer name is available;
    an empty name is passed to ``r1_d``.  (The previous one-argument call
    ``r1_d(url1)`` did not match ``r1_d(cmpname, url)`` and raised
    ``TypeError``.)

    NOTE: persistence is intentionally not performed here -- the inserts
    into ``myco29`` / ``myco29_b`` and the ``sadd`` into ``n29`` were
    disabled in this function.  The collected records are discarded when
    the function returns.

    Args:
        pg: Page number sent as the ``currpage`` query parameter.

    Returns:
        None.
    """
    url = 'https://xizang.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "field": "vc_name:1,field_406:1,field_407:1,field_408:1",
        "i_columnid": "style_63",
        "vc_name": "",
        "field_406": "",
        "field_407": "",
        "field_408": "",
        "currpage": pg,
    }
    # A timeout keeps a stalled proxy/server from blocking the crawl.
    response = r.get(
        url=url, headers=headers, params=params, proxies=proxies, timeout=30
    )
    selector = etree.HTML(response.text)

    list1 = []  # parsed records
    list2 = []  # URLs of the parsed records
    for href in selector.xpath('//td//a/@href'):
        if "art" not in href:
            continue
        url1 = "https://xizang.chinatax.gov.cn" + href.replace('../..', '')
        print(url1)
        if r_myco15.sismember('n29', url1):
            print('已存在,>>>n29')
            continue
        rsd = r1_d('', url1)
        if rsd:
            list1.append(rsd)
            list2.append(url1)
@retry(3)
def r2(pg):
    """Crawl one search-result page and persist notices not yet seen.

    For every ``<tr class="form-list">`` row, the taxpayer name is read
    from the second cell and the detail link from the fourth.  Links
    already present in the Redis set ``n29`` are skipped; the others are
    parsed with ``r1_d``.  Non-empty results are inserted into ``myco29``
    and ``myco29_b``, and their URLs are then added to ``n29``.

    Args:
        pg: Page number sent as the ``currpage`` query parameter.

    Returns:
        None.
    """
    url = 'https://xizang.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "field": "vc_name:1,field_406:1,field_407:1,field_408:1",
        "i_columnid": "style_63",
        "vc_name": "",
        "field_406": "",
        "field_407": "",
        "field_408": "",
        "currpage": pg,
    }
    # A timeout keeps a stalled proxy/server from blocking the crawl.
    response = r.get(
        url=url, headers=headers, params=params, proxies=proxies, timeout=30
    )
    selector = etree.HTML(response.text)

    list1 = []  # parsed records
    list2 = []  # URLs of the parsed records
    for row in selector.xpath('//tr[@class="form-list"]'):
        names = row.xpath('td[2]/text()')
        links = row.xpath('td[4]//a/@href')
        # One row without a name cell or link must not abort the whole
        # page (indexing [0] on an empty result raised IndexError).
        if not names or not links:
            print('skipping listing row without name or link')
            continue
        cmpname = names[0]
        url1 = "https://xizang.chinatax.gov.cn" + links[0].replace('../..', '')

        if r_myco15.sismember('n29', url1):
            print('已存在,>>>n29')
            continue
        rsd = r1_d(cmpname, url1)
        if rsd:
            list1.append(rsd)
            list2.append(url1)

    # list1 and list2 always grow together, so a single emptiness check
    # covers both inserts and the Redis bookkeeping.
    if list1:
        myco29.insert_many(list1)
        print('已存入原始库')
        myco29_b.insert_many(list1)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts went through.
        for seen_url in list2:
            r_myco15.sadd('n29', seen_url)
def runs(start=1, stop=4):
    """Crawl listing pages ``start`` .. ``stop - 1`` with ``r2``.

    Args:
        start: First page number to crawl (inclusive).
        stop: Page number to stop before (exclusive), as in ``range``.
            The defaults reproduce the previous fixed pages 1, 2 and 3.
    """
    for pg in range(start, stop):
        print(pg, '================')
        r2(pg)


# The crawl starts as soon as the module is executed or imported; this
# call is deliberately left unguarded to keep that behaviour unchanged.
runs()
|