Following up on Qichacha (企查查), Aiqicha (爱企查) lets you query without logging in and even returns JSON, which makes data extraction painless. I'm quite fond of it.
Tianyancha (天眼查) only offers phone-number login, less convenient than Qichacha's QQ login, so I'm including it somewhat grudgingly. It returns no JSON and exposes no dictionary-style data in the page, so everything has to be scraped out of the HTML table. On my first Tianyancha crawl I ran into the anti-scraping font tyc-num; after digging through various references I learned that TTFont can dump the font's digit correspondences (a handy trick worth keeping around). The tyc-num.woff file can be downloaded from the Tianyancha site itself, or grabbed from the resources attached to this post.
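Before the full script, here is a minimal sketch of the font-cracking idea, assuming tyc-num.woff already sits next to the script. fontTools can expose the glyph order directly, so you can eyeball the shuffled digit mapping without going through an XML dump first (the full script below takes the XML route instead):

# Minimal sketch: inspect the shuffled glyph order of tyc-num.woff.
# Assumes the .woff file has already been downloaded.
from fontTools.ttLib import TTFont

font = TTFont('tyc-num.woff')
order = font.getGlyphOrder()  # list of glyph names, e.g. ['.notdef', 'x', ...]
print(order)
# The glyphs from index 2 onward are the characters the page displays;
# each one's position in this list reveals the real digit it stands for.

Enough talk, here comes the full code: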
# _*_ coding:utf-8 _*_
# FileName: get_company.py
# IDE: PyCharm
# Caicai's code, forever bug-free!
import os
import json
import requests
from urllib import parse
from bs4 import BeautifulSoup
from fontTools.ttLib import TTFont
from get_cookies import get_cookies_from_chrome
# https://aiqicha.baidu.com/
# https://www.tianyancha.com/
"""
from faker import Faker
fake = Faker()
headers = {'User-Agent': fake.user_agent()}
"""
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}
aqc_search_url = "https://aiqicha.baidu.com/s?t=0&"
aqc_detail_url = "https://aiqicha.baidu.com/company_detail_"
aqc_data_url = "https://aiqicha.baidu.com/detail/basicAllDataAjax?pid="
tyc_search_url = "https://www.tianyancha.com/search?"
tyc_cookie = get_cookies_from_chrome('tianyancha.com')
# Defeat the anti-scraping font
if not os.path.isfile('tyc-num.xml'):  # dump the font tables to XML once
    font = TTFont('tyc-num.woff')
    font.saveXML('tyc-num.xml')
ids = BeautifulSoup(open('tyc-num.xml', 'r', encoding='utf-8').read(), 'xml').find('GlyphOrder').find_all('GlyphID')
gid = {}
for g in range(10):  # only the digits 0-9 are obfuscated
    gid[ids[g + 2]["name"]] = ids[g]["id"]  # displayed glyph name -> real digit
def aqc_get_company(company_name):  # Aiqicha searches smoothly without any login
    s = requests.session()
    s.headers.update(headers)
    s.get('https://aiqicha.baidu.com/')  # warm up the session to pick up cookies
    r = s.get(aqc_search_url + parse.urlencode({"q": company_name}))
    if r.ok:
        soup = BeautifulSoup(r.text, "html.parser")
        script = soup.find_all('script')[2].text
        if 'window.pageData' not in script:
            return "可能需要登录才能继续使用!"
        information = json.loads(script[script.find('{'): script.rfind('};') + 1])  # the full page data
        if "resultList" not in information["result"]:
            return f"未搜寻到公司 “{company_name}” !"
        for info in information["result"]["resultList"]:
            name_ = BeautifulSoup(info["entName"], "html.parser").text.replace('(', '(').replace(')', ')')
            pid = info["pid"]
            # skip results whose name does not match, with or without the bracketed part
            if company_name != name_.replace(name_[name_.find('('): name_.rfind(')') + 1], '') and company_name != name_.replace('(', '').replace(')', ''):
                continue
            s.headers["Referer"] = f"{aqc_detail_url}{pid}"
            r = s.get(f'{aqc_data_url}{pid}')
            if r.ok:
                information = r.json()["data"]["basicData"]
                company = {
                    "企业名称": information["entName"],
                    "法定代表人": information["legalPerson"],
                    "经营状态": information["openStatus"],
                    "统一社会信用代码": information["unifiedCode"],
                    "工商注册号": information.get("regCode", ""),
                    "组织机构代码": information["orgNo"],
                    "纳税人识别号": information["taxNo"],
                    "注册资本": information["regCapital"],
                    "实缴资本": information["realCapital"],
                    "登记机关": information["authority"],
                    "成立日期": information["annualDate"],
                    "核准日期": information["annualDate"],
                    "营业期限": information["openTime"],
                    "注册地址": information["regAddr"],
                    "经营范围": information["scope"],
                    "企业类型": information["entType"],
                    "所属行业": information["industry"],
                    "行政区划": information["district"],
                    "参保人数": information["insuranceInfo"]["insuranceNum"],
                    "曾用名": information["prevEntName"]
                }
                return company
            return f"获取公司 “{name_}” 详情信息失败!"
        return f"未搜寻到公司 “{company_name}” !"
    return "搜索失败!"
def tyc_get_company(company_name):  # Tianyancha forces a login after a few queries, which got annoying; lxml seems noticeably stronger than html.parser here
    r = requests.get(tyc_search_url + parse.urlencode({"key": company_name}), headers=headers, cookies={"cookie": tyc_cookie})
    if r.ok:
        soup = BeautifulSoup(r.text, "html.parser")
        table = soup.find("div", attrs={"class": "result-list sv-search-container"})
        if table is None:
            return f"未搜寻到公司 “{company_name}” !"
        for tr in table.find_all("div", attrs={"class": "search-item sv-search-company"}):
            info = tr.find("div", attrs={"class": "content"})
            name_ = info.find("div", attrs={"class": "header"}).find('a').text.replace('(', '(').replace(')', ')')
            url = info.find("div", attrs={"class": "header"}).find('a')["href"]
            # skip results whose name does not match, with or without the bracketed part
            if company_name != name_.replace(name_[name_.find('('): name_.rfind(')') + 1], '') and company_name != name_.replace('(', '').replace(')', ''):
                continue
            r = requests.get(url, headers=headers, cookies={"cookie": tyc_cookie})
            if r.ok:
                soup = BeautifulSoup(r.text, "lxml")
                information = soup.find('div', attrs={"id": "_container_baseInfo"}).find('table').find('tbody')
                attrs = {}
                for row in information.find_all('tr'):
                    cols = row.find_all('td')
                    if len(cols) % 2:  # drop a trailing unpaired cell
                        cols = cols[:-1]
                    for col in range(len(cols) // 2):  # cells come in label/value pairs
                        if cols[col * 2 + 1].find('div', attrs={"class": "name"}):
                            value = cols[col * 2 + 1].find('div', attrs={"class": "name"}).text
                        elif cols[col * 2 + 1].find('span', attrs={"class": "sort-score-value"}):
                            value = cols[col * 2 + 1].find('span', attrs={"class": "sort-score-value"}).text
                        elif cols[col * 2 + 1].find('text', attrs={"class": "tyc-num lh24"}):
                            value = ''  # obfuscated digits: decode each glyph through the gid map
                            for k in cols[col * 2 + 1].find('text', attrs={"class": "tyc-num lh24"}).text:
                                value += gid.get(k, '-')
                        else:
                            value = cols[col * 2 + 1].text
                        attrs[cols[col * 2].find(string=True)] = value
                return attrs
            return f"获取公司 “{name_}” 详情信息失败!"
        return f"未搜寻到公司 “{company_name}” !"
    return "搜索失败!"
if __name__ == '__main__':
    # sample company name: 浙江阿瓦隆科技有限公司
    print(aqc_get_company(''))
    print(tyc_get_company(""))
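The get_cookies module isn't shown here; get_cookies_from_chrome presumably comes from the earlier Qichacha post. If you don't have that helper, a rough stand-in built on the third-party browser_cookie3 package (pip install browser-cookie3) could look like the sketch below. It is an assumption-laden substitute, not the original helper:

# Hypothetical stand-in for get_cookies_from_chrome, built on the
# third-party browser_cookie3 package rather than the original module.
import browser_cookie3

def get_cookies_from_chrome(domain):
    # Read Chrome's cookie store for the given domain and flatten it
    # into the "name=value; name=value" string the requests calls expect.
    jar = browser_cookie3.chrome(domain_name=domain)
    return '; '.join(f'{c.name}={c.value}' for c in jar)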