import requests
from bs4 import BeautifulSoup
import pinyin  # PyPI "pinyin" package, used below to get the brand's first pinyin letter


def crawer_car_model_data(brand_url):
    headers = {
        'authority': 'car.autohome.com.cn',
        'method': 'GET',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        # Captured browser session cookie; it will expire and may need to be replaced
        'cookie': 'fvlid=156974583432110wygoXZiH; sessionid=D7FE9717-245E-4F8D-8D42-AAF453D1F470%7C%7C2019-09-29+16%3A30%3A35.298%7C%7C0; autoid=851072202da5829e1b4e6cbb05975388; cookieCityId=110100; __ah_uuid_ng=c_D7FE9717-245E-4F8D-8D42-AAF453D1F470; area=460106; ahpau=1; sessionuid=D7FE9717-245E-4F8D-8D42-AAF453D1F470%7C%7C2019-09-29+16%3A30%3A35.298%7C%7C0; ahsids=3170; sessionip=153.0.3.115; Hm_lvt_9924a05a5a75caf05dbbfb51af638b07=1585205934,1585207311,1585266321; clubUserShow=87236155|692|2|%E6%B8%B8%E5%AE%A2|0|0|0||2020-03-27+08%3A35%3A50|0; clubUserShowVersion=0.1; sessionvid=0F2198AC-5A75-47E2-B476-EAEC2AF05F04; Hm_lpvt_9924a05a5a75caf05dbbfb51af638b07=1585269508; ahpvno=45; v_no=8; visit_info_ad=D7FE9717-245E-4F8D-8D42-AAF453D1F470||0F2198AC-5A75-47E2-B476-EAEC2AF05F04||-1||-1||8; ref=www.baidu.com%7C0%7C0%7C0%7C2020-03-27+08%3A38%3A40.425%7C2019-10-07+22%3A52%3A34.733',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }
    # The series page URL is the last element of brand_url
    url = brand_url[-1]
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Sub-brand
    sub_brand = soup.find("div", {'class': 'cartab-title'}).find("h2").find("a").text
    # Sale-status tabs (on sale, coming soon, discontinued)
    sale_list = soup.find("div", {'class': 'tab tab02 brandtab-cont'}).find(
        "ul", {"data-trigger": "click"}).find_all("li")
    car_sales = []
    for li in sale_list:
        a = li.find("a")
        if a is None:
            continue
        sale_name = a.text
        sale_href = 'https://car.autohome.com.cn' + a.get('href')
        print(sale_href)
        sale_resp = requests.get(sale_href, headers=headers)
        sale_soup = BeautifulSoup(sale_resp.text, 'html.parser')
        # Model groups listed under this sale status
        interval01_list = sale_soup.find("div", id="divSeries").find_all("ul", {"class": "interval01-list"})
        for interval in interval01_list:
            cars_list = interval.find_all("div", {"class": "interval01-list-cars"})
            for car in cars_list:
                car_info = car.find("div", {"class": "interval01-list-cars-infor"}).find("p").find("a")
                car_name = car_info.text
                car_href = 'https:' + car_info.get('href')
                car_resp = requests.get(car_href, headers=headers)
                car_soup = BeautifulSoup(car_resp.text, 'html.parser')
                # Parameter cells on the model's spec page
                param_list = car_soup.find("div", {'class': 'spec-param'}).find(
                    "div", {'class': 'param-list'}).find_all("div", {"class": "cell"})
                # Vehicle class (level)
                car_level = param_list[0].find("p").text
                # Driving range (endurance)
                endurance = param_list[1].find("p").text
                # Energy type
                energy_type = param_list[4].find("p").text
                # Brand: brand_url[0] has the form "<brand>(<count>)", so strip the trailing "(...)"
                car_brand = brand_url[0][:brand_url[0].index('(')].strip()
                # Series: brand_url[1] has the same "<series>(<count>)" form
                car_series = brand_url[1][:brand_url[1].index('(')].strip()
                # Assemble the record for this model
                car_dic = {"brand": car_brand, "sub_brand": sub_brand, "sale_status": sale_name,
                           "series": car_series, "model": car_name, "level": car_level,
                           "endurance": endurance, "energy_type": energy_type,
                           "year": car_name[:car_name.index(' ')],
                           "first_char": pinyin.get(car_brand)[0:1].upper()}
                car_sales.append(car_dic)
    return car_sales
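
# Minimal usage sketch (an assumption, not part of the original source): brand_url is
# expected to be a sequence whose first two items look like "<brand>(<count>)" and
# "<series>(<count>)" and whose last item is the series page URL on car.autohome.com.cn.
# The placeholder values below are illustrative only.
if __name__ == '__main__':
    example_brand_url = ('SomeBrand(3)', 'SomeSeries(2)',
                         'https://car.autohome.com.cn/price/series-XXXX.html')
    for record in crawer_car_model_data(example_brand_url):
        print(record)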