import csv
import os
from lxml import etree
import requests
# HTTP request headers passed as ``headers=`` to the ``requests.get`` calls in
# this script; the User-Agent string identifies as desktop Chrome 86 on
# Windows 10.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/86.0.4240.198 Safari/537.36'
    ),
}
def get(url, base_dir="C:\\豆瓣", csv_path="douban.csv", timeout=10):
    """Scrape one book-list page and save what was found to disk.

    The page at *url* is parsed for book titles, author lines, comment
    counts, detail-page links and cover-image URLs.  For every book a
    folder named after the title is created under *base_dir*; it receives
    the cover image (``<title>.jpg``) and a text file (``<title>.txt``)
    holding the second text node found under the detail page's
    ``div.intro`` elements (empty when fewer than two nodes exist).
    After all books are processed the collected records are written to
    *csv_path* with a header row.

    Re-running is safe: existing folders are reused and the image, text
    and CSV files are overwritten rather than appended to.

    Args:
        url: Address of the list page to scrape.
        base_dir: Directory under which the per-book folders are created.
        csv_path: Destination of the CSV summary; overwritten on each call.
        timeout: Seconds to wait on each HTTP request before
            ``requests`` raises a timeout error instead of blocking.

    Returns:
        A list of dicts, one per book, keyed by ``书名``, ``作者``,
        ``评论数``, ``链接`` and ``图片地址``.
    """
    fields = ('书名', '作者', '评论数', '链接', '图片地址')
    books = []

    res = requests.get(url, headers=header, timeout=timeout)
    html = etree.HTML(res.content)
    titles = html.xpath('//div[@class="pl2"]/a/@title')
    authors = html.xpath('//p[@class="pl"]/text()')
    evaluates = html.xpath('//span[@class="pl"]/text()')
    links = html.xpath('//a[@class="nbg"]/@href')
    jpgs = html.xpath('//a[@class="nbg"]/img/@src')

    for title, author, evaluate, link, jpg in zip(
            titles, authors, evaluates, links, jpgs):
        books.append({
            '书名': title,
            '作者': author,
            # The comment-count text is padded with newlines and spaces.
            '评论数': evaluate.replace('\n', '').replace(" ", ''),
            '链接': link,
            '图片地址': jpg,
        })

        # Titles may contain characters that are illegal in Windows paths,
        # so the folder and file names use a sanitised copy of the title.
        safe_title = _safe_name(title)
        book_dir = os.path.join(base_dir, safe_title)
        os.makedirs(book_dir, exist_ok=True)

        pic = requests.get(jpg, headers=header, timeout=timeout)
        detail_res = requests.get(link, headers=header, timeout=timeout)
        detail_html = etree.HTML(detail_res.text)
        jianjie = detail_html.xpath('//div[@class="intro"]//text()')
        print(books)

        # 'wb' rather than 'ab': appending a second download to an existing
        # image would leave a corrupt file.
        with open(os.path.join(book_dir, safe_title + '.jpg'), 'wb') as f:
            f.write(pic.content)

        # Guard the index instead of letting a short intro raise IndexError.
        intro = jianjie[1] if len(jianjie) > 1 else ''
        # errors='replace' keeps the write from failing on unencodable text.
        with open(os.path.join(book_dir, safe_title + '.txt'), 'w',
                  encoding='utf-8', errors='replace') as f:
            f.write(intro)

    # DictWriter emits the dict *values*; passing a dict to a plain
    # csv.writer row would iterate it and write only the keys.
    with open(csv_path, 'w', newline="", encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        writer.writerows(books)
    return books


def _safe_name(title):
    """Return *title* made usable as a single Windows path component."""
    forbidden = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    # Windows also rejects names that end in a dot or a space.
    cleaned = title.translate(forbidden).rstrip(". ")
    return cleaned or "_"
# Walk the listing by its ``start`` offset in steps of 25.  With a stop value
# of 25 the range yields only offset 0, so exactly one page is requested.
for n in range(0, 25, 25):
    url = "https://book.douban.com/top250?start=" + str(n)
    a = get(url)
|