# This example is for study purposes only; do not use it for anything else!
import os
import re
import random
import threading

import requests
from bs4 import BeautifulSoup  # the 'lxml' package must be installed as the parser backend
class Get_html:
    def __init__(self):
        self.headers = self.get_headers()
        self.lst = []

    def get_headers(self):
        # Pool of crawler User-Agent strings; one is picked at random per call.
        ua = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        # The Cookie below is a captured session value; it will expire and may
        # need to be replaced with a fresh one from your own browser session.
        headers = {
            'User-Agent': random.choice(ua),
            'Cookie': 'kfz_uuid=4f06a81d8a81f5b256c7a2e6a89bdab8; PHPSESSID=51qaqh9hvd8e7ni2opq9sebop3; shoppingCartSessionId=856640faec4cf06b5ee437ad09fac1a3; kfz-tid=fe09c868b427ce1d411020c95c147719; TINGYUN_DATA=%7B%22id%22%3A%22XMf0fX2k_0w%23nUhCMQN2SSk%22%2C%22n%22%3A%22WebAction%2FURI%2Fproduct%252Fbrowse%252Fpc%22%2C%22tid%22%3A%222d44bda41e0e86a%22%2C%22q%22%3A0%2C%22a%22%3A1732%7D; Hm_lvt_bca7840de7b518b3c5e6c6d73ca2662c=1638946047; Hm_lpvt_bca7840de7b518b3c5e6c6d73ca2662c=1638946047; Hm_lvt_33be6c04e0febc7531a1315c9594b136=1638946048; Hm_lpvt_33be6c04e0febc7531a1315c9594b136=1638946048; kfz_trace=4f06a81d8a81f5b256c7a2e6a89bdab8|0|16abc3df3300221b|-; reciever_area=1006000000'
        }
        return headers
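    # Minimal usage sketch: each call to get_headers() picks a fresh User-Agent,
    # so retrying a blocked request with rotated headers is just another call:
    #   g = Get_html()
    #   resp = requests.get('https://book.kongfz.com/Cwenxue/', headers=g.get_headers())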
    def book_type(self):
        lst_name = []
        lst_type = []
        url = 'https://book.kongfz.com/Cwenxue/'
        # Sample proxy addresses; these are almost certainly stale by now.
        ip = [
            '139.213.138.156:4210',
            '113.241.139.30:4230',
            '182.136.102.237:4245',
        ]
        proxies = {
            'http': 'http://{}'.format(random.choice(ip)),
            'https': 'https://{}'.format(random.choice(ip)),
        }
        # To route through a proxy, add proxies=proxies to the call below; it is
        # left off here because the sample addresses above are likely dead.
        html = requests.get(url, headers=self.get_headers()).text
        bs = BeautifulSoup(html, 'lxml')
        links = bs.select('div.link-item a')
        for link in links:
            href = link['href']
            # Hrefs look like https://book.kongfz.com/C<code>/ ; slice off the
            # fixed 25-character prefix and the trailing slash to get the code.
            type_code = href[25:-1]
            name = link.text
            lst_type.append(type_code)
            lst_name.append(name)
        for a, b in zip(lst_type, lst_name):
            self.lst.append({'type': a, 'name': b})
        return self.lst
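    # book_type() returns self.lst, a list of dicts of the form
    # [{'type': '<code sliced from the href>', 'name': '<link text>'}, ...]
    # (assuming the category page keeps its current div.link-item layout).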
    def save_html(self, n):
        # Download page n of every category and save it under
        # 孔夫子旧书网/<category name>/<n>.html for later offline parsing.
        for type1 in self.lst:
            url = f'https://book.kongfz.com/C{type1["type"]}/w{n}/'
            html = requests.get(url, headers=self.headers)
            html.encoding = 'utf-8'
            folder = os.path.join('孔夫子旧书网', type1['name'])
            os.makedirs(folder, exist_ok=True)  # one sub-folder per category
            path = os.path.join(folder, f'{n}.html')
            with open(path, 'w', encoding='utf-8') as f:
                f.write(html.text)
            print(path, 'saved!')
class Save:
    def __init__(self):
        self.zuozhe = ''     # author
        self.name = ''       # book title
        self.url = ''        # detail-page link
        self.chubanshe = ''  # publisher

    def read(self, n, h):
        for h1 in h:
            folder = os.path.join('孔夫子旧书网', h1['name'])
            with open(os.path.join(folder, f'{n}.html'), 'r', encoding='utf-8') as f:
                text = f.read()
            resp = re.compile(r'<div class="title" title="(.*?)">\s*<a href="(.*?)"')
            title = resp.findall(text)
            resp1 = re.compile(r'<div class="zl-isbn-info">\s*<span class="text">(.*?)</span>\s*<span class="text">(.*?)</span>')
            title2 = resp1.findall(text)
            # Both findall lists run in page order, so zip keeps each title
            # paired with its own author/publisher entry instead of reusing
            # the last entry for every row.
            for title1, title3 in zip(title, title2):
                self.name, self.url = title1
                self.zuozhe, self.chubanshe = title3
                # Note: fields that themselves contain commas will break this
                # naive comma-joined CSV format.
                content = f'{self.name},{self.url},{self.zuozhe},{self.chubanshe}\n'
                try:
                    # 'ANSI' maps to the local code page on Windows only; use
                    # 'gbk' or 'utf-8-sig' on other platforms.
                    with open('孔夫子旧书网.csv', 'a+', encoding='ANSI') as f:
                        f.write(content)
                except UnicodeEncodeError:
                    print('Skipped a row that could not be encoded.')
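# A minimal alternative to the regexes above, using the already-imported
# BeautifulSoup. The selector ('div.title' with its title attribute and child
# link) is an assumption based on the markup the regexes target, so verify it
# against a saved page before relying on this. Not called by the script.
def parse_with_bs(text):
    bs = BeautifulSoup(text, 'lxml')
    rows = []
    for item in bs.select('div.title'):
        a = item.select_one('a')
        if a is not None:
            rows.append((item.get('title', ''), a.get('href', '')))
    return rows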
if __name__ == '__main__':
    os.makedirs('孔夫子旧书网', exist_ok=True)
    html = Get_html()
    h = html.book_type()
    with open('孔夫子旧书网.csv', 'a+', encoding='ANSI') as f:
        f.write('书名,url,作者,出版社\n')  # header row needs its own line
    # Cap concurrent page fetches at five; the semaphore must actually be
    # acquired by each worker, not just created.
    sem = threading.BoundedSemaphore(5)

    def worker(page):
        with sem:
            html.save_html(page)

    threads = [threading.Thread(target=worker, args=(n,)) for n in range(1, 20)]
    for t in threads:
        t.start()
    # Wait for every download to finish before parsing the saved pages.
    for t in threads:
        t.join()
    save_csv = Save()
    for n in range(1, 20):
        save_csv.read(n, h)
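# Alternative concurrency sketch (not called above): the same fan-out with a
# bounded worker pool from the standard library, passing in the Get_html
# instance created in the main block.
def save_with_pool(crawler, pages=range(1, 20), workers=5):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # map blocks until every page has been fetched and saved
        list(pool.map(crawler.save_html, pages))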