After some study, I ran a novel-scraping experiment with requests: requests fetches the pages, bs4 extracts the target information, and re handles the data cleaning. The code is rough and for reference only.
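Before the full script, here is a minimal sketch of the same three-step pipeline on a single page (the URL and selector below are placeholders, not the actual target site):

import re
import requests
from bs4 import BeautifulSoup

url = 'https://example.com/page'                        # placeholder target
html = requests.get(url, timeout=(3, 7)).text           # 1. fetch with requests
nodes = BeautifulSoup(html, 'html.parser').select('p')  # 2. extract with bs4
texts = [re.sub('<.*?>', '', str(n)) for n in nodes]    # 3. clean tags with re
print('\n'.join(texts))

The complete spider follows: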
import requests
import re
import os
import time
import random
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

class NovelSpider(object):
    def __init__(self):
        self.url = 'URL'  # placeholder
        self.spider_end = ''  # set to '1' when a full run completes

    def get_html(self, url):
        # Fetch a page with a random User-Agent; timeout is (connect, read) seconds.
        headers = {'User-Agent': UserAgent().random}
        res = requests.get(url=url, headers=headers, timeout=(3, 7))
        html = res.text
        return html

    def parse_html(self, html, expression):
        # Extract the nodes matching a CSS selector.
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.select(expression)
        return text

    def save_html(self, filename, html):  # 1. Save path: the D:/request folder must exist beforehand
        with open(r'D:/request/' + filename, 'w', encoding='utf-8') as f:
            for i in html:
                j = str(i)
                j = j[3:-4]  # strip the wrapping <p> and </p> tags
                f.write(j + '\n')

    def run(self):
        url1 = 'novel catalog URL'  # 2. URL of the novel's chapter list (placeholder)
        name = 'novel title'        # 3. title of the novel (placeholder)
        expression1 = '#content p'  # selector for chapter body paragraphs
        expression2 = '#list dd'    # selector for chapter-list entries
        html2 = self.get_html(url1)
        html2 = self.parse_html(html2, expression2)
        re_list = []   # relative chapter links, pulled from href="..."
        re_list2 = []  # chapter titles, pulled from between > and <
        pattern1 = re.compile('"(.*?)"', re.S)
        for i in html2:
            re_list.append(pattern1.findall(str(i)))
        number = len(re_list)
        pattern2 = re.compile('>(.*?)<', re.S)
        for i in html2:
            re_list2.append(pattern2.findall(str(i)))
        file_count = 0  # number of chapters already saved
        for dirpath, dirnames, filenames in os.walk(r'D:/request'):
            for file in filenames:
                file_count = file_count + 1
        # Resume from the last saved chapter if a previous run was interrupted.
        begin = 1
        if file_count != 0:
            begin = file_count
        for page in range(begin, number + 1):
            re_list_middle = str(re_list[page - 1])
            re_list_middle2 = str(re_list2[page - 1])
            url = 'novel site root URL' + re_list_middle[2:-2]  # placeholder host + relative link
            html = self.get_html(url)
            html = self.parse_html(html, expression1)
            # Drop characters Windows forbids in filenames.
            prohibit_str = ['\\', '/', ':', '?', '*', '|', '"', "'", '.']
            re_list_middle3 = ''
            for i in re_list_middle2:
                if i in prohibit_str:
                    pass
                else:
                    re_list_middle3 = re_list_middle3 + i
            filename = r'{}-{}.txt'.format(name, re_list_middle3[3:-3])
            self.save_html(filename, html)
            print(filename)
            time.sleep(random.randint(1, 2))  # random pause between chapters
        self.spider_end = '1'  # mark the run as complete

if __name__ == '__main__':
    start = time.time()
    while True:
        try:
            spider = NovelSpider()
            spider.run()
            if spider.spider_end == '1':
                break
        except Exception:
            pass  # on any error, restart and resume from the files already saved
    end = time.time()
    print('Scraping finished')
    print('Elapsed time: %.2f' % (end - start))
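
One caveat on the driver loop: the bare except/pass swallows every error, including genuine bugs, and can spin forever if the site is unreachable. A possible alternative (my sketch, not part of the original script; MAX_RETRIES is an assumed cap) retries only on network errors and bounds the number of attempts:

import time
import requests

MAX_RETRIES = 5  # assumed cap, tune as needed
for attempt in range(MAX_RETRIES):
    try:
        spider = NovelSpider()
        spider.run()
        if spider.spider_end == '1':
            break
    except requests.exceptions.RequestException as e:
        # Only network-level failures trigger a retry; real bugs still surface.
        print('network error, retrying:', e)
        time.sleep(3)
else:
    print('giving up after', MAX_RETRIES, 'attempts')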
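The character-by-character filename cleanup in run() could also be collapsed into a single regex substitution; this sketch (safe_filename is a hypothetical helper) strips the same forbidden characters as prohibit_str:

import re

def safe_filename(s):
    # Same character set as prohibit_str in run().
    return re.sub(r'[\\/:?*|"\'.]', '', s)

print(safe_filename('Chapter 1: "Begin"?'))  # -> Chapter 1 Begin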