学习记录:这个爬得太慢了~~~慢,慢,慢........
网址:斗罗大陆5重生唐三最新章节_斗罗大陆5重生唐三全文免费阅读-笔趣阁
找到正文:
代码:
"""
2022年
CSDN:抄代码抄错的小牛马
"""
import requests
from lxml import etree
def get_url(url='https://www.52bqg.net/48_48686/', timeout=10):
    """Fetch the novel's chapter-index page and return its HTML as text.

    Args:
        url: Address of the chapter-index page. Defaults to the index of
            the book this script targets.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as GBK.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
    }
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the parser.
    response.raise_for_status()
    # Force GBK decoding rather than relying on the encoding requests guesses.
    response.encoding = 'gbk'
    return response.text
def down_load(content):
    """Download every chapter listed in the index page and append it to a file.

    Each chapter's title and cleaned body text are appended to
    ``斗罗大陆.txt`` (UTF-8) in the order the links appear in the index, and
    a progress line is printed per chapter.

    Args:
        content: HTML text of the chapter-index page.

    Raises:
        requests.RequestException: On connection failure or timeout while
            fetching a chapter.
    """
    # Local import keeps this block self-contained; urljoin is stdlib.
    from urllib.parse import urljoin

    base_url = 'https://www.52bqg.net/48_48686/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
    }

    index_tree = etree.HTML(content)
    # The first 12 links are skipped; the main text starts at index 12.
    links = index_tree.xpath('//div[@id="list"]/dl/dd/a ')[12:]

    # One Session reuses the underlying connection for all chapter requests
    # instead of reconnecting per chapter; the file is opened once for the
    # whole run instead of once per chapter.
    with requests.Session() as session, \
            open("斗罗大陆.txt", 'a', encoding='utf-8') as f:
        session.headers.update(headers)
        for link in links:
            title_nodes = link.xpath('.//text()')
            href_nodes = link.xpath('./@href')
            if not title_nodes or not href_nodes:
                # An anchor without text or href cannot be downloaded; skip
                # it instead of raising IndexError.
                continue
            title = title_nodes[0]
            # urljoin handles both relative ("123.html") and root-relative
            # ("/48_48686/123.html") hrefs; plain concatenation only works
            # for the former.
            chapter_url = urljoin(base_url, href_nodes[0])

            response = session.get(chapter_url, timeout=10)
            response.encoding = 'gbk'
            page = etree.HTML(response.text)
            paragraphs = (
                page.xpath('//div[@id="content"]/text()')
                if page is not None else []
            )
            # Strip the site's banner line, the author's announcement and
            # spaces from the chapter body.
            text = "\n".join(paragraphs).replace(
                '笔趣阁 www.52bqg.net,最快更新斗罗大陆5重生唐三 !', '').replace(
                '新书上传,希望大家可以先收藏、推荐,正式连载将于5月20号。', '').replace(' ', '')

            f.write(title + "\n")
            f.write(text + "\n")
            print('%s下载成功' % title)
if __name__ == '__main__':
    # Step 1: fetch the HTML source of the chapter-index page.
    # Step 2: parse it, download each chapter and save the text locally.
    down_load(get_url())
运行:
查看:
.?
|