import os

import requests
from bs4 import BeautifulSoup

# The chapter index page's source contains links to every chapter; extract
# those links, then fetch and save each chapter.
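# A minimal sketch of the link extraction used below:
#   page = BeautifulSoup('<dd><a href="/0/166/1.html">Chapter 1</a></dd>',
#                        'html.parser')
#   a = page.find('a')
#   a.text     ->  'Chapter 1'
#   a['href']  ->  '/0/166/1.html'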
def save_book(chapters, name):
    os.makedirs('a', exist_ok=True)  # make sure the output directory exists
    with open(f'a/{name}', 'w', encoding='utf-8') as file:
        for line in chapters:
            # split(' ') splits on single spaces; writing the pieces back
            # without separators strips those spaces (in this Chinese text
            # they only appear as paragraph indentation)
            for token in line.split(' '):
                if token.startswith('<div'):  # drop the stray <div ...></div> at the start of each chapter
                    token = ''
                token = token.replace('<p></p>', '')  # drop the stray <p></p> at the end of each chapter
                file.write(token)
            file.write('\n')  # newline after each line to keep the text readable
    print(name + ' saved')
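# A quick illustration of the split behaviour save_book relies on (values
# shown are what Python actually returns):
#   '  foo bar'.split(' ')  ->  ['', '', 'foo', 'bar']   # one piece per single space
#   '  foo bar'.split()     ->  ['foo', 'bar']           # whitespace runs collapsed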
def download(url, name):
    lines = []
    resp = requests.get(url)
    text = resp.text.replace('<br />', '')  # strip the <br /> tags up front
    page = BeautifulSoup(text, 'html.parser')
    content = page.find('div', attrs={'id': 'content'})
    for node in content:
        st = str(node)
        # <br /> was already removed above, so this split never matches;
        # the guard is kept only as a safety net against raw line breaks
        if len(st.split('<br />')) > 1:
            pass
        else:
            lines.append(st)
    save_book(lines, name)
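# For reference: iterating a BeautifulSoup Tag yields its direct children in
# document order, both nested tags and bare text nodes. A minimal sketch:
#   div = BeautifulSoup('<div id="content">foo<p>bar</p></div>',
#                       'html.parser').find('div')
#   list(div)  ->  ['foo', <p>bar</p>]
# which is why the loop above sees each chapter line as a separate node.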
if __name__ == '__main__':
    url = 'https://www.lewentxt.com/0/166/'
    resp = requests.get(url)
    page = BeautifulSoup(resp.text, 'html.parser')
    table = page.find('div', attrs={'class': 'listmain'})
    dds = table.find_all('dd')
    # site root; the chapter hrefs extracted from the index page are
    # relative, so they get joined onto this
    url1 = url.rsplit('/', 3)[0]
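    # rsplit('/', 3) splits from the right at most three times:
    #   'https://www.lewentxt.com/0/166/'.rsplit('/', 3)
    #   ->  ['https://www.lewentxt.com', '0', '166', '']
    # so index [0] is the site root.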
    # skip the first six <dd> entries (presumably a "latest chapters" box
    # that duplicates entries further down the list)
    for dd in dds[6:]:
        a = dd.find('a')
        name = a.text
        href = a['href']
        url2 = url1 + href
        download(url2, name)