import os
import re

import requests
from lxml import etree

# Browser-like User-Agent header attached to every HTTP request in this script.
ua = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

# Chapter-index page of the novel to download.
url = 'https://www.bswtan.com/0/424/'

# Switch the working directory at import time so that the relative output
# path used when saving chapters resolves under this folder.
os.chdir(r"C:\Users\Administrator\Desktop")
def get_info(url, timeout=10):
    """Fetch the chapter index page and return its list of chapters.

    Args:
        url: Address of the index page whose ``#list`` element holds one
            ``<dd><a href=...>title</a></dd>`` entry per chapter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{'title': ..., 'href': ...}`` dicts in page order, where
        ``href`` is an absolute URL. Entries without a link are skipped.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # Local import keeps this block self-contained (stdlib only).
    from urllib.parse import urljoin

    r = requests.get(url, headers=ua, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the index.
    r.raise_for_status()
    r.encoding = 'utf-8'

    html = etree.HTML(r.text)
    if html is None:
        # Some lxml versions return None for an empty document.
        return []

    get_info_list = []
    for dd in html.xpath('//*[@id="list"]/dl/dd'):
        titles = dd.xpath('a/text()')
        hrefs = dd.xpath('a/@href')
        if not titles or not hrefs:
            # A <dd> without a titled link cannot be downloaded; skip it
            # rather than crash with IndexError.
            continue
        get_info_list.append({
            'title': titles[0],
            # Resolve against the URL actually served (after redirects) so
            # relative, root-relative and absolute hrefs all work.
            'href': urljoin(r.url, hrefs[0]),
        })
    return get_info_list
def get_content(get_info, timeout=10, out_dir='./武动乾坤/'):
    """Download every listed chapter and save each one as a UTF-8 text file.

    Args:
        get_info: Iterable of dicts with ``'title'`` and ``'href'`` keys.
            The name shadows the module-level ``get_info`` function inside
            this body; it is kept so existing keyword callers still work.
        timeout: Seconds to wait for each chapter request.
        out_dir: Directory that receives one ``<title>.txt`` per chapter;
            created if it does not exist.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status for any chapter.
        OSError: If the output directory or a chapter file cannot be written.
    """
    # open() does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    # Compiled once outside the loop. re.S lets '.' cross newlines so a
    # chapter body laid out over several lines is still captured.
    content_re = re.compile(r'<div id="content">(.*?)</div>', re.S)
    # Characters that are illegal in Windows file names; '/' is a path
    # separator everywhere. Each is replaced with '_'.
    unsafe_re = re.compile(r'[\\/:*?"<>|]')

    for chapter_info in get_info:
        r1 = requests.get(url=chapter_info['href'], headers=ua, timeout=timeout)
        # Without this an error page would be saved as an empty chapter
        # and still be reported as a success.
        r1.raise_for_status()
        r1.encoding = 'utf-8'
        contents = content_re.findall(r1.text)

        file_name = unsafe_re.sub('_', chapter_info['title']) + '.txt'
        with open(os.path.join(out_dir, file_name), 'w', encoding='utf-8') as f:
            for content in contents:
                # NOTE(review): only literal spaces are removed here; if the
                # page indents with '&nbsp;' entities they are left in the
                # output - confirm against the live markup.
                f.write(content.replace(' ', '').replace('<br/><br/>', '\n').strip())
        print('下载成功')


if __name__ == '__main__':
    get_content(get_info(url))
|