import requests
from bs4 import BeautifulSoup
from concurrent import futures
from multiprocessing import Process
import time
import re
import random
from collections import deque
from queue import Queue
def chapterUrl_list():
    """Collect chapter links from the locally saved index page ``ss.html``.

    The page is parsed with the ``html.parser`` backend. Only anchors inside
    the ``div`` whose class is ``listmain`` are considered, and only those
    whose ``href`` contains the substring ``9832`` are kept.

    Returns:
        Queue: the matching ``href`` values, in the order ``find_all``
        yields them.

    Raises:
        OSError: if ``ss.html`` cannot be opened.
        AttributeError: if the page has no ``div`` with class ``listmain``
            (``soup.find`` returns ``None`` in that case).
    """
    # Close the file handle deterministically instead of leaking it.
    with open('ss.html', encoding='utf-8') as index_page:
        soup = BeautifulSoup(index_page, features='html.parser')

    chapterurllist = Queue()
    chapterurls = soup.find('div', attrs={"class": "listmain"}).find_all('a')
    for anchor in chapterurls:
        href = anchor.get('href')
        # Skip anchors without a target and links that do not carry the
        # '9832' marker.
        if href is None or '9832' not in href:
            continue
        chapterurllist.put(href)
    return chapterurllist
def write2txt(result):
    """Append one downloaded chapter to ``打更人.txt``.

    Args:
        result: a future whose ``result()`` is the HTTP response for a
            chapter page (it must expose ``apparent_encoding``, ``encoding``
            and ``text``).

    The chapter title is taken from the first ``h1`` element and the body
    from every ``div`` with class ``showtxt``. For each such div a progress
    line is printed and ``"\\n<title>\\n<text>"`` is appended to the file.

    Raises:
        Exception: whatever the download raised is re-raised by
            ``result.result()``.
        AttributeError: if the page contains no ``h1`` element.
    """
    response = result.result()
    # Decode with the encoding detected from the body rather than the one
    # declared by the server.
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "lxml")
    title = soup.find('h1').get_text()
    contents = soup.find_all(name='div', attrs={'class': 'showtxt'})
    if not contents:
        # Nothing to write; do not touch (or create) the output file.
        return

    # Open the output once per chapter and let the context manager close it
    # even if a write fails.
    with open(r'打更人.txt', 'a', encoding='utf-8') as out_file:
        for txt in contents:
            print('title-----------------------------' + title)
            out_file.write('\n' + title + '\n' + txt.text)
# Pool of User-Agent strings. One is chosen at random when the script starts
# and is then used for every request made through the shared session.
header = [
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
]
# random.choice stays correct if entries are added to or removed from the
# list, unlike a hard-coded index range.
headers = {'User-Agent': random.choice(header)}
session = requests.Session()
session.headers.update(headers)

url_ls = chapterUrl_list()
print('--------------------------------------------------------------------------------------')
print(url_ls.qsize())
print('--------------------------------------------------------------------------------------')

# Pause between consecutive download submissions. Because the main loop
# sleeps this long for every URL, overall throughput is capped at roughly one
# chapter per interval no matter how many download workers exist; lower the
# value to go faster (at the cost of hitting the server harder).
REQUEST_INTERVAL_SECONDS = 1

# `executor` downloads pages concurrently; `ex` has a single worker so that
# chapters are appended to the output file in submission order. The `with`
# blocks wait for all outstanding work and release the threads on exit.
with futures.ThreadPoolExecutor(max_workers=3) as executor:
    with futures.ThreadPoolExecutor(max_workers=1) as ex:
        pending_writes = []
        while not url_ls.empty():
            chapter_url = url_ls.get()
            ft = executor.submit(session.get, chapter_url)
            time.sleep(REQUEST_INTERVAL_SECONDS)
            # write2txt blocks on ft.result(), so the download future can be
            # handed straight to the writer without an intermediate queue.
            pending_writes.append((chapter_url, ex.submit(write2txt, ft)))

        # Exceptions raised inside a worker are stored on its future; inspect
        # them so failed chapters are reported instead of vanishing silently.
        for chapter_url, write_future in pending_writes:
            error = write_future.exception()
            if error is not None:
                print('failed to save chapter ' + str(chapter_url) + ': ' + repr(error))
# PS: 第一次写 Python 脚本，嘿嘿嘿嘿，乱写的。参考了好多前辈的代码，终于不知道怎么地它就能运行了。用了多线程，然而又好像没有多线程效果：爬 1000 章用了接近 15 分钟……头蒙，有机会再添砖加瓦改进吧。