1 Problem Description
Chapters fetched by multiple threads complete in random order. If each thread task writes to the file directly, the chapters in the file end up scrambled as well.
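To see the problem concretely, here is a minimal sketch (illustrative only, not taken from the original code): each "fetch" finishes after a random delay, so the results arrive in completion order rather than chapter order.

```python
import random
import threading
import time

lines = []

def fetchAndWrite(chapter):
    time.sleep(random.random() / 10)          # simulate variable network latency
    lines.append("Chapter " + str(chapter))   # appended in completion order, not chapter order

threads = [threading.Thread(target=fetchAndWrite, args=(n,)) for n in range(1, 6)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(lines)  # e.g. ['Chapter 3', 'Chapter 1', ...] -- varies from run to run
```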
2 Approach
- Since I had recently been studying thread safety in Java, I decided to combine threads with a queue.
The idea:
- First, put the content fetched by the worker threads into a priority queue, keyed by chapter number.
- Once all fetching is done, pop the contents from the priority queue and write them to the file; because each item was inserted with its chapter number as the priority, items come out in chapter order (a short demonstration follows this list).
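The ordering guarantee comes from `queue.PriorityQueue`, which always pops the smallest item first. With `(chapter_number, content)` tuples, chapters come out sorted no matter what order they went in:

```python
import queue

pq = queue.PriorityQueue()
pq.put((3, 'chapter three'))  # inserted out of order
pq.put((1, 'chapter one'))
pq.put((2, 'chapter two'))

while not pq.empty():
    print(pq.get())  # (1, ...), (2, ...), (3, ...): smallest key first
```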
3 Implementation
```python
def getData(baseUrl, totalChapter):
    # Queue every chapter URL, keyed by chapter number (chapters run 1..totalChapter).
    for n in range(1, totalChapter + 1):
        firstUrl = baseUrl + "_" + str(n)
        priQue.put((n, firstUrl))
    # Start five worker threads to fetch chapters concurrently.
    for k in range(5):
        thread = GetThread(k)
        thread.start()
        threadList.append(thread)
    # Wait for every fetch to finish, then write the chapters in order.
    for t in threadList:
        t.join()
    writeFileByOrder()


def writeFileByOrder():
    lockObj.acquire()
    # The priority queue always pops the smallest chapter number first,
    # so the file ends up in chapter order no matter when each fetch finished.
    while not contentPriQue.empty():
        index, content = contentPriQue.get()
        writeToFile(content)
        print('Chapter', index, 'written')
    lockObj.release()
```
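One possible refinement, sketched here as an alternative rather than taken from the original code: instead of joining all workers before writing anything, a dedicated writer can flush each chapter the moment it becomes the next one in sequence, buffering early arrivals in a heap. The names `orderedWriter`, `resultQue`, and `writeFn` are placeholders of mine; only `contentPriQue` and `writeToFile` exist in the post.

```python
import heapq

def orderedWriter(resultQue, totalChapter, writeFn):
    # Buffer chapters that arrive ahead of their turn in a min-heap and
    # flush them as soon as they become contiguous with what was written.
    pending = []   # (chapter_number, content) tuples waiting for their turn
    expected = 1   # the next chapter number allowed to be written
    while expected <= totalChapter:
        heapq.heappush(pending, resultQue.get())  # blocks until a chapter arrives
        while pending and pending[0][0] == expected:
            _, content = heapq.heappop(pending)
            writeFn(content)
            expected += 1
```

Started as its own thread before the workers, e.g. `threading.Thread(target=orderedWriter, args=(contentPriQue, totalChapter, writeToFile)).start()`, this would overlap disk writes with the network fetches instead of doing them strictly afterwards.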
4 Full Source Code
- The complete source is available on GitHub: https://github.com/shinyMT/novel_python
- Or read it directly below (since this was written for learning, the novel site's address and some site-specific details have been redacted).
```python
import time
import random
import re
import queue
import threading
import urllib.request, urllib.error

from bs4 import BeautifulSoup
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 "
"Safari/537.36",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; "
".NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR "
"2.0.50727)",
]
# Matches the text inside each <p>...</p> paragraph of a chapter page.
findContent = re.compile(r'<p>(.*?)</p>')

priQue = queue.PriorityQueue()         # work items: (chapter_number, chapter_url)
contentPriQue = queue.PriorityQueue()  # results: (chapter_number, chapter_text)
threadList = []
lockObj = threading.Lock()
def createHeader():
    # Rotate the User-Agent on each request so the crawler looks less like a bot.
    headers = dict()
    headers["User-Agent"] = random.choice(USER_AGENTS)
    headers["Referer"] = "https://xxx.com"
    return headers
def askUrl(url):
    # Fetch a page and return its decoded HTML, or None if the request failed.
    html = None
    req = urllib.request.Request(url, headers=createHeader())
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
    except urllib.error.URLError as msg:
        if hasattr(msg, "code"):
            print(msg.code)
        if hasattr(msg, "reason"):
            print(msg.reason)
    return html
def getData(baseUrl, totalChapter):
    # Queue every chapter URL, keyed by chapter number (chapters run 1..totalChapter).
    for n in range(1, totalChapter + 1):
        firstUrl = baseUrl + "_" + str(n)
        priQue.put((n, firstUrl))
    # Start five worker threads to fetch chapters concurrently.
    for k in range(5):
        thread = GetThread(k)
        thread.start()
        threadList.append(thread)
    # Wait for every fetch to finish, then write the chapters in order.
    for t in threadList:
        t.join()
    writeFileByOrder()
def analysisHTML(url):
    html = askUrl(url)
    soup = BeautifulSoup(html, "html.parser")
    return soup
def getPageNum(url):
    # A multi-page chapter's title ends in "(current/total)"; a title
    # without that suffix means the chapter fits on a single page.
    soup = analysisHTML(url)
    title = soup.select('h1[class="article-title"]')[0].string
    try:
        num = str(title).split('/')[1].split(')')[0]
    except IndexError:
        num = 1
    return num
def writeToFile(content):
    # Append to the output file; declare the encoding explicitly so the
    # result does not depend on the platform's default codec.
    with open('D:\\测试.text', 'a+', encoding='utf-8') as f:
        f.write(content)


def getTime():
    currentTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    return currentTime
class GetThread(threading.Thread):
    def __init__(self, threadId):
        threading.Thread.__init__(self)
        self.threadId = threadId

    def run(self):
        # Pull chapter URLs until the work queue is drained. get_nowait()
        # avoids the race between empty() and get() when several workers
        # hit the queue at the same time.
        while True:
            try:
                index, firstUrl = priQue.get_nowait()
            except queue.Empty:
                break
            fileContent = "第" + str(index) + "章\n"  # chapter heading ("Chapter N")
            # A chapter may span several pages: <firstUrl>_<page>.html
            pageNum = getPageNum(firstUrl)
            for j in range(1, int(pageNum) + 1):
                detailUrl = firstUrl + "_" + str(j) + ".html"
                soup = analysisHTML(detailUrl)
                for item in soup.select('div[class="article-con"]'):
                    page = str(item)
                    content = re.findall(findContent, page)
                    for sentence in content:
                        # Strip full-width indentation spaces; the regex capture
                        # excludes the <p> tags, so add the paragraph break here.
                        single = str(sentence).replace('\u3000', '')
                        fileContent += single + '\n'
            fileContent += '\n'  # blank line between chapters
            contentPriQue.put((index, fileContent))
def writeFileByOrder():
    # Every worker has been joined by the time this runs, so the lock is
    # only a safeguard in case it is ever called from more than one thread.
    lockObj.acquire()
    while not contentPriQue.empty():
        index, content = contentPriQue.get()
        writeToFile(content)
        print('Chapter', index, 'written')
    lockObj.release()
def main(totalChapterNum):
    baseUrl = "https://xxx.com/read_xx"
    getData(baseUrl, totalChapterNum)


if __name__ == '__main__':
    main(8)
```
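As an aside, `getPageNum` infers the page count from the article title. Since the real site is redacted, the exact title format is an assumption; judging from the `split('/')` / `split(')')` parsing, a multi-page chapter's title appears to end in "(current/total)":

```python
# Hypothetical title, inferred from getPageNum's parsing logic.
title = "Chapter 1 Something(2/5)"
num = title.split('/')[1].split(')')[0]
print(num)  # '5' -- the chapter spans five pages
```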
Note: 1. This article is for learning and reference only; do not use it for anything illegal. 2. It only reflects my personal views; corrections for any mistakes are welcome.