问题描述: python3.7 学习多线程的时候,一个问题,打印出来的结果为什么是这样的?
import threading
from queue import Queue
from lxml import etree
import requests
import json
class ThreadCrawl(threading.Thread):
def __init__(self, thread_name, page_queue, data_queue):
super(ThreadCrawl, self).__init__()
self.thread_name = thread_name
self.page_queue = page_queue
self.data_queue = data_queue
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
def run(self):
print("现在启动的线程名字是: " + self.thread_name)
while not CRAWL_EXIT:
try:
'''
一:队列为空,且 block 值为 True (默认值为 True) ,就会进入阻塞状态,直到队列有新的数据 (不会结束)
二:队列为空,block 值为 False, 报 Queue.empty() 异常
这里只放了 10 个页,所以选择第二种
'''
page = self.page_queue.get(False)
print(page)
url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
content = requests.get(url, headers=self.headers)
self.data_queue.put(content)
except Exception as e:
print("出错了: " + e)
pass
CRAWL_EXIT = False
PARSE_EXIT = False
def main():
page_queue = Queue(10)
for i in range(1, 11):
page_queue.put(i)
data_queue = Queue()
crawl_list = ["采集线程1号", "采集线程2号", "采集线程3号"]
thread_crawl = []
for thread_name in crawl_list:
thread = ThreadCrawl(thread_name, page_queue, data_queue)
thread.start()
thread_crawl.append(thread)
while not page_queue.empty():
print(str(page_queue.get()) + "队列还没有空")
global CRAWL_EXIT
CRAWL_EXIT = True
print("page_queue 为空")
'''
潜在线程
守护线程: 主线程结束,子线程也结束
所以下面创建非守护线程, 手动加一个阻塞状态
'''
for thread in thread_crawl:
print("等待线程 " + str(thread.thread_name) + " 的执行完成")
thread.join()
if __name__ == "__main__":
main()
为什么 1 , 2 , 4 , 7 没有正常 打印,字都省掉了?
问题解决: 待解决
|