[Python知识库] 【Python多线程】听力

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> Python知识库 -> 【Python多线程】听力 -> 正文阅读

[Python知识库]【Python多线程】听力

导入模块

import requests
import random
from pyquery import PyQuery as pq
import re
import threading
import os

设置ua（即下文各函数都会用到的全局变量 headers），这里不赘述。

这里每一个选项都是一个列表页，首先我们获取所有列表页的url。

这里用pyquery找到所有a节点后，调用items()得到一个生成器，之后用列表解析式直接返回一个列表。

def get_page():
	"""Fetch the TPO listening index page and return its list-page URLs.

	Relies on the module-level ``headers`` mapping (defined elsewhere in the
	file) for the request headers.

	Returns:
		list[str]: The ``href`` of every anchor matched by the
		``.cssTopTitleList.clearfix a`` selector, in document order.
		Anchors that carry no ``href`` are skipped.

	Raises:
		requests.exceptions.RequestException: If the request fails or
			times out.
	"""
	url = 'https://top.zhan.com/toefl/listen/alltpo.html'
	# Without a timeout a stalled connection would block the crawl forever.
	response = requests.get(url=url, headers=headers, timeout=30)
	doc = pq(response.text)
	# .items() yields one PyQuery wrapper per matched node. An anchor without
	# an href would contribute None, which cannot be fetched later, so drop it.
	anchors = doc('.cssTopTitleList.clearfix a').items()
	urls = [anchor.attr.href for anchor in anchors if anchor.attr.href]
	print(urls)
	return urls

接下来，用正则表达式找到每个列表页中所有详情页（每个tpo中的每一个大题）的url。

用正则表达式找到，findall返回一个列表。

def get_tpo_url(url):
	"""Return the detail-page URLs (one per question set) found on a list page.

	Args:
		url: URL of one list page.

	Returns:
		list[str]: Every ``href`` captured by the "学习/回顾" link pattern,
		in page order. An empty list is returned when the response status
		is not 200, so callers can always iterate or ``extend()`` the result.

	Raises:
		requests.exceptions.RequestException: If the request fails or
			times out.
	"""
	# Without a timeout a stalled connection would block the crawl forever.
	response = requests.get(url=url, headers=headers, timeout=30)
	if response.status_code != 200:
		# Returning None here would make the caller's list.extend() raise.
		return []
	# Raw string: \s and \S are regex escapes, not Python string escapes.
	url_pattern = re.compile(r'<a class="md_click sensors_maidian cssReview"[\s\S]*?href="(.*?)" target="_blank" event_type="E_5_14">学习/回顾</a>')
	urls = url_pattern.findall(response.text)
	# All single-set page URLs of this list page.
	print(urls)
	return urls

进到每个大题里面，发现还有5-6道小题，每道题都有自己的url，这里找到每个大题中所有小题的url，并汇总成一个列表交给parse_page处理。

# Collect the URLs of all sub-questions inside one set page and pass them on.
def get_question_page(url):
	"""Find every sub-question URL of one set page and hand them to parse_page.

	Intended to run as a worker thread whose concurrency slot was acquired
	from the module-level ``semaphore`` before the thread started. On the
	normal path ``parse_page`` releases that slot; on the paths where
	``parse_page`` is never reached this function releases it itself, so a
	failed set does not permanently consume a slot.

	Args:
		url: URL of one set (a "big question") page.

	Raises:
		requests.exceptions.RequestException: If the request fails or
			times out (re-raised after the slot has been released).
	"""
	try:
		# Without a timeout a stalled connection would hold the slot forever.
		response = requests.get(url=url, headers=headers, timeout=30)
		doc = pq(response.text)
		links = doc('#footer_review a').items()
		# Anchors without an href would contribute None; skip them.
		question_urls = [link.attr.href for link in links if link.attr.href]
	except Exception:
		# parse_page will not run, so its semaphore.release() never happens.
		semaphore.release()
		raise
	print(question_urls)
	if not question_urls:
		# Nothing to parse: give the slot back instead of letting parse_page
		# fail on an empty list.
		semaphore.release()
		return
	# The combined list of all sub-question URLs of this set.
	parse_page(question_urls)

拿到每个小题的url就好办了，这里我们要获取的是原文、问题、选项、副标题。

def parse_page(set_question_urls_list):
	"""Scrape one question set and append it to the TPO's text file.

	The title (first space-separated token of ``<title>``), the article and
	the subtitle are read from the first URL's page. The question text and
	option texts are read from every URL's page. The result is printed and
	passed to ``save_txt``.

	The module-level ``semaphore`` is always released on exit, including
	when a request or the parsing raises, so a failing set cannot leak the
	concurrency slot held by the calling worker thread.

	Args:
		set_question_urls_list: URLs of all sub-questions of one set. When
			empty, nothing is fetched or saved.

	Raises:
		requests.exceptions.RequestException: If a request fails or
			times out.
	"""
	try:
		if not set_question_urls_list:
			return

		title = article = subtitle = ''
		# Every [question, options] pair of this set, in URL order.
		set_que_plus_opt = []
		for index, url in enumerate(set_question_urls_list):
			# Without a timeout a stalled connection would hold the slot forever.
			response = requests.get(url=url, headers=headers, timeout=30)
			doc = pq(response.text)
			if index == 0:
				# Set-level fields come from the first page. Reusing this
				# response avoids downloading the first URL a second time.
				title = doc('title').text().split(' ')[0]
				article = doc('.article').text()
				subtitle = doc('.arrow .last_crumbs').text()
			question = doc('.left.text').text()
			options_list = [option.text() for option in doc('.ops.sec ').items()]
			set_que_plus_opt.append([question, options_list])

		print(title,'\n',article,'\n',set_que_plus_opt)

		save_txt(title,article,set_que_plus_opt,subtitle)
	finally:
		semaphore.release()

定义存储函数，这里将文档存储为txt，将每个tpo对应的所有大题存储成一个txt，方便查阅。

# Serialises appends: several worker threads may save sets that belong to the
# same TPO, and therefore to the same file, at the same time.
_SAVE_LOCK = threading.Lock()


def save_txt(title,article,set_que_plus_opt,subtitle):
	"""Append one question set to ``tpo_listening2/<title>.txt``.

	The appended record is: subtitle, a blank line, the article, 50
	newlines, the formatted questions and options, then 50 more newlines.

	Args:
		title: File name stem; all sets with the same title share one file.
		article: Listening transcript text.
		set_que_plus_opt: Sequence of ``[question, options]`` pairs, where
			``options`` is a list of option strings.
		subtitle: Heading written before the article.
	"""
	path = 'tpo_listening2'
	# exist_ok avoids the check-then-create race between worker threads.
	os.makedirs(path, exist_ok=True)

	# One "question\noption, option, ..." chunk per sub-question.
	chunks = [entry[0] + '\n' + ', '.join(entry[1]) for entry in set_que_plus_opt]
	li_str = '\n'.join(chunks)
	# Start every question number and option letter on its own line.
	for marker in ('1.', '2.', '3.', '4.', 'A. ', 'B. ', 'C. ', 'D. ', '5.', '6.'):
		li_str = li_str.replace(marker, '\n' + marker)

	record = ''.join((subtitle, '\n\n', article, '\n' * 50, li_str, '\n' * 50))

	# Explicit UTF-8 keeps non-ASCII text writable whatever the platform's
	# default encoding is. A single write under the lock keeps records from
	# different threads from interleaving.
	with _SAVE_LOCK:
		with open(f"{path}/{title}.txt", 'a', encoding='utf-8') as f:
			f.write(record)

最后定义主函数。开启多线程，BoundedSemaphore限制并发数。全部用时92.3s。

if __name__ == '__main__':
	# Step 1: URLs of every list page.
	urls = get_page()

	# Step 2: gather the set-page URLs of all list pages into one list.
	total_lis_set_urls = []
	for url in urls:
		# Guard against a list page that yields nothing (None or empty), so
		# extend() never receives None.
		listening_set_urls = get_tpo_url(url) or []
		total_lis_set_urls.extend(listening_set_urls)

	# Step 3: one worker thread per set, at most 20 running at a time. The
	# slot is acquired here and released by the worker (parse_page calls
	# semaphore.release()).
	semaphore = threading.BoundedSemaphore(20)
	record_threads = []
	for i in total_lis_set_urls:
		semaphore.acquire()
		t1 = threading.Thread(target=get_question_page,args=(i,))
		t1.start()
		record_threads.append(t1)

	# Wait for every worker so the script ends only after all sets are saved.
	for worker in record_threads:
		worker.join()

成果展示。