Import the required modules.
import requests
import random
from pyquery import PyQuery as pq
import re
import threading
import os
Set the User-Agent headers; I won't go into detail here.
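Since random is imported above, the headers presumably pick a User-Agent at random from a pool. The original omits this part, so here is a minimal sketch under that assumption (the UA strings are just examples):

# Hypothetical UA pool; random.choice picks one per run.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
]
headers = {'User-Agent': random.choice(USER_AGENTS)}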
Each tab here is a list page, so the first step is to collect the URLs of all the list pages.
pyquery selects every a node, items() yields them as a generator, and a list comprehension pulls out the hrefs and returns them as a list.
def get_page():
    url = 'https://top.zhan.com/toefl/listen/alltpo.html'
    response = requests.get(url=url, headers=headers)
    html = response.text
    doc = pq(html)
    # Select every <a> under the TPO tab list and pull out its href.
    page_urls = doc('.cssTopTitleList.clearfix a').items()
    urls = [i.attr.href for i in page_urls]
    print(urls)
    return urls
Next, a regular expression finds the URLs of every detail page (one per question set in each TPO) on each list page. findall returns all the captured URLs as a list.
def get_tpo_url(url):
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        html = response.text
        # Capture the href of every "学习/回顾" (study/review) link on the list page.
        url_pattern = re.compile(r'<a class="md_click sensors_maidian cssReview"[\s\S]*?href="(.*?)" target="_blank" event_type="E_5_14">学习/回顾</a>')
        urls = url_pattern.findall(html)
        # Return the URLs of every question-set page on this list page.
        print(urls)
        return urls
Opening a question set reveals another 5-6 sub-questions, each with its own URL. The next function collects the URLs of all sub-questions within a set and combines them into a single list.
# Collect every sub-question URL inside one set page; the resulting list
# of all sub-question URLs for that set is handed straight to parse_page().
def get_question_page(url):
    response = requests.get(url=url, headers=headers)
    html = response.text
    doc = pq(html)
    question_urls = doc('#footer_review a').items()
    question_urls = [a.attr.href for a in question_urls]
    print(question_urls)
    # All sub-question URLs of this set, combined into one list.
    parse_page(question_urls)
With every sub-question URL in hand, the rest is straightforward: from each page we extract the transcript, the question, the options, and the subtitle.
def parse_page(set_question_urls_list):
    # The transcript, title and subtitle are the same across the whole set,
    # so fetch them once from the first sub-question page.
    response = requests.get(url=set_question_urls_list[0], headers=headers)
    html = response.text
    doc = pq(html)
    title = doc('title').text()
    title = title.split(' ')[0]
    article = doc('.article').text()
    subtitle = doc('.arrow .last_crumbs').text()
    # Accumulates every (question + options) pair in this set.
    set_que_plus_opt = []
    for url in set_question_urls_list:
        response = requests.get(url=url, headers=headers)
        html = response.text
        doc = pq(html)
        question = doc('.left.text').text()
        options = doc('.ops.sec').items()
        options_list = [option.text() for option in options]
        question_plus_options = [question, options_list]
        set_que_plus_opt.append(question_plus_options)
    print(title, '\n', article, '\n', set_que_plus_opt)
    save_txt(title, article, set_que_plus_opt, subtitle)
    semaphore.release()
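One caveat: semaphore.release() only runs when every request in parse_page succeeds, so a raised exception would leave a permit held forever and eventually stall the thread pool. A minimal hardening sketch (my addition, not in the original) moves the release into a finally block:

def parse_page(set_question_urls_list):
    try:
        # ... all the scraping and save_txt() logic shown above ...
        pass
    finally:
        semaphore.release()  # always return the permit, even on error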
Now define the storage function. Everything is saved as txt, with all the question sets of one TPO appended into a single txt file for easy review.
def save_txt(title, article, set_que_plus_opt, subtitle):
    path = 'tpo_listening2'
    if not os.path.exists(path):
        os.mkdir(path)
    li = []
    for q_opt_set in set_que_plus_opt:
        q = ', '.join(q_opt_set[1])
        q_opt_set = q_opt_set[0] + '\n' + q
        li.append(q_opt_set)
    li_str = '\n'.join(li)
    # Put every question number and option letter on its own line.
    li_str = li_str.replace('1.', '\n1.').replace('2.', '\n2.').replace('3.', '\n3.') \
                   .replace('4.', '\n4.').replace('5.', '\n5.').replace('6.', '\n6.') \
                   .replace('A. ', '\nA. ').replace('B. ', '\nB. ') \
                   .replace('C. ', '\nC. ').replace('D. ', '\nD. ')
    with open(f"{path}/{title}.txt", 'a+', encoding='utf-8') as f:
        f.write(subtitle)
        f.write('\n\n')
        f.write(article)
        f.write('\n' * 50)
        f.write(li_str)
        f.write('\n' * 50)
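As a side note, the chain of str.replace calls above can be collapsed into a single regular expression; a roughly equivalent sketch (my variant, not the original code):

# Insert a newline before every question number 1.-6. and option letter A.-D.
li_str = re.sub(r'([1-6]\.|[A-D]\. )', r'\n\1', li_str)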
Finally, the main block. It spins up multiple threads, with a BoundedSemaphore capping concurrency at 20. The full run took 92.3s.
if __name__ == '__main__':
    urls = get_page()
    total_lis_set_urls = []
    for url in urls:
        # All set-question URLs on this list page.
        listening_set_urls = get_tpo_url(url)
        # Merge each list page's set URLs into the total.
        total_lis_set_urls.extend(listening_set_urls)
    # At most 20 scraping threads run at once; each thread releases
    # the semaphore in parse_page() when it finishes.
    semaphore = threading.BoundedSemaphore(20)
    record_threads = []
    for i in total_lis_set_urls:
        semaphore.acquire()
        t1 = threading.Thread(target=get_question_page, args=(i,))
        t1.start()
        record_threads.append(t1)
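Note that record_threads is collected but never joined, so the main thread may move on before the last workers finish. To reproduce a wall-clock figure like the 92.3s above, a minimal timing sketch (my addition, using the standard-library time module):

import time

start = time.time()
# ... start all the threads as above ...
for t in record_threads:
    t.join()  # wait for every worker to finish
print(f'Total elapsed: {time.time() - start:.1f}s')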
The results: