最近好久没写代码了,突然想热热手,于是就相中了起点中文网 (●ˇ∀ˇ●)。废话不多说,献上代码。我们先来分析一下起点中文网的月票榜页面:https://www.qidian.com/rank/yuepiao/year2022-month01/ 。正常操作:进入网站之后按 F12,点击 Network,如下图。我们需要先确定要爬取的内容,今天就爬取小说标题和月票数吧。**找到箭头所指的网址,点进去查看它的预览(Preview),查找之后发现没有我们要找的数据;再看数据是否在 Response 中:用 Ctrl+F 搜索"星门",会发现它就在这个响应里面。** 这样我们就得到了标题,获取标题的代码如下:
import random
import re

import requests
from fontTools.ttLib import TTFont
from lxml import etree
# Monthly-ticket ranking page (first page) whose titles are scraped below.
url = 'https://www.qidian.com/rank/yuepiao/year2022-month01/'
# Browser-like request headers.
# NOTE(review): the cookie looks like a value captured from one browser
# session - it may need to be refreshed if the site starts rejecting it.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36',
    'referer': 'https://www.qidian.com/rank/',
    'cookie': 'e1=%7B%22pid%22%3A%22qd_P_rank_01%22%2C%22eid%22%3A%22qd_C19%22%2C%22l1%22%3A4%7D; e2=%7B%22pid%22%3A%22qd_P_rank_01%22%2C%22eid%22%3A%22%22%2C%22l1%22%3A4%7D; _yep_uuid=fd95b6b7-090e-c6e5-cb8c-b8387e5b29ab; _ga=GA1.1.376581816.1643601078; newstatisticUUID=1643601078_1599172947; _csrfToken=m8mDkhtjc381bOHrIGiYTkE1g3bUzgPZjExmmO9l; _ga_FZMMH98S83=GS1.1.1643601077.1.1.1643601098.0; _ga_PFYW0QLV3P=GS1.1.1643601077.1.1.1643601098.0'
}
# Without a timeout, requests waits forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Stop on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
response_text = response.text
html_data = etree.HTML(response_text)
# Titles are the text of the <a> element nested directly in each <h2>.
title_list = html_data.xpath('//h2/a/text()')
print(title_list)
**运行代码,可以看到第一页的小说名字都出来了(以列表的形式)。** 当然,我们还要获得这些小说的月票数。可以看出,月票数没有直接显示出来,那我们先把这段未正常显示的内容拿到:
# Capture the inner text of every <span> that immediately follows a closing
# </style> tag; at this point the text is still the obfuscated form.
ticket_span_pattern = re.compile('</style><span class=".*?">(.*?)</span>')
re_data = ticket_span_pattern.findall(response_text)
print(re_data)
效果如下。可以看出,这跟网页上显示的不一样啊,这是啥呀?于是可以猜想:此月票数应该是进行了字体加密。为了验证此想法,在字体样式上找到了一个 src,并且此 src 还是动态的(心态崩了),每次进入此网页就会随机生成。以下是我在 Network 的 Font 分类下进行的对比。于是,获得动态字体 URL 的代码如下:
# Capture the URL that sits between the format('eot') declaration and the
# format('woff') marker in the page source.
font_url_matches = re.findall(r"format\('eot'\); src: url\('(.*?)'\) format\('woff'\)", response_text)
# Only the first match is used; an empty result raises IndexError here.
font_url = font_url_matches[0]
print(font_url)
然后,后面的思路就清晰了:直接用获得的字体包来解密源码中加密的数据就行了。
# Download the font referenced by the page and load it with fontTools.
# The timeout prevents an indefinite hang; raise_for_status() avoids feeding
# an error page to the font parser.
font_response = requests.get(font_url, headers=headers, timeout=10)
font_response.raise_for_status()
with open('jiemi.woff', 'wb') as f:
    f.write(font_response.content)
font_obj = TTFont('jiemi.woff')
# The XML dump is not read back by this script; it is written so the font
# tables can be inspected by hand.
font_obj.saveXML('jiemi.xml')
# Maps code point (int) -> glyph name.
cmap_dict = font_obj.getBestCmap()
print("字体加密映射表", cmap_dict)

# Reduce each obfuscated span to the list of digit runs it contains; these
# are the values compared against the cmap keys below.
re_data = [re.findall(r'\d+', span_text) for span_text in re_data]
print("去掉特殊符号", re_data)

# English glyph names -> the digit characters they stand for.
dict_e_a = {
    "one": '1', "two": '2', "three": '3', "four": '4', "five": "5", "six": '6', "seven": "7", "eight": '8', "nine": '9',
    "zero": '0'
}
# Glyph names that are not listed in dict_e_a are kept unchanged.
cmap_dict = {code: dict_e_a.get(glyph, glyph) for code, glyph in cmap_dict.items()}
print("替换成数字后的关系映射表", cmap_dict)

# Key the table by the string form of each code point so the extracted digit
# runs can be looked up directly instead of scanning the whole table.
digit_by_code = {str(code): digit for code, digit in cmap_dict.items()}
# A code that has no entry in the font's cmap is left as-is.
re_data = [[digit_by_code.get(code, code) for code in codes] for codes in re_data]
print("解析之后的月票数", re_data)

list_ = [''.join(digits) for digits in re_data]
print("最终的月票明文数据列表", list_)

# Pair titles and counts by position; zip stops at the shorter list instead
# of raising IndexError when fewer counts than titles were found.
rank_dict = dict(zip(title_list, list_))
print("最终的结果", rank_dict)
这样还不够,我又搞了个多页爬取。翻页不是很难,就是这个解密不是很好搞。观察第一页、第二页、第三页的 URL 有什么不同—— 第一页:https://www.qidian.com/rank/yuepiao/year2022-month01/ 第二页:https://www.qidian.com/rank/yuepiao/year2022-month01-page2/ 第三页:https://www.qidian.com/rank/yuepiao/year2022-month01-page3/ 发现规律后,完整翻页代码如下:
import random
import requests
import time
from lxml import etree
from fontTools.ttLib import TTFont
import re
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Browser-like request headers, built once instead of on every page.
# NOTE(review): the cookie looks like a value captured from one browser
# session - it may need to be refreshed if the site starts rejecting it.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36',
    'referer': 'https://www.qidian.com/rank/',
    'cookie': 'e1=%7B%22pid%22%3A%22qd_P_rank_01%22%2C%22eid%22%3A%22qd_C19%22%2C%22l1%22%3A4%7D; e2=%7B%22pid%22%3A%22qd_P_rank_01%22%2C%22eid%22%3A%22%22%2C%22l1%22%3A4%7D; _yep_uuid=fd95b6b7-090e-c6e5-cb8c-b8387e5b29ab; _ga=GA1.1.376581816.1643601078; newstatisticUUID=1643601078_1599172947; _csrfToken=m8mDkhtjc381bOHrIGiYTkE1g3bUzgPZjExmmO9l; _ga_FZMMH98S83=GS1.1.1643601077.1.1.1643601098.0; _ga_PFYW0QLV3P=GS1.1.1643601077.1.1.1643601098.0'
}

# English glyph names -> the digit characters they stand for.
dict_e_a = {
    "one": '1', "two": '2', "three": '3', "four": '4', "five": "5", "six": '6', "seven": "7", "eight": '8', "nine": '9',
    "zero": '0'
}


def build_page_url(page):
    """Return the ranking URL for the zero-based page index *page*.

    The first page has no ``-pageN`` suffix; every later page uses the
    one-based page number.
    """
    if page == 0:
        return 'https://www.qidian.com/rank/yuepiao/year2022-month01/'
    return f'https://www.qidian.com/rank/yuepiao/year2022-month01-page{page + 1}/'


def extract_code_points(span_texts):
    """Return, for each obfuscated span text, the list of digit runs in it."""
    return [re.findall(r'\d+', span_text) for span_text in span_texts]


def glyphs_to_digits(cmap_dict):
    """Return a copy of *cmap_dict* with glyph names replaced by digits.

    Glyph names that are not listed in ``dict_e_a`` are kept unchanged.
    """
    return {code: dict_e_a.get(glyph, glyph) for code, glyph in cmap_dict.items()}


def decode_code_points(code_lists, cmap_dict):
    """Translate lists of code strings through *cmap_dict*.

    *cmap_dict* is keyed by code point; keys are compared by their ``str()``
    form. A code with no entry is returned unchanged.
    """
    # One dict lookup per code instead of scanning the whole table each time.
    digit_by_code = {str(code): digit for code, digit in cmap_dict.items()}
    return [[digit_by_code.get(code, code) for code in codes] for codes in code_lists]


def load_font_cmap(font_url):
    """Download the font at *font_url* and return its best cmap.

    Writes ``jiemi.woff`` (the raw font) and ``jiemi.xml`` (an XML dump that
    is not read back here; it is there for manual inspection).

    Raises:
        requests.HTTPError: if the font download returns a 4xx/5xx status.
    """
    font_response = requests.get(font_url, headers=headers, timeout=REQUEST_TIMEOUT)
    font_response.raise_for_status()
    with open('jiemi.woff', 'wb') as f:
        f.write(font_response.content)
    font_obj = TTFont('jiemi.woff')
    font_obj.saveXML('jiemi.xml')
    return font_obj.getBestCmap()


def scrape_page(page):
    """Fetch one ranking page and return ``{title: monthly_ticket_count}``.

    Raises:
        requests.HTTPError: if the page or font request returns 4xx/5xx.
        RuntimeError: if no font URL can be found in the page source.
    """
    url = build_page_url(page)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response_text = response.text

    # Titles are the text of the <a> element nested directly in each <h2>.
    title_list = etree.HTML(response_text).xpath('//h2/a/text()')
    print(title_list)

    # Still-obfuscated text of each <span> that follows a </style> tag.
    re_data = re.findall('</style><span class=".*?">(.*?)</span>', response_text)
    print(re_data)

    font_urls = re.findall(r"format\('eot'\); src: url\('(.*?)'\) format\('woff'\)", response_text)
    if not font_urls:
        # Name the failing page instead of surfacing a bare IndexError.
        raise RuntimeError(f"no woff font URL found in {url}")

    cmap_dict = load_font_cmap(font_urls[0])
    print("字体加密映射表", cmap_dict)

    re_data = extract_code_points(re_data)
    print("去掉特殊符号", re_data)

    cmap_dict = glyphs_to_digits(cmap_dict)
    print("替换成数字后的关系映射表", cmap_dict)

    re_data = decode_code_points(re_data, cmap_dict)
    print("解析之后的月票数", re_data)

    list_ = [''.join(digits) for digits in re_data]
    print("最终的月票明文数据列表", list_)

    # Pair by position; zip stops at the shorter list instead of raising
    # IndexError when fewer counts than titles were found.
    return dict(zip(title_list, list_))


def main():
    """Prompt for a page count and scrape that many ranking pages in order."""
    pages = int(input('请输入要查询的页数'))
    for page in range(pages):
        rank_dict = scrape_page(page)
        print(f"第{page+1}页最终的结果", rank_dict)
        print('-'*50)
        # Random 1-2 second pause between pages.
        time.sleep(random.randint(1, 2))


if __name__ == "__main__":
    main()
效果如下:
喜欢此文章的可以点个关注,后续会发布更多好文章 (●'◡'●)
|