爬取糗事百科视频--自娱自乐
使用的库文件
import requests
import re
from pyquery import PyQuery
代码如下
import os
import re

import requests
# from lxml import etree
from pyquery import PyQuery
def send_requests(timeout=10):
    """Fetch the hard-coded Qiushibaike video listing page.

    Side effect: (re)binds the module-level globals ``url`` and ``heads``.
    ``heads`` is reused by ``down()`` for the per-video requests, so this
    function must run before any download is attempted.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The ``requests.Response`` for the listing page.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    global url, heads
    url = 'https://www.qiushibaike.com/video/page/4/'
    # Browser-like User-Agent sent with every request made by this script.
    heads = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
            'Core/1.70.3875.400 QQBrowser/10.8.4492.400'
        )
    }
    return requests.get(url, headers=heads, timeout=timeout)
'''<source src="//qiubai-video.qiushibaike.com/HT36G8LI1MBFJD8R_hd.mp4" type="video/mp4">'''
def parse(html=None):
    """Extract the MP4 URLs of the videos embedded in the listing page.

    Args:
        html: Page markup to scan. When omitted, the listing page is
            downloaded with ``send_requests()``.

    Returns:
        A list of absolute ``https:`` video URLs, in document order.
    """
    if html is None:
        html = send_requests().text
    # ``[^"]+`` keeps every match inside a single src attribute. The former
    # greedy ``.*`` ran on to the last matching tag of a line and merged
    # several <source> tags into one bogus URL.
    # The type attribute is accepted with either quote style and with or
    # without the self-closing slash, so both the raw markup and the
    # browser-normalised form of the tag are recognised.
    sources = re.findall(
        r'''<source\s+src="([^"]+)"\s+type=["']video/mp4["']\s*/?>''', html)
    # The src values are protocol-relative ("//host/path"), hence the prefix.
    return ['https:' + src for src in sources]
#xpath 方法
# def parse_name():
# res = send_requests()
# e =etree.HTML(res)
# name = e.xpath('//div[@class="conten"]/text()')
# lst = []
# for i in range(len(name)):
# lst.append(name[i])
# print(lst)
def name():
    """Collect the caption text of the posts on the listing page.

    Returns:
        A list with one entry per ``div.content span`` element of the page
        fetched by ``send_requests()``. Each entry is the list of
        whitespace-separated tokens of that span's text; a span without any
        leading text yields an empty list.
    """
    doc = PyQuery(send_requests().text)
    # An element's ``text`` is None when it has no leading text; fall back
    # to '' so ``split()`` cannot raise AttributeError on such spans.
    return [(span.text or '').split() for span in doc('div.content span')]
# #
#
def _video_filename(title, index):
    """Build a filesystem-safe base name for the video at position *index*."""
    if isinstance(title, (list, tuple)):
        # name() returns each caption as a list of tokens; join them instead
        # of using the list's repr ("['a', 'b']") as the file name.
        title = ' '.join(title)
    # Drop characters that are not allowed in file names (Windows set) plus
    # line breaks, which captions may contain.
    cleaned = re.sub(r'[\\/:*?"<>|\r\n]', '', str(title)).strip()
    # Fall back to a numbered name when no usable caption is left.
    return cleaned or 'video_{}'.format(index)


def down():
    """Download every video found by ``parse()`` into the ``void`` directory.

    Each file is named after the caption at the same position in ``name()``;
    when there is no caption for a video, or it is empty after sanitising,
    a numbered fallback name is used. Prints a line per processed video.
    """
    # name() must run first: it calls send_requests(), which defines the
    # global ``heads`` used for the video requests below.
    titles = name()
    # Create the target directory up front so open() cannot fail on it.
    os.makedirs('void', exist_ok=True)
    for index, video_url in enumerate(parse()):
        # There may be fewer captions than videos; avoid an IndexError.
        title = titles[index] if index < len(titles) else ''
        resp = requests.get(video_url, headers=heads, timeout=30)
        if resp.status_code != 200:
            # Do not save an HTTP error page under an .mp4 name.
            print("下载失败: " + video_url)
            continue
        path = os.path.join('void', _video_filename(title, index + 1) + '.mp4')
        with open(path, 'wb') as fi:
            fi.write(resp.content)
        print("下载完成")
def start():
    """Run the scraper: download every video on the listing page."""
    # down() performs its own parse(); the former stand-alone parse() call
    # here only triggered an extra page fetch whose result was discarded.
    down()
# print(send_requests())
# # print(parse())
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    start()
成果展示:
总结:
有bug
|