Third-party libraries required
requests and xlwt (install them with pip install requests xlwt); json, os, and time are part of the Python standard library.
Step 1:
Open the Kuaishou web client and log in. Find the user whose videos you want to download, go to their profile page, and press F12 to open the browser's developer tools. Refresh the page and click the XHR filter:
Step 2:
Under Name on the left, the requests called graphql are where the user and video data live. Their order varies, but most of the video information is usually in the second graphql entry.
Click Preview to see the video information. Now that we have found it, we just need to pull it out. Note, however, that every graphql request goes to the same URL, so a plain GET request cannot retrieve any of this data. We have to send a specific payload along with the request to get back what we need.
Everything we need to send can be found in that graphql request's Headers tab (at the very bottom).
!!! The Cookie value must be copied after logging in.
# Request headers
headers = {
    "Cookie":"",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
# Payload sent to the server
post = {
    "operationName":"visionProfilePhotoList",
    "variables":{
        "userId":userId,
        "pcursor":pcursor,
        "page":"profile"
    },
    "query":"query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n type\n author {\n id\n name\n following\n headerUrl\n headerUrls {\n cdn\n url\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n coverUrls {\n cdn\n url\n __typename\n }\n photoUrls {\n cdn\n url\n __typename\n }\n photoUrl\n liked\n timestamp\n expTag\n animatedCoverUrl\n stereoType\n videoRatio\n profileUserTopPhoto\n __typename\n }\n canAddComment\n currentPcursor\n llsid\n status\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
}
There is one catch here: the query copied from the original page contains characters (rendered as "?" below) that actually stand for line breaks.
query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {? visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {? result? llsid? webPageArea? feeds {? type? author {? id? name? following? headerUrl? headerUrls {? cdn? url? __typename? }? __typename? }? tags {? type? name? __typename? }? photo {? id? duration? caption? likeCount? realLikeCount? coverUrl? coverUrls {? cdn? url? __typename? }? photoUrls {? cdn? url? __typename? }? photoUrl? liked? timestamp? expTag? animatedCoverUrl? stereoType? videoRatio? profileUserTopPhoto? __typename? }? canAddComment? currentPcursor? llsid? status? __typename? }? hostName? pcursor? __typename? }?}?
Pasting it in as-is raises an error, so use the string .replace method (or swap them out one by one by hand) to turn each "?" into "\n".
str = "query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {? visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {? result? llsid? webPageArea? feeds {? type? author {? id? name? following? headerUrl? headerUrls {? cdn? url? __typename? }? __typename? }? tags {? type? name? __typename? }? photo {? id? duration? caption? likeCount? realLikeCount? coverUrl? coverUrls {? cdn? url? __typename? }? photoUrls {? cdn? url? __typename? }? photoUrl? liked? timestamp? expTag? animatedCoverUrl? stereoType? videoRatio? profileUserTopPhoto? __typename? }? canAddComment? currentPcursor? llsid? status? __typename? }? hostName? pcursor? __typename? }?}?"
str_end = str.replace("?",r"\n")
#结果为:
str_end = "query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n type\n author {\n id\n name\n following\n headerUrl\n headerUrls {\n cdn\n url\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n coverUrls {\n cdn\n url\n __typename\n }\n photoUrls {\n cdn\n url\n __typename\n }\n photoUrl\n liked\n timestamp\n expTag\n animatedCoverUrl\n stereoType\n videoRatio\n profileUserTopPhoto\n __typename\n }\n canAddComment\n currentPcursor\n llsid\n status\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
Step 3:
Fetch the data with requests.post, then convert the returned JSON text into a dictionary with json.loads() so the fields are easy to access.
FirstUrl = "https://www.kuaishou.com/graphql"
# Fetch the page data with a POST request
response = requests.post(url=FirstUrl,headers=headers,json=post)
# Convert the returned JSON text into a dictionary
html = json.loads(response.text)
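As an aside, requests can decode the JSON itself via response.json(), which is a little tidier than json.loads(response.text). A minimal sketch of the equivalent call, drilling into the field names from the payload above with fallbacks so a missing key yields an empty list instead of an exception:

html = response.json()
# Each .get falls back to an empty container if the key is absent or null
feeds = ((html.get("data") or {}).get("visionProfilePhotoList") or {}).get("feeds", [])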
Step 4:
After extracting the video URLs, download each video and save it into a folder we create.
Create the folder:
h = r"F:\快手\\"
if not os.path.exists(h):
os.mkdir(h)
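An equivalent one-liner, if you prefer: os.makedirs creates any missing parent directories, and exist_ok=True makes the separate existence check unnecessary:

import os
# A pre-existing folder is a no-op instead of an error
os.makedirs(h, exist_ok=True)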
Download the videos:
for url in GET_URL():
    response = requests.get(url)
    video = response.content
    # Walk the name list so each saved video gets its matching name
    for name in n:
        sum = sum + 1
        # Open the target file and write the video bytes
        with open(h + name + ".mp4", "wb") as f:
            f.write(video)
        # Report success or failure based on the HTTP status code
        if response.status_code == 200:
            print("Download " + str(sum) + " succeeded!")
        else:
            print("Download " + str(sum) + " failed!")
        # "for name in n:" always restarts at the first element, so after each
        # download remove the first name to advance to the next one
        n.pop(0)
        # Break out of the inner loop and move on to the next video
        break
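The same loop can be written more compactly by pairing each URL with its name via zip, which avoids the inner loop and the pop/break bookkeeping. A sketch that reuses h, GET_URL, and GET_NAME from above and assumes the two lists have equal length:

for i, (url, name) in enumerate(zip(GET_URL(), GET_NAME()), start=1):
    resp = requests.get(url)
    with open(h + name + ".mp4", "wb") as f:
        f.write(resp.content)
    # Report success or failure based on the HTTP status code
    print("Download " + str(i) + (" succeeded!" if resp.status_code == 200 else " failed!"))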
Step 5:
Fetch the comments (there are both top-level comments and replies to them; I won't break this down in detail here, the full source code follows).
Comments are stored in graphql responses as well, but you have to click into any one of the videos first, at which point new graphql requests appear:
Fetch the comments with the same method used for the video data. Here I will only show the payload that has to be sent to the URL (see the sketch below); POST it and process the JSON response the same way as before.
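For reference, the payload has the same shape as the one for the video list. A sketch, where photo_id is a placeholder for the video's "id" field taken from the profile feed, and the full query string appears in the source code below:

post_comment = {
    "operationName": "commentListQuery",
    "variables": {
        "photoId": photo_id,  # the "id" of the video whose comments we want
        "pcursor": ""
    },
    "query": "query commentListQuery($photoId: String, $pcursor: String) { ... }"  # full text in the source below
}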
The full source code:
# ------ Kuaishou data scraper ------
#coding=utf-8
import time
import requests
import os
import json
import xlwt
userId = input("Enter the userId: ")
pcursor = input("Enter the pcursor: ")
page = input("Enter an integer tag for the Excel file name: ")
# Create the download folder if it does not already exist
h = "F:\\快手\\"
if not os.path.exists(h):
    os.mkdir(h)
# Request headers
headers = {
    "Cookie":"kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_59821def1ca2acde8067d80822b7f86c; userId=2428747385; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABWIlXaYvq9DAtGoNKdgI3WBoWkDqkXMVZ1hOBmFke5aNG334pxKwEWXzRfHz1GvxeJtARKibURbo6S4PxirLF8n4XrsSLhon9VdfFjn35p7eyINL8GRFCeMd1wUhIFxxhk-LXkSwmwN1_lZnwnfAMlgimCPn7ZLqYJts9RxNxBrQg80cHSPYLIF2yIqkB2HtKsNhsJOZ0eBsIM204FwWRMRoScSev-lUTUQRM3QfVIiGGGpa1IiBFlVxlny2X_A6IIUiaC3mEBwsRlhIvfYUmU8M2OERVeygFMAE; kuaishou.server.web_ph=e882275ed97aff8bc584a342599794b5ad0d",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
# Payload sent to the server
post = {
    "operationName":"visionProfilePhotoList",
    "variables":{
        "userId":userId,
        "pcursor":pcursor,
        "page":"profile"
    },
    "query":"query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n type\n author {\n id\n name\n following\n headerUrl\n headerUrls {\n cdn\n url\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n coverUrls {\n cdn\n url\n __typename\n }\n photoUrls {\n cdn\n url\n __typename\n }\n photoUrl\n liked\n timestamp\n expTag\n animatedCoverUrl\n stereoType\n videoRatio\n profileUserTopPhoto\n __typename\n }\n canAddComment\n currentPcursor\n llsid\n status\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
}
FirstUrl = "https://www.kuaishou.com/graphql"
# Fetch the page data with a POST request
response = requests.post(url=FirstUrl,headers=headers,json=post)
# Convert the returned JSON text into a dictionary
html = json.loads(response.text)
# First level (a dict)
First_message = html.get("data")
# Second level (a dict)
Second_message = First_message.get("visionProfilePhotoList")
# Third level (a list)
Third_message = Second_message.get("feeds")
# Iterate over the feeds list and unpack each entry
# Get the video titles
def GET_NAME():
    a = 1
    Name_list = []
    for Onemessage in Third_message:
        Get_Video_name = Onemessage.get("photo")
        One_name = Get_Video_name.get("caption")
        # Titles may contain newline characters, which are not valid in
        # file names, so strip "\n" with replace
        One_name_1 = One_name.replace("\n", "")
        # De-duplicate titles: a repeated title would otherwise overwrite
        # the previously downloaded video
        if One_name_1 not in Name_list:
            Name_list.append(One_name_1)
        else:
            Name_list.append(One_name_1 + str(a))
            a = a + 1
    return Name_list
# Get the video URLs
def GET_URL():
    # Collect the URLs in an empty list
    Url_list = []
    for Onemessage in Third_message:
        Get_url = Onemessage.get("photo")
        Url = Get_url.get("photoUrl")
        Url_list.append(Url)
    # Return the list of URLs
    return Url_list
# Download the videos
def Download_video():
    # Counter
    sum = 0
    # Bind GET_NAME() to n so the same list object is reused across iterations
    n = GET_NAME()
    # Loop over the URLs and fetch each video with a GET request
    for url in GET_URL():
        response = requests.get(url)
        video = response.content
        # Walk the name list so each saved video gets its matching name
        for name in n:
            sum = sum + 1
            # Open the target file and write the video bytes
            with open(h + name + ".mp4", "wb") as f:
                f.write(video)
            # Report success or failure based on the HTTP status code
            if response.status_code == 200:
                print("Download " + str(sum) + " succeeded!")
            else:
                print("Download " + str(sum) + " failed!")
            # "for name in n:" always restarts at the first element, so after
            # each download remove the first name to advance to the next one
            n.pop(0)
            # Break out of the inner loop and move on to the next video
            break
def Get_realLikeCount():
    Like_list = []
    for datas in Third_message:
        Up_realLikeCount = datas.get("photo")
        # Collect each video's like count
        realLikeCount = Up_realLikeCount.get("realLikeCount")
        Like_list.append(realLikeCount)
    return Like_list
def Get_comment():
    headers_comment = {
        "Cookie": "kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_59821def1ca2acde8067d80822b7f86c; userId=2428747385; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABtI4bLUUjuMfNUEUaJXTxyRhtz5BDihYiVIwEfIm8ob0Tlb_wzKD7jpTK_sNDa9fBXVWbVX44OsbnSRsLW7IHwIY3LtBnPg-la24sXJ4B3QZXclUYyNNq-SFVmNC1ebXGV5_JusHsiNdwTcnmY_RAXUAzUoTjD3hNyeH9X7WP2h_mIofS3ifv-TEYybfVD35nh9G8UyBSfzKPLMcrDlyXIhoS7YoRGiN2PM_7zCD1Dj9m5oYoIiB9s5tmCfcl6X3tJNcAP9-YUs951NH3q3Vo1DE_E6PwVygFMAE; kuaishou.server.web_ph=7e511b27f7813874f11702ed1d7d0b73df13",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    # Empty lists to hold the scraped comment data
    First_list_names = []
    First_list_texts = []
    First_comment_distribution = []
    Second_comment_distribution = []
    # Empty lists to hold the replies to comments
    Second_list_names = []
    Second_list_texts = []
    # Loop over the videos and fetch their comments
    for s in Third_message:
        commentCount_url_1 = s.get("photo")
        # The video id that identifies its comment list
        commentCount_url = commentCount_url_1.get("id")
        post_comment = {
            "operationName": "commentListQuery",
            "variables": {
                "photoId": commentCount_url,
                "pcursor": ""
            },
            "query": "query commentListQuery($photoId: String, $pcursor: String) {\n visionCommentList(photoId: $photoId, pcursor: $pcursor) {\n commentCount\n pcursor\n rootComments {\n commentId\n authorId\n authorName\n content\n headurl\n timestamp\n likedCount\n realLikedCount\n liked\n status\n subCommentCount\n subCommentsPcursor\n subComments {\n commentId\n authorId\n authorName\n content\n headurl\n timestamp\n likedCount\n realLikedCount\n liked\n status\n replyToUserName\n replyTo\n __typename\n }\n __typename\n }\n __typename\n }\n}\n"
        }
        # POST to fetch the comment data
        response_comment = requests.post(url=FirstUrl,headers=headers_comment,json=post_comment)
        # Decode the JSON into a dict, then pull fields out with .get
        comment_html = json.loads(response_comment.text)
        comment_text_1 = comment_html.get("data")
        comment_text_2 = comment_text_1.get("visionCommentList")
        comments = comment_text_2.get("rootComments")
        # Store the number of comments on each video
        First_comment_distribution.append(len(comments))
        # Loop over the comments for each author name and content
        for comment in comments:
            First_comment_name = comment.get("authorName")
            First_comment_text = comment.get("content")
            comment_second_text = comment.get("subComments")
            # Store what we found
            First_list_names.append(First_comment_name)
            First_list_texts.append(First_comment_text)
            # Store the number of replies under this comment
            Second_comment_distribution.append(len(comment_second_text))
            # Loop over the replies
            for comment_second in comment_second_text:
                Second_comment_name = comment_second.get("authorName")
                Second_comment_text = comment_second.get("content")
                # Store the replies
                Second_list_names.append(Second_comment_name)
                Second_list_texts.append(Second_comment_text)
    # Return everything that was collected
    return First_list_names,First_list_texts,Second_list_names,Second_list_texts,First_comment_distribution,Second_comment_distribution
def Write_excel():
    write_type = list(Get_comment())
    write_name = write_type[0]
    write_text = write_type[1]
    # Flatten the usernames and comments into one alternating list
    write_lists = []
    for i in range(len(write_name)):
        write_lists.append(write_name[i])
        write_lists.append(write_text[i])
    # Re-group the flat list into [name, text] pairs, one per row
    n = 2
    write_list_end = [write_lists[p:p + n] for p in range(0, len(write_lists), n)]
    file = xlwt.Workbook()
    sheet1 = file.add_sheet(u'Comments', cell_overwrite_ok=True)
    rowTitle = [u'No.', u'Username', u'Comment']
    rowDatas = write_list_end
    # Write the header row
    for j in range(len(rowTitle)):
        sheet1.write(0, j, rowTitle[j])
    # Write each data row, prefixing it with its row number
    for k in range(len(rowDatas)):
        rowDatas[k].insert(0, k + 1)
        for y in range(len(rowDatas[k])):
            sheet1.write(k + 1, y, rowDatas[k][y])
    # xlwt produces the legacy .xls format, so save with an .xls extension
    file.save(r"F:\python爬虫\爬虫实战\快手" + "\\快手评论" + str(page) + ".xls")
def Pr_output():
    prtype = list(Get_comment())
    # Top-level comment usernames
    First_names = prtype[0]
    # Top-level comment contents
    First_texts = prtype[1]
    # Reply usernames
    Second_names = prtype[2]
    # Reply contents
    Second_texts = prtype[3]
    # Number of replies under each comment
    Second_nums = prtype[5]
    # Number of comments on each video
    sums = prtype[4]
    # Empty lists to hold each individual video's comments and usernames
    c = []
    d = []
    # Use sums (comments per video) to split the flat comment lists up by video
    for i in sums:
        '''Slice off each video's comments and usernames,
        delete the consumed slices so nothing is visited twice,
        and store the pieces in the lists created above.'''
        c.append(First_texts[0:i])
        del First_texts[0:i]
        d.append(First_names[0:i])
        del First_names[0:i]
    for j in range(len(GET_NAME())):
        # The comments and usernames belonging to video j
        names = d[j]
        texts = c[j]
        # Check whether there are any comments; names and texts are parallel
        # lists, so either one works here
        if names != []:
            print("Comments on video " + str(j + 1) + ":")
            # Print each comment together with its username
            for k in range(len(names)):
                print(names[k] + ": " + texts[k])
                # Second_nums[0] is the reply count of the current comment
                if Second_nums[0] == 0:
                    print("    (no replies)")
                else:
                    # x caps the number of replies printed at Second_nums[0]
                    x = 0
                    # Print each reply's username and content
                    for r in range(len(Second_texts)):
                        print("    reply - username: " + Second_names[r] + "  content: " + Second_texts[r])
                        x = x + 1
                        # Stop once the expected number of replies is printed
                        if x == Second_nums[0]:
                            break
                    # Delete what was just printed so it is not visited again
                    del Second_texts[0:x]
                    del Second_names[0:x]
                # Move on to the next comment's reply count
                Second_nums.pop(0)
        else:
            print("Video " + str(j + 1) + " has no comments")
        print("\n")
if __name__ == '__main__':
    # Record the start time
    start = time.perf_counter()
    # Call the functions
    GET_NAME()
    GET_URL()
    print("Downloading videos".center(100, "-"))
    Download_video()
    print("Download finished".center(100, "-"))
    print("\n")
    # Pause for two seconds before continuing
    time.sleep(2)
    Get_comment()
    print("Printing comments".center(100, "-"))
    Pr_output()
    print("Output finished".center(100, "-"))
    print("\n")
    print("Generating the Excel file".center(100,"-"))
    Write_excel()
    print("Excel file generated".center(100,"-"))
    # Total running time
    t = time.perf_counter() - start
    print("Total time: {:.2f} s".format(t))
Note:
A single graphql response carries only a limited number of videos. If the author has posted many, scroll the page down so more videos load; new graphql requests appear along with them, and the same method fetches those too. The same goes for comments! A cursor-based paging sketch follows below.
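A minimal paging sketch, assuming (as the payload above suggests) that the pcursor field in each response is what the next request's pcursor variable expects; the exact end-of-list sentinel is an assumption you should verify against your own responses:

pcursor = ""
all_feeds = []
while True:
    post["variables"]["pcursor"] = pcursor
    data = requests.post(url=FirstUrl, headers=headers, json=post).json()
    page_data = (data.get("data") or {}).get("visionProfilePhotoList") or {}
    all_feeds.extend(page_data.get("feeds") or [])
    pcursor = page_data.get("pcursor")
    # Stop when no cursor comes back ("no_more" is a guessed sentinel)
    if not pcursor or pcursor == "no_more":
        break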