A note before we start: this blog is only a record of my personal learning progress. My knowledge is limited, so if you find any mistakes, please point them out in the comments. Everyone is welcome to come and discuss. (Some material comes from the internet; if anything infringes your rights, it will be removed immediately.)
Scraping Weibo's Recommended Feed and Latest Friend Posts in Python | Cookie Reuse | Simulated Login
Disclaimer
- The code is for learning purposes only; if it is reposted and used for any illegal activity, you bear the legal responsibility yourself.
- All of the code is original; reposting is not permitted, and unauthorized reposting constitutes infringement.
Background
Demo
Code Walkthrough
Imported Libraries
import requests
import time
import json
from selenium import webdriver
import csv
Cookie
Reference blog
- Here, Selenium is used to simulate the login.
- The function is as follows:
def get_cookies():
    # Open Weibo in a browser, log in by hand, then dump the session cookies to a local file
    driver = webdriver.Firefox()
    url = 'https://www.weibo.com/'
    driver.get(url)
    time.sleep(3)
    input('完成登陆后点击enter:')
    time.sleep(3)
    dictcookies = driver.get_cookies()
    # Flatten the list of cookie dicts into a "name=value;name=value" string for requests
    cookie = [item["name"] + "=" + item["value"] for item in dictcookies]
    cookiestr = ';'.join(item for item in cookie)
    print(cookiestr)
    with open('wbcookie.txt', 'w') as f:  # must match the file name that read_cookie() expects
        f.write(cookiestr)
    print('cookies保存成功!')
    driver.close()
- These lines convert the cookies obtained from the driver into the string format that requests can use directly, then save the string to a local file:
cookie = [item["name"] + "=" + item["value"] for item in dictcookies]
cookiestr = ';'.join(item for item in cookie)
with open('wbcookie.txt', 'w') as f:
    f.write(cookiestr)
print('cookies保存成功!')
driver.close()
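As a minimal sketch (not part of the original script), the saved string can also be parsed back into a dict and attached to a requests session, instead of being sent as a raw Cookie header; load_cookie_dict is a hypothetical helper name:
import requests

def load_cookie_dict(path='wbcookie.txt'):
    # Parse the "name=value;name=value" string written by get_cookies() into a dict
    with open(path, 'r', encoding='utf8') as f:
        raw = f.read().strip()
    return dict(pair.split('=', 1) for pair in raw.split(';') if pair)

session = requests.session()
session.cookies.update(load_cookie_dict())  # later session.get() calls will send these cookies
read_cookie() below simply reads the saved string back, falling back to a fresh login when the file is missing: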
def read_cookie():
    try:
        print("[INFO]:正常尝试读取本地cookie")
        with open('wbcookie.txt', 'r', encoding='utf8') as f:
            Cookies = f.read()
    except:
        print("[ERROR]:读取失败,请手动登录并更新")
        get_cookies()
        # Re-read the freshly saved cookie file and return its contents
        return read_cookie()
    return Cookies
Page Analysis
Crawling the Friends Timeline
- In the browser's network panel we find the request; clicking into it shows:
- https://weibo.com/ajax/feed/unreadfriendstimeline?list_id=100017243538816&refresh=4&since_id=0&count=10
- This is the request sent the first time the page is opened.
- On every refresh the list_id is different,
- but it turns out that removing list_id makes no difference.
- Then scroll the page down
- and capture the new request:
- https://weibo.com/ajax/feed/unreadfriendstimeline?list_id=100017243538816&refresh=4&max_id=1638885029309985&count=10
- Looking closely, it is the max_id parameter that moves us to the next page.
- Searching the first page's response carefully, we can find a max_id there.
- So the idea is to crawl the first page's posts together with its max_id,
- then rebuild the URL with that max_id to crawl the next batch,
- and repeat in a loop, as in the probe sketch and the get_friend_new() function below.
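As a quick sanity check (a minimal sketch, not part of the final script; it assumes a headers dict carrying the saved cookie, as set up in the main section), we can fetch the first page once and confirm that its response contains the max_id cursor and the list of posts:
import json
import requests

first_url = 'https://www.weibo.com/ajax/feed/unreadfriendstimeline?list_id=&refresh=4&since_id=0&count=10'
resp = requests.get(first_url, headers=headers)  # headers must include the Weibo cookie
page = json.loads(resp.text)
print(page['max_id'])         # pagination cursor to feed into the next request
print(len(page['statuses']))  # number of posts returned in this batch
get_friend_new() implements exactly this loop: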
def get_friend_new(number):
    max_id = 0
    results = []
    for k in range(number):
        if k == 0:
            url = 'https://www.weibo.com/ajax/feed/unreadfriendstimeline?list_id=&refresh=4&since_id=0&count=10'
        else:
            url = 'https://www.weibo.com/ajax/feed/unreadfriendstimeline?list_id=&refresh=4&max_id={}&count=10'.format(max_id)
        DATA = get_new(url)
        results.append(DATA[1])
        max_id = DATA[0]
    return results
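For example (a usage sketch; the variable name is just for illustration), this fetches five pages of roughly ten posts each, chaining each page's max_id into the next request:
friend_pages = get_friend_new(5)  # a list of 5 pages, each a list of parsed post rows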
Crawling the Recommended Feed
- The two pages work almost the same way.
- The difference is that here max_id simply increments from 0 in order,
- and max_id is the only thing that changes in the URL between requests.
def get_hot_new(number):
    results = []
    for k in range(number):
        url = 'https://www.weibo.com/ajax/feed/hottimeline?since_id=0&refresh=0&group_id=102803&containerid=102803&extparam=discover%7Cnew_feed&max_id={}&count=10'.format(k)
        DATA = get_new(url)
        results.append(DATA[1])
    return results
Crawling and Parsing
Code Walkthrough
- Since both feeds return responses with the same structure and are parsed the same way, the parsing is wrapped in a single function that both crawlers call.
results = []
r = rs.get(url, headers=headers)  # rs is a module-level requests.session()
r.encoding = 'utf-8'
str_r = r.text
dict_r = json.loads(str_r)
max_id = dict_r['max_id']
- Convert the response body into a dictionary
- and extract max_id from it.
- First print the stored values to inspect the structure,
- then print the key-value pairs of each post dictionary in turn, as in the sketch below.
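A small sketch of that inspection step (not part of the final script; it assumes dict_r has already been parsed as above):
# Print the top-level keys and the contents of the first post to see what is available
print(dict_r.keys())  # includes 'statuses' and 'max_id', among others
first_status = dict_r['statuses'][0]
for key, value in first_status.items():
    print(key, ':', value)
With the structure confirmed, the loop below walks every post in dict_r['statuses']: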
for i in dict_r['statuses']:
    data = []
    data.append(i['created_at'])
    data.append(i['user']['screen_name'])
    text = i['text_raw'].replace(u'\u200b', '')
    text = text.replace(u'\n', '')
    data.append(text)
    data.append(i['source'])
    data.append(i['reposts_count'])
    data.append(i['comments_count'])
    data.append(i['attitudes_count'])
    try:
        pic = []
        for dict_key, dict_value in i['pic_infos'].items():
            pic.append(dict_value['original']['url'])
    except:
        pass
    try:
        vid = []
        vid.append(i['url_struct'][0]['short_url'])
    except:
        pass
    if 'retweeted_status' in i:
        data.append('转发')
        data.extend(pic)
        data.extend(vid)
        pic = []
        vid = []
        results.append(data)
        data = []
        data.append(i['retweeted_status']['created_at'])
        data.append(i['retweeted_status']['user']['screen_name'])
        text = i['retweeted_status']['text_raw'].replace(u'\u200b', '')
        text = text.replace(u'\n', '')
        data.append(text)
        data.append(i['retweeted_status']['source'])
        data.append(i['retweeted_status']['reposts_count'])
        data.append(i['retweeted_status']['comments_count'])
        data.append(i['retweeted_status']['attitudes_count'])
        data.append('原创')
        try:
            pic = []
            for dict_key, dict_value in i['retweeted_status']['pic_infos'].items():
                pic.append(dict_value['original']['url'])
        except:
            pass
        try:
            vid = []
            vid.append(i['url_struct'][0]['short_url'])
        except:
            pass
        data.extend(pic)
        data.extend(vid)
        results.append(data)
    else:
        data.append('原创')
        data.extend(pic)
        data.extend(vid)
        results.append(data)
- Notice that when a post is a repost, the original post's information is nested inside it.
- So in the repost case:
- tag the row as '转发' (repost) and extend it with pic and vid,
- append data to results,
- reset data to an empty list,
- and repeat the extraction steps above on the nested retweeted_status,
- so the original post's information is crawled as well.
The code is as follows:
def get_new(url):
    results = []
    r = rs.get(url, headers=headers)
    r.encoding = 'utf-8'
    str_r = r.text
    dict_r = json.loads(str_r)
    max_id = dict_r['max_id']
    for i in dict_r['statuses']:
        data = []
        data.append(i['created_at'])
        data.append(i['user']['screen_name'])
        text = i['text_raw'].replace(u'\u200b', '')
        text = text.replace(u'\n', '')
        data.append(text)
        data.append(i['source'])
        data.append(i['reposts_count'])
        data.append(i['comments_count'])
        data.append(i['attitudes_count'])
        # Collect original-size image URLs, if the post carries pictures
        try:
            pic = []
            for dict_key, dict_value in i['pic_infos'].items():
                pic.append(dict_value['original']['url'])
        except:
            pass
        # Collect the short link (e.g. a video), if present
        try:
            vid = []
            vid.append(i['url_struct'][0]['short_url'])
        except:
            pass
        if 'retweeted_status' in i:
            # Repost: store this row tagged '转发', then extract the embedded original post as a second row
            data.append('转发')
            data.extend(pic)
            data.extend(vid)
            pic = []
            vid = []
            results.append(data)
            data = []
            data.append(i['retweeted_status']['created_at'])
            data.append(i['retweeted_status']['user']['screen_name'])
            text = i['retweeted_status']['text_raw'].replace(u'\u200b', '')
            text = text.replace(u'\n', '')
            data.append(text)
            data.append(i['retweeted_status']['source'])
            data.append(i['retweeted_status']['reposts_count'])
            data.append(i['retweeted_status']['comments_count'])
            data.append(i['retweeted_status']['attitudes_count'])
            data.append('原创')
            try:
                pic = []
                for dict_key, dict_value in i['retweeted_status']['pic_infos'].items():
                    pic.append(dict_value['original']['url'])
            except:
                pass
            try:
                vid = []
                vid.append(i['url_struct'][0]['short_url'])
            except:
                pass
            data.extend(pic)
            data.extend(vid)
            results.append(data)
        else:
            # Plain original post
            data.append('原创')
            data.extend(pic)
            data.extend(vid)
            results.append(data)
    for j in results:
        print(j)
    time.sleep(1)
    return [max_id, results]
- Since crawling the friends timeline also needs max_id,
- get_new() returns a list
- containing both max_id and the crawled results.
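A short usage sketch (the names are just for illustration; first_url is any of the timeline URLs built above):
max_id, rows = get_new(first_url)  # rows holds this page's parsed posts, max_id feeds the next request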
Writing to a File
def write(results):
    with open("爬取结果.csv", "a", encoding="gb18030", newline="") as csvfile:
        writer = csv.writer(csvfile)
        print("[INFO]正在写入csv文件中")
        for i in results:
            writer.writerows(i)
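Optionally (a sketch that is not part of the original script), a header row matching the order in which get_new() fills each row can be written first; the column names here are my own labels, and the trailing picture/video columns vary per row:
def write_with_header(results, path="爬取结果.csv"):
    header = ['created_at', 'screen_name', 'text', 'source',
              'reposts_count', 'comments_count', 'attitudes_count',
              'type(转发/原创)', 'pics/videos...']
    with open(path, "a", encoding="gb18030", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)  # written on every call; drop this line when appending repeatedly
        for page in results:
            writer.writerows(page)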
Main Function
if __name__ == "__main__":
    Cookies = read_cookie()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'Cookie': '{}'.format(Cookies)}
    results = get_hot_new(10)
    write(results)
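To crawl the friends timeline instead of the recommended feed, swap the call in the main block (everything else stays the same):
results = get_friend_new(10)  # 10 pages of the friends timeline instead of the hot feed
write(results)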
Full Code
from lxml import etree
import requests
import time
import json
from selenium import webdriver
import csv


def read_cookie():
    try:
        print("[INFO]:正常尝试读取本地cookie")
        with open('wbcookie.txt', 'r', encoding='utf8') as f:
            Cookies = f.read()
    except:
        print("[ERROR]:读取失败,请手动登录并更新")
        get_cookies()
        return read_cookie()
    return Cookies


def get_cookies():
    driver = webdriver.Firefox()
    url = 'https://www.weibo.com/'
    driver.get(url)
    time.sleep(3)
    input('完成登陆后点击enter:')
    time.sleep(3)
    dictcookies = driver.get_cookies()
    cookie = [item["name"] + "=" + item["value"] for item in dictcookies]
    cookiestr = ';'.join(item for item in cookie)
    print(cookiestr)
    with open('wbcookie.txt', 'w') as f:
        f.write(cookiestr)
    print('cookies保存成功!')
    driver.close()


rs = requests.session()


def get_hot_new(number):
    results = []
    for k in range(number):
        url = 'https://www.weibo.com/ajax/feed/hottimeline?since_id=0&refresh=0&group_id=102803&containerid=102803&extparam=discover%7Cnew_feed&max_id={}&count=10'.format(k)
        DATA = get_new(url)
        results.append(DATA[1])
    return results


def get_friend_new(number):
    max_id = 0
    results = []
    for k in range(number):
        if k == 0:
            url = 'https://www.weibo.com/ajax/feed/unreadfriendstimeline?list_id=&refresh=4&since_id=0&count=10'
        else:
            url = 'https://www.weibo.com/ajax/feed/unreadfriendstimeline?list_id=&refresh=4&max_id={}&count=10'.format(max_id)
        DATA = get_new(url)
        results.append(DATA[1])
        max_id = DATA[0]
    return results


def get_new(url):
    results = []
    r = rs.get(url, headers=headers)
    r.encoding = 'utf-8'
    str_r = r.text
    dict_r = json.loads(str_r)
    max_id = dict_r['max_id']
    for i in dict_r['statuses']:
        data = []
        data.append(i['created_at'])
        data.append(i['user']['screen_name'])
        text = i['text_raw'].replace(u'\u200b', '')
        text = text.replace(u'\n', '')
        data.append(text)
        data.append(i['source'])
        data.append(i['reposts_count'])
        data.append(i['comments_count'])
        data.append(i['attitudes_count'])
        try:
            pic = []
            for dict_key, dict_value in i['pic_infos'].items():
                pic.append(dict_value['original']['url'])
        except:
            pass
        try:
            vid = []
            vid.append(i['url_struct'][0]['short_url'])
        except:
            pass
        if 'retweeted_status' in i:
            data.append('转发')
            data.extend(pic)
            data.extend(vid)
            pic = []
            vid = []
            results.append(data)
            data = []
            data.append(i['retweeted_status']['created_at'])
            data.append(i['retweeted_status']['user']['screen_name'])
            text = i['retweeted_status']['text_raw'].replace(u'\u200b', '')
            text = text.replace(u'\n', '')
            data.append(text)
            data.append(i['retweeted_status']['source'])
            data.append(i['retweeted_status']['reposts_count'])
            data.append(i['retweeted_status']['comments_count'])
            data.append(i['retweeted_status']['attitudes_count'])
            data.append('原创')
            try:
                pic = []
                for dict_key, dict_value in i['retweeted_status']['pic_infos'].items():
                    pic.append(dict_value['original']['url'])
            except:
                pass
            try:
                vid = []
                vid.append(i['url_struct'][0]['short_url'])
            except:
                pass
            data.extend(pic)
            data.extend(vid)
            results.append(data)
        else:
            data.append('原创')
            data.extend(pic)
            data.extend(vid)
            results.append(data)
    for j in results:
        print(j)
    time.sleep(1)
    return [max_id, results]


def write(results):
    with open("爬取结果.csv", "a", encoding="gb18030", newline="") as csvfile:
        writer = csv.writer(csvfile)
        print("[INFO]正在写入csv文件中")
        for i in results:
            writer.writerows(i)


if __name__ == "__main__":
    Cookies = read_cookie()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'Cookie': '{}'.format(Cookies)}
    results = get_hot_new(10)
    write(results)