学习《Python3 网络爬虫实践开发》第6章Ajax数据爬取
　　6.3节为用Python 来模拟Ajax 请求,把微博爬取下来,但因为时间原因,微博的页面参数已经调整过,书中的爬取过程也就不适应了,因此,按照教材的方法,对微博的Ajax请求进行分析,原page参数取消,转而使用since_id代替,since_id经分析为下一次AJAX请求响应结果列表第一个微博的ID(首页的Ajax请求链接无此参数)。
　　即首页url为 https://m.weibo.cn/api/container/getIndex?type=uid&value=2830678474&containerid=1076032830678474
　　下一次请求的url为 https://m.weibo.cn/api/container/getIndex?type=uid&value=2830678474&containerid=1076032830678474&since_id=["获取得到的since_id"]
　　那么这个since_id从哪里获取呢?经分析,它在响应数据data->cardlistInfo->since_id位置下,这个参数就是下一次请求第一个微博的ID。
#这里以第一页返回的数据为例
data: {,…}
cardlistInfo: {containerid: "1076032830678474", v_p: 42, show_style: 1, total: 2355, since_id: 4721851010385421}
containerid: "1076032830678474"
show_style: 1
since_id: 4721851010385421
total: 2355
v_p: 42
cards: [{card_type: 9, itemid: "1076032830678474_-_4725986917353698",…},…]
scheme: "sinaweibo://cardlist? containerid=1076032830678474&type=uid&value=2830678474&oid=4053125562625130&luicode=10000011&lfid=1005052830678474&v_p=42"
showAppTips: 0
　　其他参数获取位置同文章内容,在此不赘述。
　　经过调试,每次请求获取的微博数量不再是固定的10个了,而是小于等于10,这里需要注意!
　　代码如下:
import requests
from urllib.parse import urlencode,urlparse
import time
# Base endpoint of Weibo's mobile Ajax API; each request appends its own
# urlencoded query string to this prefix.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# Request headers that mimic the browser's own Ajax call:
# X-Requested-With marks the request as XHR, Referer points at the
# target user's profile page.
headers ={
'Host':'m.weibo.cn',
'Referer': 'https://m.weibo.cn/u/2830678474',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'
}
# Fetch the first page of the user's weibo list (the first Ajax request
# carries no since_id parameter).
def get_first_page():
    """Request the first Ajax page and return the parsed JSON dict.

    Returns:
        dict on success, None when the request fails or the server
        answers with a non-200 status code.
    """
    query = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
    }
    url = base_url + urlencode(query)
    try:
        # timeout keeps the crawler from hanging forever on a stalled connection
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except requests.RequestException as e:
        # RequestException also covers read timeouts, not only connection errors
        print('Get First Page Error', e.args)
    return None
# Fetch a subsequent page of the user's weibo list, identified by since_id.
def get_page(since_id):
    """Request one Ajax page that starts at *since_id*.

    Args:
        since_id: id of the first weibo of the requested page, taken from
            the previous response's data->cardlistInfo->since_id.

    Returns:
        dict on success, None when the request fails or the server
        answers with a non-200 status code.
    """
    query = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'since_id': since_id
    }
    url = base_url + urlencode(query)
    try:
        # verify=False removed: it disabled TLS certificate checking and was
        # inconsistent with get_first_page; timeout prevents indefinite hangs
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except requests.RequestException as e:
        # was 'Get First Page Error' — a copy-paste mistake from get_first_page
        print('Get Page Error', e.args)
    return None
# Extract the since_id needed to request the next page.
def get_since_id(json):
    """Return data->cardlistInfo->since_id from a response dict.

    The last page carries no since_id, so None is returned and the
    caller's crawl loop terminates. Missing 'data' or 'cardlistInfo'
    keys also yield None instead of raising AttributeError (the original
    chained .get() calls crashed on such responses).
    """
    if not json:
        return None
    return ((json.get('data') or {}).get('cardlistInfo') or {}).get('since_id')
# Extract the total number of weibos reported by the server.
def get_total_num(json):
    """Return data->cardlistInfo->total from a response dict.

    Returns None for an empty response or when the 'data'/'cardlistInfo'
    keys are missing (the original chained .get() calls raised
    AttributeError in that case).
    """
    if not json:
        return None
    return ((json.get('data') or {}).get('cardlistInfo') or {}).get('total')
from pyquery import PyQuery as pq
# Extract the weibo records from one Ajax response.
def parse_page(json):
    """Yield one dict per weibo card: id, text, attitudes, comments, reposts.

    Cards that are not weibos (card_type != 9, e.g. recommendation cards)
    have no 'mblog' field; they are skipped instead of crashing with
    AttributeError as the original code did.
    """
    if not json:
        return
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # non-weibo card — nothing to parse
            continue
        weibo = {}
        # leading apostrophe keeps spreadsheet software from rounding the long numeric id
        weibo['id'] = '\'' + str(mblog.get('id'))
        # pyquery strips the HTML markup embedded in the text field
        weibo['text'] = pq(mblog.get('text')).text()
        weibo['attitudes'] = mblog.get('attitudes_count')
        weibo['comments'] = mblog.get('comments_count')
        weibo['reposts'] = mblog.get('reposts_count')
        yield weibo
# Store results into a local MongoDB database.
from pymongo import MongoClient
# NOTE(review): MongoClient() targets the default localhost:27017 and is
# created at import time — a module-level side effect; operations fail
# later if no MongoDB server is running.
Client = MongoClient()
db = Client['weibo']  # database 'weibo'
collection = db['weibo']  # collection 'weibo'
def save_to_mongo(result):
    """Insert one weibo record into the MongoDB collection and report success.

    Collection.insert() was deprecated in PyMongo 3 and removed in
    PyMongo 4; insert_one() is the supported replacement and returns a
    truthy InsertOneResult.
    """
    if collection.insert_one(result):
        print('Save to Mongo')
#存储到CSV
import csv
import os
def save_to_csvfile(result):
    """Append one weibo record to weibo.csv in the current directory.

    The header row is written only when the file does not exist yet.
    The original version opened the file twice (once for the header,
    once for the row); a single open with a header flag does both.
    utf-8-sig lets Excel detect the encoding; newline='' prevents the
    csv module from emitting blank lines on Windows.
    """
    fieldnames = list(result.keys())
    # decide on the header before opening in append mode (which creates the file)
    write_header = not os.path.exists('./weibo.csv')
    with open('weibo.csv', 'a', encoding='utf-8-sig', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerow(result)
    print("Write Success")
if __name__ == '__main__':
    # Crawl all pages: the first request has no since_id; every later
    # request uses the since_id extracted from the previous response.
    # The original duplicated the fetch/parse/save logic for the first
    # page and the while-loop body; this single loop handles both. The
    # local variable is named data instead of json to avoid shadowing
    # the stdlib module name.
    data = get_first_page()
    count = 0
    while data:
        since_id = get_since_id(data)
        for result in parse_page(data):
            #save_to_mongo(result)  # save to MongoDB instead, if preferred
            save_to_csvfile(result)  # save to a CSV file
            count += 1
        print(count)
        # no since_id means the last page was reached (alternatively,
        # count could be compared against get_total_num(data))
        if not since_id:
            break
        time.sleep(0.5)  # throttle requests to avoid being rate-limited
        data = get_page(since_id)