数据获取方案
data:image/s3,"s3://crabby-images/d784e/d784e2c0d3599f7b1539c342eeb4c0d72a630b5a" alt=""
浏览数据时,页面会自动发起数据请求(可通过 Python 模拟鼠标滚动实现自动化)
import json
import time

import pandas as pd
import progressbar
import pyautogui # pip install pyautogui
# Drive the mouse wheel repeatedly while a progress bar reports the elapsed
# time, the percentage done and the estimated time remaining.
widgets = [
    '进度:',
    progressbar.Timer(),
    ']',
    progressbar.Percentage(),
    progressbar.Bar(),
    '(',
    progressbar.ETA(),
    ')',
]
bar = progressbar.ProgressBar(widgets=widgets)

# 2 * 12 * 60 = 1440 scroll steps; with the 2-second pause below the whole
# run takes at least 48 minutes.
for i in bar(range(2 * 12 * 60)):
    # A negative amount scrolls the view downwards.
    pyautogui.scroll(-1000)
    # Pause before the next step (presumably to let the page issue and finish
    # its request -- confirm against the target site).
    time.sleep(2)
data:image/s3,"s3://crabby-images/d8248/d8248e0261a4e6d03190102c65b06a1702be8496" alt=""
将请求返回的数据另存为文件
data:image/s3,"s3://crabby-images/ee8eb/ee8ebf744498029ee9ce3a325c877b8183319b60" alt=""
数据解析
import re

# Read the saved capture as text. The context manager closes the handle even
# when reading or decoding raises, which the manual open/read/close did not.
with open('data/pharmsnap.zhihuiya.com.har', encoding='UTF-8') as file:
    text = file.read()

# Each match starts at a `"text":` key and runs to the end of that line
# ('.' does not cross newlines), provided the line also mentions offset,
# limit, total and items in that order.
pattern = re.compile('"text":.+offset.+limit.+total.+items.+', re.M)
data = pattern.findall(text)
data:image/s3,"s3://crabby-images/6e9f0/6e9f0c7be872511654169341b42c73f4f8f50eef" alt=""
数据处理
data:image/s3,"s3://crabby-images/e4623/e462381305732af9bdf7e177eb3c19e9ab0618c4" alt=""
?
data:image/s3,"s3://crabby-images/37482/374820d08cb79566f3a577cf22c0150404fbee5d" alt=""
功能封装
def To_DataFrame(item: str) -> "pd.DataFrame":
    """Convert one captured ``"text": "<escaped JSON>"`` line into a DataFrame.

    The value after the ``"text":`` key is a JSON string literal whose content
    is itself a JSON object with the keys ``offset``, ``limit``, ``total`` and
    ``items``.  It is decoded in two steps with the ``json`` module instead of
    patching the text and calling ``eval``: ``eval`` would execute whatever
    the captured response contains, fails on JSON ``null``, and the blanket
    ``true``/``false`` replacement also rewrote those letters inside ordinary
    string values.

    Args:
        item: A matched line beginning with ``"text":`` followed by the JSON
            string literal.  Anything after the literal (such as a trailing
            comma) is ignored.

    Returns:
        A DataFrame built from ``items`` with the response's ``offset``,
        ``limit`` and ``total`` values repeated in columns of the same name.

    Raises:
        json.JSONDecodeError: If either decoding step meets malformed JSON.
        KeyError: If the decoded object lacks one of the four expected keys.
    """
    # Everything after the first colon is the JSON string literal.
    encoded = item.partition(':')[2].lstrip()
    # raw_decode stops at the end of the literal, so trailing text is harmless.
    body, _ = json.JSONDecoder().raw_decode(encoded)
    dic = json.loads(body)

    df = pd.DataFrame(dic['items'])
    # Paging metadata is attached to every row of this response.
    for key in ('offset', 'limit', 'total'):
        df[key] = dic[key]
    return df
data:image/s3,"s3://crabby-images/e7580/e7580cc08573d363e34871d6b746f0e06520c52f" alt=""
# Parse every captured response first and concatenate once: calling pd.concat
# inside the loop re-copies the growing frame on each iteration and mixes an
# empty seed frame into the result.
frames = [To_DataFrame(item) for item in data]
# pd.concat rejects an empty list, so keep the empty-frame result for no data.
df = pd.concat(frames) if frames else pd.DataFrame()
?
data:image/s3,"s3://crabby-images/c2e5b/c2e5b29b2538c2fdf1ac9f9e7071340a6dbc0017" alt=""
保存数据
# Write the combined table to an Excel workbook under ./data. The DataFrame
# index is written as the first column because to_excel defaults to index=True.
df.to_excel('./data/pharmsnap.xlsx')
?
|