喜欢上了某个基金网站,但它的数据基本上都是以动态网页的方式加载的。对比了selenium、requests等方案后,最终选择了playwright;折腾了一天,基本上可以抓出相关数据了。 一、同步方案
from asyncio.windows_events import NULL
from playwright.sync_api import sync_playwright
import pandas as pd
import asyncio
import re
from playwright.async_api import async_playwright
# Base address of the fund site; per-fund page URLs are built by appending to it.
url = "http://fund.****.com.cn/"

# Proxy settings handed to the Playwright browser launch in get_index_info.
proxy_to_use = dict(server="127.0.0.1:80")
def get_index_info(str_code):
    """Fetch the raw text of the '.g-dialog' element(s) on one fund's page.

    Opens ``url + str_code + "/interduce.html#interduce"`` in a Chromium
    browser (through ``proxy_to_use``, with a visible window) and reads the
    inner text of every element matching the ``.g-dialog`` selector.

    Args:
        str_code: Fund code as a string; it is concatenated into the URL.

    Returns:
        A dict that is either empty (no matching element) or has the single
        key ``str_code`` mapped to the inner text of the LAST matching
        element -- earlier matches are overwritten.
    """
    values = {}
    code_url = url + str_code + "/interduce.html#interduce"
    with sync_playwright() as p:
        browser = p.chromium.launch(proxy=proxy_to_use, headless=False)
        try:
            page = browser.new_page()
            page.goto(code_url)
            for item in page.query_selector_all('.g-dialog'):
                # Each match replaces the previous one, so the last wins.
                values[str_code] = item.inner_text()
        finally:
            # Close the browser even when navigation or the query raises.
            browser.close()
    return values
def trim_multi_char(input_string):
    """Strip spaces and squeeze directly adjacent newlines.

    Every ``" "`` character is removed. A ``"\n"`` is removed only when the
    character immediately before it *in the input* is also ``"\n"``; newlines
    separated by spaces in the input are therefore both kept. All other
    characters pass through unchanged.

    Args:
        input_string: The text to clean.

    Returns:
        The cleaned string, or ``input_string`` itself when it is empty.
    """
    chars = list(input_string)
    if not chars:
        return input_string

    # Pair each character with its predecessor in the input (None for the
    # first character, which has no predecessor).
    predecessors = [None] + chars[:-1]
    kept = [
        current
        for previous, current in zip(predecessors, chars)
        if current not in (" ", "")
        and not (current == "\n" and previous == "\n")
    ]
    return "".join(kept)
async def async_get_index_info(code):
    """Asynchronously scrape key/value pairs from one fund's '.g-dialog' text.

    Opens ``url + str(code) + "/interduce.html#interduce"`` in a visible
    Chromium browser, reads the inner text of every element matching
    ``.g-dialog``, splits it on newlines and treats the lines as alternating
    key, value, key, value, ...

    Args:
        code: Fund code; converted with ``str()`` before being put in the URL.

    Returns:
        A dict mapping each key line to the value line that follows it. When
        the same key occurs more than once the last value wins.

    Raises:
        AssertionError: If an element's text splits into an odd number of
            lines, so keys and values cannot be paired.
    """
    values = {}
    code_url = url + str(code) + "/interduce.html#interduce"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            await page.goto(code_url)
            # query_selector_all resolves to a plain list, so it is iterated
            # with an ordinary ``for``; only the Playwright calls are awaited.
            all_items = await page.query_selector_all('.g-dialog')
            for item in all_items:
                text = await item.inner_text()
                vv = text.split("\n")
                assert (len(vv) % 2 == 0), f"code :{code} items项key->value不对称错误,请确认!"
                # Even positions are keys, odd positions are their values.
                for key, value in zip(vv[0::2], vv[1::2]):
                    values[key] = value
        finally:
            # Close the browser even when navigation or parsing fails.
            await browser.close()
    return values
def get_basket_index_raw_data(raw_codes):
    """Download the raw page text for every fund code in ``raw_codes``.

    Each code is converted to a string and left-padded with ``"0"`` to six
    characters (longer codes are left as they are), then fetched one at a
    time with ``get_index_info`` while a progress line is printed.

    Args:
        raw_codes: Iterable of fund codes (anything ``str()`` accepts).

    Returns:
        A list with one ``get_index_info`` result dict per code, in input
        order.
    """
    # rjust leaves strings that are already six or more characters untouched.
    codes = [str(_code).rjust(6, "0") for _code in raw_codes]

    data = []
    total = len(codes)
    # enumerate keeps the progress counter in step with the loop.
    for n, code in enumerate(codes, start=1):
        print(f" download code : {code} => 第 {n} 个, 总共 {total} 个")
        data.append(get_index_info(code))
    return data
def trim_data(raw_data):
    """Turn per-fund raw text into a column-oriented dict of value lists.

    For every ``{code: raw_text}`` entry, the text is cleaned with
    ``trim_multi_char`` and split on newlines; the resulting lines are read
    as alternating key, value, key, value, ... Each key becomes a column and
    each value is appended to that column. Pairs with an empty key are
    skipped, and any value containing ``"--"`` is stored as ``""``.

    Columns are kept the same length: a key missing for some fund is padded
    with ``""`` so the result can be passed straight to ``pd.DataFrame``.

    Args:
        raw_data: Iterable of dicts mapping a fund code to its raw text.

    Returns:
        A dict mapping each key to a list of values, all lists equal length.

    Raises:
        AssertionError: If a fund's cleaned text splits into an odd number of
            lines, so keys and values cannot be paired.
    """
    data = {}
    rows_filled = 0  # common length of every column after the previous fund
    for dict_code in raw_data:
        for code, raw in dict_code.items():
            contents = trim_multi_char(raw).split("\n")
            assert (len(contents) % 2 == 0), f"code :{code} items项key->value不对称错误,请确认!"
            # Even positions are keys, odd positions are their values.
            for _key, value in zip(contents[0::2], contents[1::2]):
                if _key == "":
                    continue
                if "--" in value:
                    value = ""
                if _key not in data:
                    # A key first seen now was absent for all earlier funds.
                    data[_key] = [""] * rows_filled
                data[_key].append(value)
            # Pad columns this fund did not fill so rows stay aligned.
            if data:
                rows_filled = max(len(column) for column in data.values())
            for column in data.values():
                column.extend([""] * (rows_filled - len(column)))
    return data
# Driver script: read the fund codes, scrape each fund, and write the result.
# NOTE(review): both paths are hard-coded Windows paths and this runs at
# module import time.
index_fund_path = r"C:\Users\Desktop\index_funds.csv"
# The input CSV is read as GBK and must provide a ``codes`` column.
funds_df = pd.read_csv(index_fund_path,encoding="gbk")
raw_codes = list(funds_df.codes)
# One browser session is opened per code (see get_basket_index_raw_data).
raw_data = get_basket_index_raw_data(raw_codes)
dict_data = trim_data(raw_data)
# Each key of dict_data becomes a DataFrame column.
df = pd.DataFrame(dict_data)
df.to_csv(r"C:\Users\Desktop\index_pacong.csv",encoding="gbk")
输出csv样例: 总体上还是成功了。
难点复盘: 1、playwright的安装:特别是在windows下安装,大概率需要折腾一番。 2、找到所需要的节点,比如:
all_items = page.query_selector_all('.g-dialog')
这个过程可能需要不断试。
二、后续改进:
1、异步 目前,异步方案还没有调通,因为playwright install还没有成功,浏览器是通过手动下载几个浏览器文件来安装的,估计版本还有问题。 关于playwright的安装,还是有不少烦心事,折腾了好几个小时。
可以参考安装资料
2、ip代理池 目前,还没有找到像requests库那样的代理池方案,还在研究中。在ip代理池弄好之前,只能做少量的测试,以免ip被封。 当然,这本身只是学习,不用于其它目的。
|