Reference links
CSDN
Learning materials
XPath - Runoob tutorial; Regular expressions - Runoob tutorial; Online regex tester - Runoob tutorial
Code
"""
Created on Tue Sep 7 21:16:33 2021
@author: DELL
"""
'''
def pachong():
headers = {
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
# 所有股票的table
table_url = 'http://summary.jrj.com.cn/hybk/400115934.shtml'
page_data = requests.get(table_url).content.decode("gbk")
data_tree = etree.HTML(page_data)
# 爬取“证券代码”得到一个codes_list
print(data_tree)
if page_data.find("jrj-topDiv highlightedCol"):
codes_list = data_tree.xpath("//*[@class=\"jrj-topDiv\"]/a/@href")
print(codes_list)
# 对codes_list中的每一个code爬取对应的news_url
#news_url = 'http://stock.jrj.com.cn/share,' + code + ',ggxw.shtml'
# 读取系统当前日期,得到date_range = [当前日期-1年,当前日期]
# news_url的new_list里,爬取每个<li>里面的<span>和<i>
# <span>中的href是新闻内容的url,即news_content_url
# <i>的文本就是日期,要在date_range区间里
# 对每个news_content_url爬取title,date,content,origin,得到一个字典dic
# 将dic写入数据库
#pachong()
'''
import requests
import json
import re
import datetime
from dateutil.relativedelta import relativedelta
from lxml import etree
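# Pipeline overview: spider() fetches the sector's quote list (a JavaScript
# snippet, not plain JSON), text_2_dict() parses it into a dict,
# dict_2_codes_list() extracts the codes and names, and capture_page()
# crawls each stock's news index pages.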
def spider():
    '''
    Main entry point of the crawler.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    url = 'http://q.jrjimg.cn/?q=cn|s|bk400115934&c=m&n=hqa&o=pl,d&p=1050&_dc=1631090941576'
    r = requests.get(url, headers=headers)
    text = r.text
    data = text_2_dict(text)
    codes_list, stock_names = dict_2_codes_list(data)
    code_news_dic = capture_page(codes_list, stock_names)
    return code_news_dic
def text_2_dict(text):
    # The endpoint returns a JavaScript assignment ('var hqa={...};'), not
    # valid JSON: strip the wrapper, then quote the bare object keys.
    text = text.replace('\n', '')
    text = text.replace('var hqa=', '')
    text = text.replace(';', '')
    pattern = r'(\w+)(:)'
    text = re.sub(pattern, lambda m: '"' + m.group(1) + '"' + m.group(2), text)
    data = json.loads(text)
    return data
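# A minimal before/after sketch of the conversion (the payload below is a
# made-up stand-in with the same general shape as the real response):
# >>> text_2_dict('var hqa={Column:["code","name"],HqData:[["1","600000","XX"]]};')
# {'Column': ['code', 'name'], 'HqData': [['1', '600000', 'XX']]}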
def dict_2_codes_list(data):
    # In each HqData row, index 1 is the stock code and index 2 the name.
    lists = data['HqData']
    codes_list = []
    stock_names = []
    for l in lists:
        codes_list.append(l[1])
        stock_names.append(l[2])
    return codes_list, stock_names
def my_zip(a, b):
    r = []
    for i in range(len(a)):
        r.append([a[i], b[i]])
    return r
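# Equivalent one-liner; the inner lists (rather than zip's tuples) matter
# because my_filter later rewrites item[0] in place:
# def my_zip(a, b):
#     return [list(pair) for pair in zip(a, b)]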
def page_2_list(Begin_date, page_data, data_tree):
    news_date_href_li = []
    Min_date = datetime.datetime.now()
    # str.find returns -1 when absent, so compare explicitly; the marker is
    # the 'newlist' class used by the XPath expressions below.
    if page_data.find("newlist") != -1:
        news_title_li = data_tree.xpath("//ul[@class='newlist']/li/span/a/text()")
        news_href_li = data_tree.xpath("//ul[@class='newlist']/li/span/a/@href")
        news_date_li = data_tree.xpath("//ul[@class='newlist']/li/i/text()")
        news_date_href_li = my_zip(news_date_li, news_href_li)
        # Drop the daily "dragon-tiger list" ranking items, normalize the
        # dates, and get the oldest date on this page.
        pattern = '【龙虎榜】'
        Min_date = my_filter(pattern, Begin_date, news_title_li, news_date_href_li)
    return news_date_href_li, Min_date
def getDateBegin():
    # Only news from the last year is kept.
    date_now = datetime.datetime.now()
    earliest_date = date_now - relativedelta(years=1)
    return earliest_date
def my_filter(pattern, Begin_date, news_title_li, news_date_href_li):
    Min_date = datetime.datetime.now()
    # First pass: delete items whose title contains the pattern. The index
    # only advances when the current item is kept.
    idx = 0
    n = len(news_title_li)
    while idx < n:
        title = news_title_li[idx]
        if title.find(pattern) != -1:
            del news_date_href_li[idx]
            del news_title_li[idx]
            n -= 1
        else:
            idx += 1
    # Second pass: normalize every date to '%Y-%m-%d' and track the oldest.
    for idx, item in enumerate(news_date_href_li):
        t = item[0]
        date_str = t.split(' ')[0]
        date_ = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        if date_ < Min_date:
            Min_date = date_
        item[0] = date_.strftime('%Y-%m-%d')
    filter_old(Begin_date, news_date_href_li)
    return Min_date
def filter_old(Begin_date, news_date_href_li):
    idx = 0
    n = len(news_date_href_li)
    while idx < n:
        item = news_date_href_li[idx]
        date_ = item[0]
        if datetime.datetime.strptime(date_, '%Y-%m-%d') < Begin_date:
            del news_date_href_li[idx]
            n -= 1
        else:
            idx += 1
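# An equivalent rewrite without the index bookkeeping (a sketch; the slice
# assignment keeps mutating the caller's list, which filter_old relies on):
# def filter_old(Begin_date, news_date_href_li):
#     news_date_href_li[:] = [
#         item for item in news_date_href_li
#         if datetime.datetime.strptime(item[0], '%Y-%m-%d') >= Begin_date
#     ]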
def capture_page(codes_list, stock_names):
    codes_names_list = my_zip(codes_list, stock_names)
    code_news_dic = {}
    for code, name in codes_names_list:
        url = 'http://stock.jrj.com.cn/share,' + code + ',ggxw.shtml'
        print('Start crawling: {}'.format(url))
        page_data = requests.get(url).content.decode("gbk")
        data_tree = etree.HTML(page_data)
        Begin_date = getDateBegin()
        code_news_dic[code] = {}
        code_news_dic[code]['content'], Min_date = page_2_list(Begin_date, page_data, data_tree)
        code_news_dic[code]['name'] = name
        if Min_date < Begin_date:
            # The first page already reaches past Begin_date; no need to
            # walk the paginated sub-pages.
            print('Min_date < Begin_date, skipping sub-pages')
            continue
        # Keep paging (ggxw_2.shtml, ggxw_3.shtml, ...) until the news gets
        # older than Begin_date or a page comes back empty.
        page_idx = 2
        while Min_date >= Begin_date:
            url_sub = 'http://stock.jrj.com.cn/share,' + code + ',ggxw_' + str(page_idx) + '.shtml'
            print('\tStart crawling: {}'.format(url_sub))
            page_data = requests.get(url_sub).content.decode("gbk")
            data_tree = etree.HTML(page_data)
            code_news_dic_append, Min_date = page_2_list(Begin_date, page_data, data_tree)
            if len(code_news_dic_append) == 0:
                break
            code_news_dic[code]['content'] += code_news_dic_append
            page_idx += 1
    return code_news_dic
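# Shape of the returned mapping (entries are illustrative, not real data):
# {
#     '600000': {
#         'name': '...',
#         'content': [['2021-09-06', 'http://stock.jrj.com.cn/...'], ...],
#     },
# }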
def capture_news(url):
    # Per-article scraper from the draft plan (not yet called by spider()):
    # pulls title, time, origin and body text from one news page.
    page_data = requests.get(url).content.decode("gbk")
    data_tree = etree.HTML(page_data)
    if page_data.find("page_newslib") != -1:
        news_title = data_tree.xpath("//div[@class='titmain']/h1/text()")
        news_time = data_tree.xpath("//div[@class='titmain']/p[@class='inftop']/span[1]/text()")
        news_origin = data_tree.xpath("//div[@class='titmain']/p[@class='inftop']/span[2]/text()")
        news_content = data_tree.xpath("//div[@class='texttit_m1']/*/text()")
        return {'title': news_title, 'time': news_time,
                'origin': news_origin, 'content': news_content}
    return None
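# Hypothetical wiring (not in the original code): following the draft plan,
# each stored href could be resolved into a full article dict:
# for code, info in code_news_dic.items():
#     for date_str, href in info['content']:
#         article = capture_news(href)  # then write `article` to storage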
if __name__ == '__main__':
    spider()