Background
CSDN is home to many excellent technical writers whose articles draw large numbers of views, follows, and likes from fellow developers. Setting aside one's own technical specialty, how should the rest of us write a hit article? Should we write deep, hard-to-digest analysis, or beginner-friendly introductions? And which knowledge areas and technical topics do readers actually care about most?
Goal
By analyzing the titles on CSDN's hot-rank lists, work out how to write an eye-catching title and which hot words appear in popular articles.
Implementation
Tech Stack
- Scrapy for data crawling
- jieba for Chinese word segmentation
- wordcloud for generating word-cloud images
Preliminary Analysis
Using the browser's developer tools, the following URLs turn out to be the ones we need.
Site-wide overall hot rank
https://blog.csdn.net/phoenix/web/blog/hotRank?page=0&pageSize=20
Per-domain content hot rank
https://blog.csdn.net/phoenix/web/blog/hotRank?page=0&pageSize=20&child_channel=大数据
Weekly author rank
https://blog.csdn.net/phoenix/web/blog/weeklyRank?page=0&pageSize=20
Overall author rank
https://blog.csdn.net/phoenix/web/blog/allRank?page=0&pageSize=20
From the weekly and overall author ranks we can get each popular author's homepage; opening a homepage and sorting its articles by view count gives us that author's most-viewed articles.
Most-viewed articles of a popular blogger
https://blog.csdn.net/qq_37751911/?orderby=ViewCount
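Before writing any spiders, it is worth sanity-checking one of these endpoints. The snippet below is only a quick check, not part of the crawler; the JSON fields it reads (code, data, articleTitle, viewCount) are the ones the spiders later consume, and the User-Agent header is a precaution, since the API may additionally expect cookies or other headers.

# Quick sanity check of the hot-rank API (not part of the crawler itself)
import requests

url = "https://blog.csdn.net/phoenix/web/blog/hotRank?page=0&pageSize=20"
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
data = resp.json()
if data.get("code") == 200:
    for article in data["data"]:
        print(article["articleTitle"], article["viewCount"])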
Content Crawling
Creating the Spiders
Create six spiders:
- SummaryArticleSpider: crawls the article list of the site-wide overall hot rank
- ContentArticleSpider: crawls the article lists of the per-domain content hot ranks
- WeekAuthorSpider: crawls the weekly-rank author list, including homepage links
- WeekArticleSpider: visits each weekly-rank author's homepage and crawls their most-viewed articles
- AllAuthorSpider: crawls the overall-rank author list, including homepage links
- AllArticleSpider: visits each overall-rank author's homepage and crawls their most-viewed articles
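Only ContentArticleSpider and WeekArticleSpider are reproduced in full below. For the author-list spiders, a minimal sketch of WeekAuthorSpider following the same project conventions might look like the following; note that the JSON field names nickName and userName are assumptions, not verified against the actual weeklyRank response.

# WeekAuthorSpider (sketch; the field names used in parse() are assumptions)
import json
import os

import scrapy

class WeekAuthorSpider(scrapy.Spider):
    name = 'week_author'
    allowed_domains = ['blog.csdn.net']
    start_url = 'https://blog.csdn.net/phoenix/web/blog/weeklyRank?page={0}&pageSize={1}'
    rank_type = RANK_TYPES[2]  # project-level constant, as in the other spiders

    def __init__(self):
        parent_dir = "{0}/{1}".format(BASE_OUTPUT_DIR, CURRENT_DATE)
        if not os.path.isdir(parent_dir):
            os.mkdir(parent_dir)
        with open("{0}/{1}.csv".format(parent_dir, self.rank_type), mode="w") as f:
            f.write("博主,主页链接\n")  # blogger, homepage URL

    def start_requests(self):
        for page_num in range(MAX_PAGE_NUM):
            yield scrapy.Request(self.start_url.format(page_num, PAGE_SIZE),
                                 callback=self.parse,
                                 dont_filter=True)

    def parse(self, response):
        response_json = json.loads(response.text)
        if response_json['code'] == 200:
            for author in response_json['data']:
                # 'nickName' and 'userName' are assumed field names; inspect the real response first.
                yield {
                    "rankType": self.rank_type,
                    "nickName": author.get('nickName'),
                    "blogUrl": "https://blog.csdn.net/{0}".format(author.get('userName'))
                }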
# ContentArticleSpider
import json
import logging
import os
import urllib.parse

import scrapy

# RANK_TYPES, BASE_OUTPUT_DIR, CURRENT_DATE, MAX_PAGE_NUM and PAGE_SIZE are
# project-level constants; their definitions are omitted here.

class ContentArticleSpider(scrapy.Spider):
    name = 'content'
    allowed_domains = ['blog.csdn.net']  # domains only, not full URLs
    start_url = 'https://blog.csdn.net/phoenix/web/blog/hotRank?page={0}&pageSize={1}&child_channel={2}'
    content_types = ["c/c++", "java", "javascript", "php", "python", "人工智能", "区块链", "大数据",
                     "移动开发", "嵌入式", "开发工具", "数据结构与算法", "测试", "游戏", "网络", "运维"]
    rank_type = RANK_TYPES[1]

    def __init__(self):
        # Create today's output directory and write the CSV header.
        parent_dir = "{0}/{1}".format(BASE_OUTPUT_DIR, CURRENT_DATE)
        if not os.path.isdir(parent_dir):
            os.mkdir(parent_dir)
            logging.info("Directory {0} created".format(parent_dir))
        article_file_name = "{0}/{1}.csv".format(parent_dir, self.rank_type)
        with open(article_file_name, mode="w") as f:
            f.write("知识领域,标题,评论数,收藏数,浏览数,文章链接\n")  # domain, title, comments, favorites, views, article URL

    def start_requests(self):
        # One request per domain per page of the hot-rank API.
        for content_type in self.content_types:
            for page_num in range(MAX_PAGE_NUM):
                yield scrapy.Request(self.start_url.format(page_num, PAGE_SIZE, content_type.lower()),
                                     callback=self.parse,
                                     dont_filter=True)

    def parse(self, response):
        response_str = str(response.body, 'utf-8')
        response_json = json.loads(response_str)
        # The domain is the last query parameter of the request URL.
        content_type = urllib.parse.unquote(str(response.url).split("=")[-1])
        if response_json['code'] == 200:
            articles = response_json['data']
            for article in articles:
                article_title = article['articleTitle']
                article_url = article['articleDetailUrl']
                comment_count = article['commentCount']
                favor_count = article['favorCount']
                view_count = article['viewCount']
                yield {
                    "rankType": self.rank_type,
                    "contentType": content_type,
                    "articleTitle": article_title,
                    "articleUrl": article_url,
                    "commentCount": comment_count,
                    "favorCount": favor_count,
                    "viewCount": view_count
                }
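The spiders only yield item dicts; the item pipeline that appends them to the CSV files created in each spider's __init__ is not shown in this article. A minimal sketch of such a pipeline follows; the class name CsvWriterPipeline and the "write everything except rankType" convention are assumptions, and the pipeline would be enabled through ITEM_PIPELINES in settings.py.

# Sketch of a CSV-writing item pipeline (not the article's original pipeline)
class CsvWriterPipeline:
    def process_item(self, item, spider):
        parent_dir = "{0}/{1}".format(BASE_OUTPUT_DIR, CURRENT_DATE)
        file_name = "{0}/{1}.csv".format(parent_dir, item["rankType"])
        # Append every field except rankType; a real pipeline would map fields
        # to the exact column order of the CSV header.
        values = [str(v) for k, v in item.items() if k != "rankType"]
        with open(file_name, mode="a") as f:
            f.write(",".join(values) + "\n")
        return item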
# WeekArticleSpider
import logging
import os

import scrapy
from scrapy.selector import Selector
from retry import retry  # third-party 'retry' package

class WeekArticleSpider(scrapy.Spider):
    name = 'week_article'
    allowed_domains = ['blog.csdn.net']
    start_url = 'https://blog.csdn.net/{0}'  # unused below; homepage URLs come from the weekly author CSV
    urls = {}
    rank_type = RANK_TYPES[4]

    def __init__(self):
        parent_dir = "{0}/{1}".format(BASE_OUTPUT_DIR, CURRENT_DATE)
        author_file_name = "{0}/{1}.csv".format(parent_dir, RANK_TYPES[2])
        if not os.path.isdir(parent_dir):
            os.mkdir(parent_dir)
            logging.info("Directory {0} created".format(parent_dir))
        # Read the weekly author CSV (blogger, homepage URL) and sort each homepage by view count.
        with open(author_file_name, mode='r') as lines:
            next(lines)  # skip the header row
            for line in lines:
                self.urls[line.split(",")[0]] = "{0}?orderby=ViewCount".format(line.split(",")[1].strip())
        article_file_name = "{0}/{1}.csv".format(parent_dir, self.rank_type)
        with open(article_file_name, mode="w") as f:
            f.write("博主,码龄,标题,评论数,浏览数,文章链接\n")  # blogger, coding age, title, comments, views, article URL

    def start_requests(self):
        for user_name, url in self.urls.items():
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 dont_filter=True)

    @retry(tries=3, delay=5000)
    def parse(self, response):
        response_html = Selector(response)
        try:
            # Coding age and nickname come from the profile sidebar.
            age = response_html.xpath('//*[@id="asideProfile"]/div[1]/div[2]/div[2]/span[1]/text()').get().replace("码龄", "")
            nick_name = response_html.xpath('//*[@id="uid"]/span/text()').get()
            # One card per article in the blog list.
            card_coms = response_html.xpath('//*[@id="articleMeList-blog"]/div[2]/div[*]')
            for card_com in card_coms:
                title = "".join(card_com.xpath('h4/a/text()').getall()).strip()
                url = card_com.xpath('h4/a/@href').get()
                view_count = card_com.xpath('div/p/span[2]/text()').get()
                comment_count = card_com.xpath('div/p/span[3]/text()').get()
                yield {
                    "rankType": self.rank_type,
                    "nickName": nick_name,
                    "codeAge": age,
                    "articleTitle": title,
                    "commentCount": comment_count,
                    "viewCount": view_count,
                    "articleUrl": url
                }
        except Exception as e:
            logging.warning("Problem while visiting the page: {0}".format(str(e)))
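Because WeekArticleSpider and AllArticleSpider read the author CSVs in their __init__, the author spiders have to finish before the article spiders start. One way to enforce that order is Scrapy's documented pattern for running spiders sequentially in one process with CrawlerRunner; the driver below is only a sketch, and the spider import path has to be adjusted to the actual project layout.

# Run the spiders sequentially so the author CSVs exist before the article spiders start.
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

# from <project>.spiders import SummaryArticleSpider, ContentArticleSpider, \
#     WeekAuthorSpider, WeekArticleSpider, AllAuthorSpider, AllArticleSpider

configure_logging()
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(WeekAuthorSpider)     # author ranks first...
    yield runner.crawl(AllAuthorSpider)
    yield runner.crawl(WeekArticleSpider)    # ...then the spiders that read the author CSVs
    yield runner.crawl(AllArticleSpider)
    yield runner.crawl(SummaryArticleSpider)
    yield runner.crawl(ContentArticleSpider)
    reactor.stop()

crawl()
reactor.run()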
Crawl Output
Six files are produced:
- SummaryRankArticle.csv: title, comments, favorites, views, article URL
- ContentRankArticle.csv: domain, title, comments, favorites, views, article URL
- WeekRankAuthor.csv: blogger, homepage URL
- WeekRankArticle.csv: blogger, coding age, title, comments, views, article URL
- AllRankAuthor.csv: blogger, homepage URL
- AllRankArticle.csv: blogger, coding age, title, comments, views, article URL
Analyzing Title Hot Words
Segment the titles with jieba, clean the results, then draw word clouds with wordcloud.
#coding=utf-8
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud

def analyze(file_path, result_file_name):
    # Words to exclude (punctuation and common Chinese stop words)
    exclude_words = ['(', ')', '(', ')', '的', '了', '与', '和', '中', '你', '我']
    # All titles
    titles = []
    with open(file_path, 'r') as lines:
        next(lines)  # skip the CSV header
        for line in lines:
            # NOTE: takes the first CSV column; adjust the index if the title is not the first column.
            titles.append(line.split(",")[0].strip())
    # Segment the concatenated titles with jieba (full mode)
    segments = list(jieba.cut("".join(titles), cut_all=True))
    # Drop the excluded words
    result = " ".join(filter(lambda w: w not in exclude_words, segments))
    # Draw the word cloud (the font path below is macOS-specific; point it at a local CJK font on other systems)
    wc = WordCloud(
        font_path='/Library/Fonts/Arial Unicode.ttf',
        background_color='white',
        max_font_size=100,
        min_font_size=10,
        max_words=200
    )
    wc.generate(result)
    wc.to_file('{0}.png'.format(result_file_name))
    # Show the word cloud
    plt.figure(result_file_name)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

if __name__ == '__main__':
    date = '20210719'
    file_dict = [{
        "rank_type": "AllRankArticle",
        "result_name": "综合热门博主文章热词"
    }, {
        "rank_type": "ContentRankArticle",
        "result_name": "领域热榜文章热词"
    }, {
        "rank_type": "SummaryRankArticle",
        "result_name": "综合热榜文章热词"
    }, {
        "rank_type": "WeekRankArticle",
        "result_name": "周热榜博主文章热词"
    }]
    for file in file_dict:
        file_path = "{0}/{1}.csv".format(date, file["rank_type"])
        result_file_name = file["result_name"]
        analyze(file_path, result_file_name)
Analysis Results
Per-domain hot-rank article hot words (word cloud)
Overall hot-rank article hot words (word cloud)
Weekly-rank bloggers' article hot words (word cloud)
Overall top bloggers' article hot words (word cloud)
Conclusions
Putting the results together, the following writing directions can serve as a reference:
- For programming languages, lean toward Python and Java
- Highbrow content finds few takers: write more beginner-level installation and usage tutorials, or ready-to-use feature implementations
- Interview questions are an evergreen search keyword; readers are, on the whole, quite pragmatic