Background
CSDN is home to many excellent technical writers whose articles draw large numbers of views, follows, and likes from fellow developers. Setting aside one's own technical specialty, how should the rest of us write a hit article? Should we write deep, hard-to-digest analysis, or beginner-friendly introductions? And which knowledge areas and technical topics do readers actually care about most?
Goal
By analyzing the titles on CSDN's hot-rank lists, work out how to write an eye-catching title and which hot words appear in popular articles.
Implementation
Tech Stack
- Scrapy for data crawling
- jieba for Chinese word segmentation
- wordcloud for generating word-cloud images
Preliminary Analysis
Using the browser's developer tools, the following URLs turn out to be the ones we need.
Site-wide overall hot rank
https://blog.csdn.net/phoenix/web/blog/hotRank?page=0&pageSize=20
Per-domain content hot rank
https://blog.csdn.net/phoenix/web/blog/hotRank?page=0&pageSize=20&child_channel=大数据
Weekly author rank
https://blog.csdn.net/phoenix/web/blog/weeklyRank?page=0&pageSize=20
Overall author rank
https://blog.csdn.net/phoenix/web/blog/allRank?page=0&pageSize=20
From the weekly and overall author ranks we can get each popular author's homepage; opening a homepage and sorting its articles by view count gives us that author's most-viewed articles.
Most-viewed articles of a popular blogger
https://blog.csdn.net/qq_37751911/?orderby=ViewCount
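Before writing any spiders, it is worth sanity-checking one of these endpoints. The snippet below is only a quick check, not part of the crawler; the JSON fields it reads (code, data, articleTitle, viewCount) are the ones the spiders later consume, and the User-Agent header is a precaution, since the API may additionally expect cookies or other headers.

# Quick sanity check of the hot-rank API (not part of the crawler itself)
import requests

url = "https://blog.csdn.net/phoenix/web/blog/hotRank?page=0&pageSize=20"
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
data = resp.json()
if data.get("code") == 200:
    for article in data["data"]:
        print(article["articleTitle"], article["viewCount"])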
Content Crawling
Creating the Spiders
Create six spiders:
- SummaryArticleSpider: crawls the article list of the site-wide overall hot rank
- ContentArticleSpider: crawls the article lists of the per-domain content hot ranks
- WeekAuthorSpider: crawls the weekly-rank author list, including homepage links
- WeekArticleSpider: visits each weekly-rank author's homepage and crawls their most-viewed articles
- AllAuthorSpider: crawls the overall-rank author list, including homepage links
- AllArticleSpider: visits each overall-rank author's homepage and crawls their most-viewed articles
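Only ContentArticleSpider and WeekArticleSpider are reproduced in full below. For the author-list spiders, a minimal sketch of WeekAuthorSpider following the same project conventions might look like the following; note that the JSON field names nickName and userName are assumptions, not verified against the actual weeklyRank response.

# WeekAuthorSpider (sketch; the field names used in parse() are assumptions)
import json
import os

import scrapy

class WeekAuthorSpider(scrapy.Spider):
    name = 'week_author'
    allowed_domains = ['blog.csdn.net']
    start_url = 'https://blog.csdn.net/phoenix/web/blog/weeklyRank?page={0}&pageSize={1}'
    rank_type = RANK_TYPES[2]  # project-level constant, as in the other spiders

    def __init__(self):
        parent_dir = "{0}/{1}".format(BASE_OUTPUT_DIR, CURRENT_DATE)
        if not os.path.isdir(parent_dir):
            os.mkdir(parent_dir)
        with open("{0}/{1}.csv".format(parent_dir, self.rank_type), mode="w") as f:
            f.write("博主,主页链接\n")  # blogger, homepage URL

    def start_requests(self):
        for page_num in range(MAX_PAGE_NUM):
            yield scrapy.Request(self.start_url.format(page_num, PAGE_SIZE),
                                 callback=self.parse,
                                 dont_filter=True)

    def parse(self, response):
        response_json = json.loads(response.text)
        if response_json['code'] == 200:
            for author in response_json['data']:
                # 'nickName' and 'userName' are assumed field names; inspect the real response first.
                yield {
                    "rankType": self.rank_type,
                    "nickName": author.get('nickName'),
                    "blogUrl": "https://blog.csdn.net/{0}".format(author.get('userName'))
                }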
# ContentArticleSpider
import json
import logging
import os
import urllib.parse

import scrapy

# RANK_TYPES, BASE_OUTPUT_DIR, CURRENT_DATE, MAX_PAGE_NUM and PAGE_SIZE are
# project-level constants; their definitions are omitted here.

class ContentArticleSpider(scrapy.Spider):
    name = 'content'
    allowed_domains = ['blog.csdn.net']  # domains only, not full URLs
    start_url = 'https://blog.csdn.net/phoenix/web/blog/hotRank?page={0}&pageSize={1}&child_channel={2}'
    content_types = ["c/c++", "java", "javascript", "php", "python", "人工智能", "区块链", "大数据",
                     "移动开发", "嵌入式", "开发工具", "数据结构与算法", "测试", "游戏", "网络", "运维"]
    rank_type = RANK_TYPES[1]

    def __init__(self):
        # Create today's output directory and write the CSV header.
        parent_dir = "{0}/{1}".format(BASE_OUTPUT_DIR, CURRENT_DATE)
        if not os.path.isdir(parent_dir):
            os.mkdir(parent_dir)
            logging.info("Directory {0} created".format(parent_dir))
        article_file_name = "{0}/{1}.csv".format(parent_dir, self.rank_type)
        with open(article_file_name, mode="w") as f:
            f.write("知识领域,标题,评论数,收藏数,浏览数,文章链接\n")  # domain, title, comments, favorites, views, article URL

    def start_requests(self):
        # One request per domain per page of the hot-rank API.
        for content_type in self.content_types:
            for page_num in range(MAX_PAGE_NUM):
                yield scrapy.Request(self.start_url.format(page_num, PAGE_SIZE, content_type.lower()),
                                     callback=self.parse,
                                     dont_filter=True)

    def parse(self, response):
        response_str = str(response.body, 'utf-8')
        response_json = json.loads(response_str)
        # The domain is the last query parameter of the request URL.
        content_type = urllib.parse.unquote(str(response.url).split("=")[-1])
        if response_json['code'] == 200:
            articles = response_json['data']
            for article in articles:
                article_title = article['articleTitle']
                article_url = article['articleDetailUrl']
                comment_count = article['commentCount']
                favor_count = article['favorCount']
                view_count = article['viewCount']
                yield {
                    "rankType": self.rank_type,
                    "contentType": content_type,
                    "articleTitle": article_title,
                    "articleUrl": article_url,
                    "commentCount": comment_count,
                    "favorCount": favor_count,
                    "viewCount": view_count
                }
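The spiders only yield item dicts; the item pipeline that appends them to the CSV files created in each spider's __init__ is not shown in this article. A minimal sketch of such a pipeline follows; the class name CsvWriterPipeline and the "write everything except rankType" convention are assumptions, and the pipeline would be enabled through ITEM_PIPELINES in settings.py.

# Sketch of a CSV-writing item pipeline (not the article's original pipeline)
class CsvWriterPipeline:
    def process_item(self, item, spider):
        parent_dir = "{0}/{1}".format(BASE_OUTPUT_DIR, CURRENT_DATE)
        file_name = "{0}/{1}.csv".format(parent_dir, item["rankType"])
        # Append every field except rankType; a real pipeline would map fields
        # to the exact column order of the CSV header.
        values = [str(v) for k, v in item.items() if k != "rankType"]
        with open(file_name, mode="a") as f:
            f.write(",".join(values) + "\n")
        return item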
# WeekArticleSpider
import logging
import os

import scrapy
from scrapy.selector import Selector
from retry import retry  # third-party 'retry' package

class WeekArticleSpider(scrapy.Spider):
    name = 'week_article'
    allowed_domains = ['blog.csdn.net']
    start_url = 'https://blog.csdn.net/{0}'  # unused below; homepage URLs come from the weekly author CSV
    urls = {}
    rank_type = RANK_TYPES[4]

    def __init__(self):
        parent_dir = "{0}/{1}".format(BASE_OUTPUT_DIR, CURRENT_DATE)
        author_file_name = "{0}/{1}.csv".format(parent_dir, RANK_TYPES[2])
        if not os.path.isdir(parent_dir):
            os.mkdir(parent_dir)
            logging.info("Directory {0} created".format(parent_dir))
        # Read the weekly author CSV (blogger, homepage URL) and sort each homepage by view count.
        with open(author_file_name, mode='r') as lines:
            next(lines)  # skip the header row
            for line in lines:
                self.urls[line.split(",")[0]] = "{0}?orderby=ViewCount".format(line.split(",")[1].strip())
        article_file_name = "{0}/{1}.csv".format(parent_dir, self.rank_type)
        with open(article_file_name, mode="w") as f:
            f.write("博主,码龄,标题,评论数,浏览数,文章链接\n")  # blogger, coding age, title, comments, views, article URL

    def start_requests(self):
        for user_name, url in self.urls.items():
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 dont_filter=True)

    @retry(tries=3, delay=5000)
    def parse(self, response):
        response_html = Selector(response)
        try:
            # Coding age and nickname come from the profile sidebar.
            age = response_html.xpath('//*[@id="asideProfile"]/div[1]/div[2]/div[2]/span[1]/text()').get().replace("码龄", "")
            nick_name = response_html.xpath('//*[@id="uid"]/span/text()').get()
            # One card per article in the blog list.
            card_coms = response_html.xpath('//*[@id="articleMeList-blog"]/div[2]/div[*]')
            for card_com in card_coms:
                title = "".join(card_com.xpath('h4/a/text()').getall()).strip()
                url = card_com.xpath('h4/a/@href').get()
                view_count = card_com.xpath('div/p/span[2]/text()').get()
                comment_count = card_com.xpath('div/p/span[3]/text()').get()
                yield {
                    "rankType": self.rank_type,
                    "nickName": nick_name,
                    "codeAge": age,
                    "articleTitle": title,
                    "commentCount": comment_count,
                    "viewCount": view_count,
                    "articleUrl": url
                }
        except Exception as e:
            logging.warning("Problem while visiting the page: {0}".format(str(e)))
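Because WeekArticleSpider and AllArticleSpider read the author CSVs in their __init__, the author spiders have to finish before the article spiders start. One way to enforce that order is Scrapy's documented pattern for running spiders sequentially in one process with CrawlerRunner; the driver below is only a sketch, and the spider import path has to be adjusted to the actual project layout.

# Run the spiders sequentially so the author CSVs exist before the article spiders start.
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

# from <project>.spiders import SummaryArticleSpider, ContentArticleSpider, \
#     WeekAuthorSpider, WeekArticleSpider, AllAuthorSpider, AllArticleSpider

configure_logging()
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(WeekAuthorSpider)     # author ranks first...
    yield runner.crawl(AllAuthorSpider)
    yield runner.crawl(WeekArticleSpider)    # ...then the spiders that read the author CSVs
    yield runner.crawl(AllArticleSpider)
    yield runner.crawl(SummaryArticleSpider)
    yield runner.crawl(ContentArticleSpider)
    reactor.stop()

crawl()
reactor.run()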
Crawl Output
Six files are produced:
- SummaryRankArticle.csv: title, comments, favorites, views, article URL
- ContentRankArticle.csv: domain, title, comments, favorites, views, article URL
- WeekRankAuthor.csv: blogger, homepage URL
- WeekRankArticle.csv: blogger, coding age, title, comments, views, article URL
- AllRankAuthor.csv: blogger, homepage URL
- AllRankArticle.csv: blogger, coding age, title, comments, views, article URL
Analyzing Title Hot Words
Segment the titles with jieba, clean the results, then draw word clouds with wordcloud.
#coding=utf-8
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud

def analyze(file_path, result_file_name):
    # Words to exclude (punctuation and common Chinese stop words)
    exclude_words = ['(', ')', '(', ')', '的', '了', '与', '和', '中', '你', '我']
    # All titles
    titles = []
    with open(file_path, 'r') as lines:
        next(lines)  # skip the CSV header
        for line in lines:
            # NOTE: takes the first CSV column; adjust the index if the title is not the first column.
            titles.append(line.split(",")[0].strip())
    # Segment the concatenated titles with jieba (full mode)
    segments = list(jieba.cut("".join(titles), cut_all=True))
    # Drop the excluded words
    result = " ".join(filter(lambda w: w not in exclude_words, segments))
    # Draw the word cloud (the font path below is macOS-specific; point it at a local CJK font on other systems)
    wc = WordCloud(
        font_path='/Library/Fonts/Arial Unicode.ttf',
        background_color='white',
        max_font_size=100,
        min_font_size=10,
        max_words=200
    )
    wc.generate(result)
    wc.to_file('{0}.png'.format(result_file_name))
    # Show the word cloud
    plt.figure(result_file_name)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

if __name__ == '__main__':
    date = '20210719'
    file_dict = [{
        "rank_type": "AllRankArticle",
        "result_name": "综合热门博主文章热词"
    }, {
        "rank_type": "ContentRankArticle",
        "result_name": "领域热榜文章热词"
    }, {
        "rank_type": "SummaryRankArticle",
        "result_name": "综合热榜文章热词"
    }, {
        "rank_type": "WeekRankArticle",
        "result_name": "周热榜博主文章热词"
    }]
    for file in file_dict:
        file_path = "{0}/{1}.csv".format(date, file["rank_type"])
        result_file_name = file["result_name"]
        analyze(file_path, result_file_name)
Analysis Results
Per-domain hot-rank article hot words (word cloud)
Overall hot-rank article hot words (word cloud)
Weekly-rank bloggers' article hot words (word cloud)
Overall top bloggers' article hot words (word cloud)
Conclusions
Putting the results together, the following writing directions can serve as a reference:
- For programming languages, lean toward Python and Java
- Highbrow content finds few takers: write more beginner-level installation and usage tutorials, or ready-to-use feature implementations
- Interview questions are an evergreen search keyword; readers are, on the whole, quite pragmatic