douban_book_comment.py
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    'Content-Type': 'text/html; charset=utf-8',
    'User-Agent': 'Mozilla/5.0',  # replace with a real browser User-Agent string
}
# requests expects cookies as a dict, not a set; fill in your own Douban cookie
cookies = {
    'cookie_name': 'cookie_value',
}

url_1 = "https://book.douban.com/subject/6709783/comments/?start="
url_2 = "&status=P&sort=new_score"

i = 0
while True:
    # each page shows 20 comments; paging is driven by the start parameter
    url = url_1 + str(i * 20) + url_2
    print(url)
    try:
        html = requests.get(url, headers=headers, cookies=cookies)
        soup = BeautifulSoup(html.content, 'lxml')
        comment_time_list = soup.find_all('span', attrs={'class': 'comment-time'})
        if len(comment_time_list) == 0:
            # an empty page means we have walked past the last comment
            break
        use_name_list = soup.find_all('span', attrs={'class': 'comment-info'})
        comment_list = soup.find_all('span', attrs={'class': 'short'})
        # rating spans carry a class such as "allstar40"; comments without a
        # rating are not matched, which can misalign the lists below
        rating_list = soup.find_all('span', attrs={'class': re.compile(r"allstar\d+")})
        for jj in range(len(comment_time_list)):
            data1 = [(comment_time_list[jj].string,
                      use_name_list[jj].a.string,
                      comment_list[jj].string,
                      rating_list[jj].get('class')[1],
                      rating_list[jj].get('title'))]
            data2 = pd.DataFrame(data1)
            data2.to_csv('douban_book.csv', header=False, index=False,
                         mode='a+', encoding="utf-8-sig")
        print('page ' + str(i + 1) + ' is done')
    except Exception as e:
        print('something went wrong on page ' + str(i + 1) + ':', e)
    i = i + 1
    time.sleep(2)  # be polite to the server between requests
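The scraper writes douban_book.csv without a header row, while douban_score_analysis.py and douban_wordcloud.py below select columns by name and treat 'recommend' as a number. A hedged bridging step, assuming the five scraped columns are time, user, comment, star class, and rating title; it adds a header and derives a numeric 1 to 5 rating from the allstarNN class. Run douban_sentiment_analysis.py first, since that script still reads the file positionally with header=None.

# label_columns.py (hypothetical helper, not part of the original scripts)
import pandas as pd

df = pd.read_csv('douban_book.csv', header=None,
                 names=['time', 'user', 'comment', 'star', 'title'])
# map 'allstar10' .. 'allstar50' to the numeric ratings 1 .. 5
df['recommend'] = df['star'].str.extract(r'allstar(\d+)', expand=False).astype(int) // 10
df.to_csv('douban_book.csv', index=False, encoding='utf-8-sig')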
douban_score_analysis.py
import pandas as pd
from collections import Counter

# requires the labeled CSV with a numeric 'recommend' column and the merged
# SnowNLP 'score' column (see the helper steps around the other scripts)
df = pd.read_csv('douban_book.csv')

# distribution of rating labels
recommend_list = df['recommend'].values.tolist()
num_count = Counter(recommend_list)
print(num_count)

# per-rating summary statistics
grouped = df.groupby('recommend').describe().reset_index()
recommend = grouped['recommend'].values.tolist()
print(recommend)

# mean sentiment score per rating
sentiment_average = df.groupby('recommend')['score'].mean()
sentiment_scores = sentiment_average.values
print(sentiment_scores)
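A quick way to check whether the SnowNLP scores track the star ratings is to plot the per-rating means. A minimal sketch, assuming matplotlib is installed and the labeled, merged CSV from the helper steps is in place:

import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('douban_book.csv')
means = df.groupby('recommend')['score'].mean()
means.plot(kind='bar', rot=0, color='steelblue')
plt.xlabel('rating (1-5)')
plt.ylabel('mean SnowNLP sentiment')
plt.tight_layout()
plt.savefig('sentiment_by_rating.png')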
douban_sentiment_analysis.py
import pandas as pd
from snownlp import SnowNLP

# read only the comment text (zero-indexed column 2) from the raw,
# headerless CSV written by the scraper
df = pd.read_csv('douban_book.csv', header=None, usecols=[2])
contents = df.values.tolist()
print(len(contents))

score = []
for content in contents:
    try:
        s = SnowNLP(content[0])
        score.append(s.sentiments)  # in [0, 1]; higher means more positive
    except Exception as e:
        print('something went wrong:', e)
        score.append(0.5)  # fall back to a neutral score
print(len(score))

data2 = pd.DataFrame(score)
# note: mode='a+' appends, so rerunning the script duplicates rows
data2.to_csv('sentiment.csv', header=False, index=False, mode='a+')
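sentiment.csv holds one score per comment but is never joined back to the main table, even though douban_score_analysis.py groups a 'score' column by 'recommend'. A minimal merge sketch, assuming the labeling helper shown earlier has already run and that sentiment.csv is row-aligned with the comments:

# merge_sentiment.py (hypothetical helper)
import pandas as pd

books = pd.read_csv('douban_book.csv')  # the labeled CSV with a header row
scores = pd.read_csv('sentiment.csv', header=None, names=['score'])
books['score'] = scores['score']  # row-aligned with the scraped comments
books.to_csv('douban_book.csv', index=False, encoding='utf-8-sig')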
douban_wordcloud.py
import jieba
import pandas as pd
from wordcloud import WordCloud

def chinese_jieba(text):
    # segment Chinese text with jieba and join the tokens with spaces,
    # which is the input format WordCloud expects
    wordlist_jieba = jieba.cut(text)
    space_wordlist = " ".join(wordlist_jieba)
    return space_wordlist

# requires the labeled CSV with 'comment' and numeric 'recommend' columns
df = pd.read_csv('douban_book.csv')
comment_list = df['comment'].values.tolist()
recommend_list = df['recommend'].values.tolist()

# concatenate the segmented text of all one-star comments
text = ""
for jj in range(len(comment_list)):
    if recommend_list[jj] == 1:
        text = text + chinese_jieba(comment_list[jj])
print(text)  # debug: inspect the segmented corpus

# stop words: the HIT stop-word list plus a few corpus-specific words
stopwords = set()
content = [line.strip() for line in open('hit_stopwords.txt', 'r', encoding='utf-8').readlines()]
stopwords.update(content)
stopwords.update(['说', '本', '书', '豆瓣', '本书', '一本', '想', '写', '阅读', '建议'])
print(stopwords)  # debug: verify the stop-word set

w = WordCloud(font_path="msyh.ttc",  # a font with CJK glyphs is required
              background_color="white",
              max_font_size=150,
              max_words=2000,
              stopwords=stopwords)
w.generate(text)
image = w.to_image()
w.to_file('ciyun1.png')
image.show()
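The original imports of numpy and PIL.Image (dropped above as unused) hint that a shaped word cloud was intended. A minimal sketch of the mask variant, reusing the text and stopwords built above and assuming a hypothetical mask image book.png whose white regions should stay empty:

import numpy as np
import PIL.Image as Image
from wordcloud import WordCloud

mask = np.array(Image.open('book.png'))  # hypothetical mask image
w_mask = WordCloud(font_path="msyh.ttc",
                   background_color="white",
                   mask=mask,  # words are drawn only inside the non-white area
                   max_font_size=150,
                   max_words=2000,
                   stopwords=stopwords)
w_mask.generate(text)
w_mask.to_file('ciyun_mask.png')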