Preface
A Baidu search result generally contains a title, an abstract, a URL, and a time. This post shows how to collect roughly 30 pages of Baidu search results for a query into a CSV file.
How it works
Scraping Baidu is fairly simple: a request that looks like it comes from a browser is enough to fetch the data we need. The URL for a given result page of a keyword search is built as:
"http://www.baidu.com/s?wd={}&pn={}".format(urllib.parse.quote(word),number)
After that, it is just a matter of parsing the corresponding tags and extracting the information.
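As a quick illustration, here is a minimal sketch of fetching one result page. The pn parameter advances in steps of 10 (0 for page 1, 10 for page 2, and so on), and the User-Agent header is only there so the request looks like an ordinary browser; the query word "国庆节" is just an example.

import urllib.parse
import requests

word = "国庆节"          # example query keyword
page = 2                 # 1-based page number we want
pn = (page - 1) * 10     # Baidu's pn parameter advances in steps of 10
url = "http://www.baidu.com/s?wd={}&pn={}".format(urllib.parse.quote(word), pn)
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
print(resp.status_code, len(resp.text))   # sanity check that the page came back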
Because we also want a keyword for each result, the abstract is segmented with jieba after parsing; stop words are then removed using a stop-word list, and the most frequent remaining words are taken as keywords.
The stop-word list used by the code can be downloaded here: https://pan.baidu.com/s/1SOMFPaQodZPUyJncQCo-Qw (extraction code: 7ipf).
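To make the keyword step concrete, a minimal sketch of the segment-filter-count idea looks like the snippet below; the tiny stop_words set is only a placeholder for the full list downloaded above.

import jieba
from collections import Counter

stop_words = {"的", "了", "是"}   # placeholder; load the full list from the file above
abstract = "国庆节是中华人民共和国的国庆节,举国欢庆国庆节。"
# Segment the abstract, drop stop words and single characters, then count frequencies.
words = [w for w in jieba.cut(abstract) if w not in stop_words and len(w) > 1]
print(Counter(words).most_common(3))   # the most frequent words become the keywords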
Code
from bs4 import BeautifulSoup
import requests
import re
import urllib.parse
import jieba
from collections import Counter
import csv
# Request headers that make the script look like a regular browser.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
}
def remove_stop_word(word_list):
    # Load the stop-word list (one word per line, GBK encoded).
    with open("./stop_words_ch.txt", 'r', encoding='GBK') as f:
        stopWordList = f.read().strip().splitlines()
    # A few extra noise words that show up in Baidu abstracts.
    stopWordList.extend(["天前", "来源", "位于", "当年", "为此", "pdf", "https", "http"])
    stopWords = set(stopWordList)
    # Keep duplicates so that word frequencies survive for the keyword step.
    return [word for word in word_list if word not in stopWords]
def geturl(num_pages, word):
    result = []
    for i in range(num_pages):
        print("==>Parsing page {}, total pages {}.".format(i + 1, num_pages))
        # Baidu pages its results in steps of 10: pn=0, 10, 20, ...
        number = i * 10
        path = "http://www.baidu.com/s?wd={}&pn={}".format(urllib.parse.quote(word), number)
        content = requests.get(path, headers=headers)
        soup = BeautifulSoup(content.text, 'html.parser')
        # Each search result sits in one container div.
        tagh3 = soup.find_all('div', class_='result c-container new-pmd')
        for h3 in tagh3:
            msg_dict = {"title": "", "abstract": "", "url": "", "time": "", "key_word": "", "class": ""}
            # Title, abstract and URL are required; skip the result if any is missing.
            try:
                title = h3.find(name="h3", attrs={"class": re.compile("t")}).find('a').text.replace("\"", "")
                abstract = h3.find(name="div", attrs={"class": re.compile("c-abstract")}).text.replace("\"", "")
                url = h3.find(name="a", attrs={"class": re.compile("c-showurl")}).get('href')
            except AttributeError:
                continue
            # Time and source are optional.
            try:
                time = h3.find(name="div", attrs={"class": re.compile("c-abstract")}).find(name='span').text
            except AttributeError:
                time = ""
            try:
                source = h3.find(name="div", class_="f13 c-gap-top-xsmall se_st_footer user-avatar").find(name='a').text
            except AttributeError:
                source = ""
            msg_dict["title"] = title
            msg_dict["abstract"] = abstract
            msg_dict["url"] = url
            msg_dict["time"] = time
            # Rough classification based on the displayed source text.
            if "www" in source:
                msg_dict["class"] = "网站"
            elif "网" in source:
                msg_dict["class"] = "新闻"
            elif "百度" in source:
                msg_dict["class"] = "百度文库"
            else:
                msg_dict["class"] = "其他"
            # Segment the abstract, drop stop words, and keep the most frequent word as the keyword.
            seg_word = jieba.cut(abstract)
            seg_word = remove_stop_word(seg_word)
            collection_words = Counter(seg_word)
            msg_dict["key_word"] = ""
            for word_candidate, _ in collection_words.most_common(3):
                # Skip single characters and words that start with a digit.
                if len(word_candidate) == 1 or word_candidate[0] in "0123456789":
                    continue
                msg_dict["key_word"] = word_candidate
                break
            result.append(msg_dict)
    return result
if __name__ == '__main__':
    # newline='' prevents the csv module from writing blank lines on Windows.
    with open('result.csv', 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["序号", "标题", "摘要", "网址", "时间", "关键词", "类别"])
        res = geturl(num_pages=30, word="国庆节")
        print("==>Start writing results to CSV file...")
        for index, r in enumerate(res):
            csv_writer.writerow([str(index + 1), r['title'], r['abstract'], r["url"], r["time"], r["key_word"], r["class"]])
    print("==>Finished! Total msg number is {}".format(len(res)))