Python爬虫教程(超详细)
下面以爬取亚马逊图书销售排行榜top100为例
话不多说,直接上代码(都可以直接使用)

1. 获取网页数据,存入 Excel 文件
from urllib import request, error
import xlwt
from bs4 import BeautifulSoup
import re
class Spider:
    """Scrape paginated Amazon bestseller listings into rows of book fields.

    ``itemInfs`` maps a field name to the *source text* of a
    ``re.compile(...)`` call (kept as strings for backward compatibility
    with existing callers). The patterns are evaluated once in
    ``__init__`` instead of once per field per scraped item.
    """

    def __init__(self, baseUrl, itemInfs):
        self.dataList = []      # one [name, author, price, star, judgeNum] row per book
        self.baseUrl = baseUrl  # listing URL; the page number is appended by getData()
        self.itemInfs = itemInfs
        # NOTE(review): eval() is tolerable here only because the pattern
        # strings come from the programmer, never from scraped input.
        self.patterns = {key: eval(src) for key, src in itemInfs.items()}

    def getHtml(self, baseUrl):
        """Fetch one page and return its decoded HTML, or '' on failure."""
        head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        }
        req = request.Request(baseUrl, headers=head)
        html = ""
        try:
            response = request.urlopen(req)
            html = response.read().decode("utf-8")
        except error.URLError as e:
            # Best-effort logging, matching the original behavior: report
            # the HTTP status and/or reason, then return the empty string.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html

    def _first(self, key, text):
        """Return the first match of pattern `key` in `text`, or ''."""
        matches = self.patterns[key].findall(text)
        return matches[0] if matches else ''

    def parseData(self, it):
        """Extract [name, author, price, star, judgeNum] from one item's HTML.

        The original implementation round-tripped every field through
        str()/eval() on scraped (untrusted) text — a code-injection risk —
        and appended whole findall() lists, which xlwt cannot write into a
        single cell. Each field is now a plain cleaned string.
        """
        name = self._first("name", it)
        name = name.replace('\n', '').replace(' ', '')
        # Drop bracketed annotations: ASCII (...), fullwidth 【...】 and (...).
        name = re.sub(r"\(.*?\)|\【.*?\】|\(.*?\)", "", name)
        # A book may list several authors; join them so the cell is a string.
        author = ", ".join(self.patterns["author"].findall(it))
        price = self._first("price", it)
        star = self._first("star", it).replace(' ', '')
        judge_num = self._first("judgeNum", it).replace(',', '')
        return [name, author, price, star, judge_num]

    def getData(self):
        """Crawl pages 1..2 of the listing and accumulate parsed rows."""
        for page in range(1, 3):
            html = self.getHtml(self.baseUrl + str(page))
            soup = BeautifulSoup(html, "html.parser")
            for item in soup.find_all('li', class_="zg-item-immersion"):
                row = self.parseData(str(item))
                self.dataList.append(row)
                print(row)

    def saveXls(self, filename):
        """Write a header row (field names) plus one row per book to `filename`."""
        workbook = xlwt.Workbook(encoding="utf-8")
        worksheet = workbook.add_sheet('sheet1')
        for col, key in enumerate(self.itemInfs.keys()):
            worksheet.write(0, col, key)
        for row, book in enumerate(self.dataList):
            for col, value in enumerate(book):
                worksheet.write(row + 1, col, value)
        workbook.save(filename)
        print("保存完成")
def main():
    """Entry point: crawl the Kindle bestseller listing pages and dump
    the collected book rows into an Excel workbook."""
    listing_url = "https://www.amazon.cn/gp/bestsellers/digital-text/ref=zg_bs_pg_2?ie=UTF8&pg="
    # Field name -> source text of a re.compile(...) call; Spider evaluates
    # these strings into real patterns before matching.
    field_patterns = {
        "name": '''re.compile(r'<div aria-hidden="true" .{0,300}?>(.{0,300}?)</div>',re.S)''',
        "author": '''re.compile(r'<span class="a-size-small a-color-base">(.{0,100}?)</span>')''',
        "price": '''re.compile(r'<span class="p13n-sc-price">.(.*?)</span>')''',
        "star": '''re.compile(r'<span class="a-icon-alt">...(.*?).</span>')''',
        "judgeNum": '''re.compile(r'<a class="a-size-small a-link-normal" .{0,300}?>(.{0,100}?)</a>', re.S)''',
    }
    crawler = Spider(listing_url, field_patterns)
    crawler.getData()
    crawler.saveXls('Amazon.xls')


if __name__ == "__main__":
    main()
    print("爬取完毕!")
2. 将excel数据读取出来存入json文件
import pandas as pd

# pd.read_excel already returns a DataFrame, so the original's extra
# pd.DataFrame(...) wrapper was redundant and has been dropped.
df = pd.read_excel(io='Amazon.xls')
# orient="records" emits one JSON object per row: [{"name": ..., ...}, ...]
df.to_json('Amazon1.json', orient="records")
其实可以直接存json文件,但excel比较常用,就也演示一下
3. 数据分析
import pandas as pd
import numpy as np

df = pd.read_json('Amazon1.json', orient='records')
# Treat empty strings (fields the scraper failed to match) as missing,
# then back-fill so every row has a value.  replace() is the idiomatic
# equivalent of the original `df[df.values == ''] = np.nan`.
df = df.replace('', np.nan)
df = df.bfill()  # fillna(method='bfill') is deprecated in pandas >= 2.1
df = df.sort_values(['star'], ascending=False)
print('*******star 排名 60*******')
print(df.iloc[0:60, 0:4])
df = df.sort_values(['price'], ascending=False)
print('*******price 最贵*******')
print(df.iloc[0])
# Column 3 is the star rating (column order from the scraper:
# name, author, price, star, judgeNum) — TODO confirm against the data file.
avg = df.iloc[:, 3].mean()
va = df.iloc[:, 3].var()
print('star 值的均值:{}'.format(avg))
print('star 值的方差:{}'.format(va))
# numeric_only avoids the TypeError df.corr() raises on the string
# columns (name/author) under pandas >= 2.0.
print(df.corr(numeric_only=True))
到这就结束啦,如有问题,大家评论区讨论。