Our final project required us to find a dataset on Tianchi or crawl one ourselves for analysis, so I tried writing a small Python crawler to scrape part of the player data from Dongqiudi. Only part, because there are far too many players and scraping all of them would take a very long time. The code is attached below; I'm a beginner, so it may not look very polished.
The Dongqiudi player page:
From some testing, the index after /player/ starts at 50000001 and runs up to about 5064****, which means Dongqiudi holds over 600,000 player records in total.
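Before running the full crawl, the URL pattern can be spot-checked with a few probe requests. The sketch below is only a minimal illustration (the User-Agent string and the two probe IDs are placeholders I picked for the example); it uses the same test as the main script further down, namely whether the page contains a <p class="china-name"> element.
import urllib.request
import urllib.error
from bs4 import BeautifulSoup

def is_valid_player(num):
    # Build the player URL from the numeric index and fetch the page
    url = "https://www.dongqiudi.com/player/%s.html" % num
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        html = urllib.request.urlopen(req).read().decode("utf-8")
    except urllib.error.URLError:
        return False
    # A real player page carries a <p class="china-name"> element
    return BeautifulSoup(html, "html.parser").find('p', attrs={'class': 'china-name'}) is not None

# Probe a couple of indices near the ends of the observed range (example IDs)
for num in (50000001, 50184113):
    print(num, is_valid_player(num))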
The fields collected are (the matching CSV header row is sketched right after the list):
- Player name
- Club
- Nationality
- Height
- Position (some entries are coaches or referees)
- Age
- Weight
- Shirt number
- Birthday
- Preferred foot
- Career length (years)
- Total appearances
- Total goals
- Total assists
- Total yellow cards
- Total red cards
- Overall rating
- Speed rating
- Strength rating
- Defence rating
- Dribbling rating
- Passing rating
- Shooting rating
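For reference, this is the header row that matches the field list above. It is the same row that sits commented out near the bottom of the script, and it only needs to be written once when the CSV file is first created; after that the crawler appends player rows in the same column order.
import csv

# Write the header row once before the crawl starts appending player rows
with open("足球运动员.csv", mode="w", encoding="utf-8", newline="") as f:
    csv.writer(f).writerow(["姓名", "俱乐部", "国籍", "身高(CM)", "位置", "年龄(岁)", "体重(KG)", "号码", "生日", "惯用脚",
                            "职业生涯(年)", "累计出场", "累计进球", "累计助攻", "累计黄牌", "累计红牌",
                            "综合能力", "速度", "力量", "防守", "盘带", "传球", "射门"])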
import urllib.request
import urllib.error
import csv
from bs4 import BeautifulSoup
from lxml import etree


# Check whether a player page exists for the given index
def checkHtml(num):
    url = "https://www.dongqiudi.com/player/%s.html" % num
    html = askURL(url)
    soup = BeautifulSoup(html, "html.parser")
    name = soup.find('p', attrs={'class': 'china-name'})
    if name is None:
        print('无效网站')
        return 'none'
    else:
        return soup
# Fetch one player's data and append it to the CSV file
def getData(soup):
    # url = "https://www.dongqiudi.com/player/%s.html" % num
    # html = askURL(url)
    # soup = BeautifulSoup(html, "html.parser")
    # Name
    name = soup.find('p', attrs={'class': 'china-name'})
    name = str(name)
    con = etree.HTML(name)
    namestr = con.xpath("//p/text()")
    name = namestr[0]
    print(name)
    # Collect the basic-info items (the <li> elements under div.detail-info)
    detail_list = []
    detail_info_div = soup.find('div', attrs={'class': 'detail-info'})
    # con2 = etree.HTML(detail_info_div)
    detail_info_ul = detail_info_div.find_all('li')
    for each in detail_info_ul:
        detail = each.text.strip()
        detail_list.append(detail)
    # print(detail_list)
    # Club
    club = str(detail_list[0]).replace('俱乐部:', '')
    # print('俱乐部', club)
    # Nationality
    contry = str(detail_list[1]).replace('国 籍:', '')
    # print('国籍', contry)
    # Height
    height = 0
    heightstr = str(detail_list[2]).replace('CM', '')
    heightstr = heightstr.replace('身 高:', '')
    if heightstr != '':
        height = int(heightstr)
    # print('身高', height)
    # Position
    location = str(detail_list[3]).replace('位 置:', '')
    # print('位置', location)
    # Age
    age = 0
    agestr = str(detail_list[4]).replace('年 龄:', '')
    agestr = agestr.replace('岁', '')
    if agestr != '':
        age = int(agestr)
    # print('年龄', age)
    # Weight
    weight = 0
    weightstr = str(detail_list[5]).replace('体 重:', '')
    weightstr = weightstr.replace('KG', '')
    if weightstr != '':
        weight = int(weightstr)
    # print('体重', weight)
    # Shirt number
    number = 0
    numberstr = str(detail_list[6]).replace('号 码:', '')
    numberstr = numberstr.replace('号', '')
    if numberstr != '':
        number = int(numberstr)
    # print('号码', number)
    # Birthday
    birth = str(detail_list[7]).replace('生 日:', '')
    # print(birth)
    # Preferred foot
    foot = str(detail_list[8]).replace('惯用脚:', '')
    # print(foot)
    # Collect the club career stats (div.total-con-wrap)
    total_con_wrap_div = soup.find('div', attrs={'class': 'total-con-wrap'})
    total_con_wrap_td = str(total_con_wrap_div.find_all('p', attrs={'class': 'td'}))
    con3 = etree.HTML(total_con_wrap_td)
    # Each season row exposes 9 <span> cells, so the loops below step through the
    # text list with a stride of 9 (offset 2 = appearances, 4 = goals, 5 = assists,
    # 6 = yellow cards, 7 = red cards); cells showing '~' are counted as 0
    detail_info_list = con3.xpath("//p//span/text()")
    detail_info_list_years = con3.xpath("//p")
    # First-team career length (years)
    years = len(detail_info_list_years) - 1
    # print('一线队时长', len(detail_info_list_years) - 1)
    # Total appearances
    total_session = 0
    for i in range(2, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_session = total_session + int(detail_info_list[i])
    # print('累计出场数', total_session)
    # Total goals
    total_goals = 0
    for i in range(4, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_goals = total_goals + int(detail_info_list[i])
    # print('累计进球数', total_goals)
    # Total assists
    total_assist = 0
    for i in range(5, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_assist = total_assist + int(detail_info_list[i])
    # print('累计助攻数', total_assist)
    # Total yellow cards
    total_yellow_card = 0
    for i in range(6, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_yellow_card = total_yellow_card + int(detail_info_list[i])
    # print('累计黄牌数', total_yellow_card)
    # Total red cards
    total_red_card = 0
    for i in range(7, len(detail_info_list), 9):
        if detail_info_list[i] == '~':
            detail_info_list[i] = 0
        total_red_card = total_red_card + int(detail_info_list[i])
    # print('累计红牌数', total_red_card)
    # Overall rating
    average = 0
    speed = 0
    power = 0
    guard = 0
    dribbling = 0
    passing = 0
    shooting = 0
    grade_average = soup.find('p', attrs={'class': 'average'})
    if grade_average is not None:
        con4 = etree.HTML(str(grade_average))
        average = con4.xpath("//p//b/text()")
        average = int(average[0])
        # print('综合能力', average)
    # Detailed ratings
    grade_detail_div = soup.find('div', attrs={'class': 'box_chart'})
    if grade_detail_div is not None:
        con5 = etree.HTML(str(grade_detail_div))
        grade_detail = con5.xpath("//div//span/text()")
        # Speed
        speed = int(grade_detail[0])
        # print(speed)
        # Strength
        power = int(grade_detail[1])
        # print(power)
        # Defence
        guard = int(grade_detail[2])
        # print(guard)
        # Dribbling
        dribbling = int(grade_detail[3])
        # print(dribbling)
        # Passing
        passing = int(grade_detail[4])
        # print(passing)
        # Shooting
        shooting = int(grade_detail[5])
        # print(shooting)
    # Write the row to the CSV file
    csv.writer(f).writerow([name, club, contry, height, location, age, weight, number, birth, foot, years, total_session,
                            total_goals, total_assist, total_yellow_card, total_red_card, average, speed, power,
                            guard, dribbling, passing, shooting])
# Fetch the HTML content of the given URL
def askURL(url):
    head = {  # Simulate browser headers so the request is not rejected by the server
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29"
    }
    # The User-Agent tells the server what kind of client/browser is asking
    # (essentially, what kind of content we are able to receive)
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
f = open("足球运动员.csv", mode="a", encoding='utf-8', newline='')
# csv.writer(f).writerow(["姓名","俱乐部","国籍","身高(CM)","位置","年龄(岁)","体重(KG)","号码","生日","惯用脚","职业生涯(年)",
#                         "累计出场","累计进球","累计助攻","累计黄牌","累计红牌","综合能力","速度","力量","防守","盘带","传球","射门"])
for num in range(50184113, 50184150):
    print(num)
    soup = checkHtml(num)
    if soup != 'none':
        getData(soup)
        # getData(num)
f.close()
Result screenshot:
It took about 4-5 hours and collected just over 30,000 records in total.
I have also put the source code and the .txt file on Gitee; this is my repository: https://gitee.com/lizengkunnb666/lzk.git
This is my first attempt at writing a somewhat larger crawler, so it is still fairly rough.