目录
一、爬取网页基本信息
二、将CSV文件转换为XLSX格式
一、爬取网页基本信息
网页头 Header:
网页URL:https://movie.douban.com/top250?start=0&filter=
网页源代码:
?
#通过re来提取想要的有效信息
import re
import csv
import requests
def fun():
    """Scrape the Douban Top 250 list pages and append the rows to ``data.csv``.

    Requests ten list pages (selected with ``start=0, 25, ..., 225``), matches
    each entry with a regular expression, and appends one CSV row per match in
    the order: link, name, year, rating text, and the text of the ``<span>``
    that follows the rating.

    The file is opened in append mode, so repeated runs add to existing rows.

    Raises:
        requests.RequestException: propagated from ``requests.get`` (for
            example when a page does not respond within the timeout).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55"
    }
    # Compiled once up front: the pattern is identical for every page.
    # NOTE(review): the ``year`` group is terminated by a literal space; the
    # page markup presumably has ``&nbsp;`` at that position -- confirm
    # against the live HTML before relying on the year column.
    obj = re.compile(r'<li>.*?<em class="">.*?</em>.*?<a href="(?P<link>.*?)">.*?'
                     r'<span class="title">(?P<name>.*?)</span>.*?'
                     r'<p class="">.*?<br>(?P<year>.*?) .*?'
                     r'<span class="rating_num" property="v:average">(?P<content>.*?)</span>.*?'
                     r'<span>(?P<num>.*?)</span>', re.S)
    # One handle for the whole run, closed (and flushed) by the context
    # manager. newline='' leaves line endings to the csv module so no blank
    # rows appear between records.
    with open("data.csv", "a", newline='') as f:
        csvwriter = csv.writer(f)
        for i in range(10):
            print(f"正在爬取第{i}页")
            url = "https://movie.douban.com/top250?start={}&filter=".format(i * 25)
            # A timeout keeps one stalled connection from hanging the run.
            resp = requests.get(url, headers=headers, timeout=10)
            for it in obj.finditer(resp.text):
                csvwriter.writerow((
                    it.group("link"),
                    it.group("name"),
                    it.group("year").strip(),
                    it.group("content"),
                    it.group("num"),
                ))
# Script entry point: start the scraper only when this file is run directly.
if __name__ == "__main__":
    fun()
二、将CSV文件转换为XLSX格式
import csv
import xlwt
def csv_to_xlsx():
    """Copy the rows of ``data.csv`` into a one-sheet workbook saved as ``1.xlsx``.

    Row 0 of the sheet named ``data`` receives a fixed five-column header;
    every CSV row is written below it, one cell per field. All cells use a
    horizontally and vertically centered style.

    Raises:
        FileNotFoundError: if ``data.csv`` does not exist.
    """
    # Shared cell style: centered in both directions.
    centered = xlwt.XFStyle()
    alignment = xlwt.Alignment()
    alignment.horz = xlwt.Alignment.HORZ_CENTER
    alignment.vert = xlwt.Alignment.VERT_CENTER
    centered.alignment = alignment

    header = ["链接", "电影名称", "年份", "评分", "评价人数"]

    # newline='' is what the csv module expects for files it reads, so that
    # newlines embedded in quoted fields are interpreted correctly.
    with open('data.csv', 'r', newline='') as f:
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet('data')

        for col, title in enumerate(header):
            sheet.write(0, col, title, style=centered)

        # Data rows start at sheet row 1, directly below the header.
        for row, line in enumerate(csv.reader(f), start=1):
            for col, value in enumerate(line):
                sheet.write(row, col, value, style=centered)

    # NOTE(review): xlwt produces the legacy .xls (BIFF) format, so the
    # ".xlsx" extension may not match the file contents -- confirm before
    # changing; the name is kept because other code may expect it.
    workbook.save('1.xlsx')
# Script entry point: run the conversion only when this file is run directly.
if __name__ == "__main__":
    csv_to_xlsx()
?
|