XPath (lxml):
import requests
import time
import csv
from requests import RequestException
from lxml import etree


def get_one_page(url):
    """Fetch one page of the Top 250 list; return its HTML, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            # Session cookie copied from a logged-in browser; replace it with your own if it has expired.
            'Cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None  # request failed
    except RequestException:
        return None  # network error while crawling


def parse():
    moviesList = []
    for i in range(10):  # 10 pages x 25 movies
        url = 'https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
        response = get_one_page(url)
        if response is None:
            continue
        html = etree.HTML(response)
        for getresult in html.xpath('//ol[@class="grid_view"]/li'):
            index = getresult.xpath('./div/div/em/text()')[0]
            name = getresult.xpath('./div/div[2]/div[1]/a[1]/span[1]/text()')[0]
            # First text line of the info block: "导演: ... 主演: ..."
            director_actor = getresult.xpath('./div/div[2]/div[2]/p[1]/text()')[0].strip()
            pos = director_actor.find('主演')
            director = director_actor[:pos].strip() if pos != -1 else director_actor
            actor = director_actor[pos:].strip() if pos != -1 else ''
            movies_release_date = getresult.xpath('./div/div[2]/div[2]/p/text()')[1].strip()[0:4]
            movies_score = getresult.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0]
            movies_score_num = getresult.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0]
            # A few entries have no one-line quote, so guard against an empty result.
            introduce = getresult.xpath('./div/div[2]/div[2]/p[2]/span/text()')
            movies_introduce = introduce[0] if introduce else ''
            moviesList.append([index, name, director, actor, movies_release_date,
                               movies_score, movies_score_num, movies_introduce])
        time.sleep(2)  # pause between pages to avoid hammering the site
    return moviesList


def write_to_file(moviesList):
    with open('xpath_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Headers: rank, title, director, starring, release year, rating, rating count, quote
        fieldnames = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
        writer.writerow(fieldnames)
        writer.writerows(moviesList)


if __name__ == '__main__':
    moviesList = parse()
    write_to_file(moviesList)
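The positional XPath expressions above are easier to follow against the markup they target. Here is a minimal, self-contained sketch run on a simplified, hypothetical fragment of one list item (Douban's real markup carries more attributes and nesting than this), showing what the main expressions resolve to:

from lxml import etree

# Simplified, hypothetical markup for a single ranking entry (assumption:
# the real list item nests the same way, just with extra attributes).
snippet = '''
<ol class="grid_view">
  <li><div class="item">
    <div class="pic"><em>1</em></div>
    <div class="info">
      <div class="hd"><a href="#"><span class="title">肖申克的救赎</span></a></div>
      <div class="bd">
        <p>导演: 弗兰克·德拉邦特   主演: 蒂姆·罗宾斯<br/>1994 / 美国 / 犯罪 剧情</p>
        <div class="star"><span></span><span class="rating_num">9.7</span><span></span><span>2700000人评价</span></div>
        <p class="quote"><span class="inq">希望让人自由。</span></p>
      </div>
    </div>
  </div></li>
</ol>'''

item = etree.HTML(snippet).xpath('//ol[@class="grid_view"]/li')[0]
print(item.xpath('./div/div/em/text()')[0])                      # rank: 1
print(item.xpath('./div/div[2]/div[1]/a[1]/span[1]/text()')[0])  # title
print(item.xpath('./div/div[2]/div[2]/p[1]/text()')[0].strip())  # director/actor line
print(item.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0])   # rating: 9.7
print(item.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0])   # rating count
print(item.xpath('./div/div[2]/div[2]/p[2]/span/text()')[0])     # one-line quote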
Beautiful Soup:
import requests
import time
import csv
from requests.exceptions import RequestException
from bs4 import BeautifulSoup


def get_one_page(url):
    """Fetch one page of the Top 250 list; return its HTML, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            # Session cookie copied from a logged-in browser; replace it with your own if it has expired.
            'Cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None  # request failed
    except RequestException:
        return None  # network error while crawling


def parse():
    moviesList = []
    for i in range(10):  # 10 pages x 25 movies
        url = 'https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
        response = get_one_page(url)
        if response is None:
            continue
        soup = BeautifulSoup(response, 'lxml')
        # 'ol li' matches every <li> under any <ol>; it works here because the
        # ranking list is the page's ordered list (stricter selector sketched below).
        for getresult in soup.select('ol li'):
            index = getresult.find('div', class_='pic').find('em').get_text()
            name = getresult.find('span', class_='title').get_text()
            # Line 0: "导演: ... 主演: ..."; line 1: "year / country / genre"
            director_actor = getresult.find(class_='bd').p.get_text().strip().split('\n')
            actor_infos1 = director_actor[0].split('\xa0\xa0\xa0')
            movie_director = actor_infos1[0]
            movie_role = actor_infos1[1] if len(actor_infos1) > 1 else ''
            movies_release_date = director_actor[1].strip().split('\xa0/\xa0')[0]
            movies_score = getresult.find(class_='rating_num').get_text()
            movies_score_num = getresult.find(class_='star').get_text().strip().split('\n')[2]
            # A few entries have no one-line quote, so guard against a missing tag.
            movies_introduces = getresult.find(class_='inq')
            movies_introduce = movies_introduces.get_text() if movies_introduces else ''
            moviesList.append([index, name, movie_director, movie_role, movies_release_date,
                               movies_score, movies_score_num, movies_introduce])
        time.sleep(2)  # pause between pages to avoid hammering the site
    return moviesList


def write_to_file(moviesList):
    with open('bs4_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Headers: rank, title, director, starring, release year, rating, rating count, quote
        fieldnames = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
        writer.writerow(fieldnames)
        writer.writerows(moviesList)


if __name__ == '__main__':
    moviesList = parse()
    write_to_file(moviesList)
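As noted in the comment above, select('ol li') matches every <li> under any <ol> on the page. A minimal sketch of the stricter child selector ol.grid_view > li (reusing the grid_view class that the XPath version targets), run on a made-up fragment with a second, unrelated list:

from bs4 import BeautifulSoup

# Hypothetical fragment: only the ranking list carries the grid_view class.
html = '<ol class="grid_view"><li>rank 1</li></ol><ol class="other"><li>stray item</li></ol>'
soup = BeautifulSoup(html, 'lxml')
print([li.get_text() for li in soup.select('ol li')])              # ['rank 1', 'stray item']
print([li.get_text() for li in soup.select('ol.grid_view > li')])  # ['rank 1']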
PyQuery:
import requests
import time
import csv
from requests import RequestException
from pyquery import PyQuery as pq


def get_one_page(url):
    """Fetch one page of the Top 250 list; return its HTML, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            # Session cookie copied from a logged-in browser; replace it with your own if it has expired.
            'Cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None  # request failed
    except RequestException:
        return None  # network error while crawling


def parse():
    moviesList = []
    for i in range(10):  # 10 pages x 25 movies
        url = 'https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
        response = get_one_page(url)
        if response is None:
            continue
        html = pq(response)
        for getresult in html('div.item').items():
            index = getresult.find('.pic em').text()
            name = getresult.find('.hd span.title').text().split('/')[0]
            # First <p> of the info block: line 0 is "导演: ... 主演: ...",
            # line 1 is "year / country / genre".
            info = getresult.find('div.bd p:eq(0)').text().split('\n')
            director_actor = info[0]
            pos = director_actor.find('主演')
            director = director_actor[:pos].strip() if pos != -1 else director_actor
            actor = director_actor[pos:].strip() if pos != -1 else ''
            movies_release_date = info[1].strip()[0:4]
            movies_score = getresult.find('span.rating_num').text()
            movies_score_num = getresult.find('.star span').text().strip().split()[1]
            movies_introduce = getresult.find('span.inq').text()
            moviesList.append([index, name, director, actor, movies_release_date,
                               movies_score, movies_score_num, movies_introduce])
        time.sleep(2)  # pause between pages to avoid hammering the site
    return moviesList


def write_to_file(moviesList):
    with open('pyquery_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Headers: rank, title, director, starring, release year, rating, rating count, quote
        fieldnames = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
        writer.writerow(fieldnames)
        writer.writerows(moviesList)


if __name__ == '__main__':
    moviesList = parse()
    write_to_file(moviesList)
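The two pyquery idioms doing the heavy lifting above are .items(), which yields each match as its own PyQuery object, and the jQuery-style :eq(0) pseudo-class, which keeps only the first matched element. A minimal sketch on a made-up fragment:

from pyquery import PyQuery as pq

# Hypothetical two-item fragment under a single root element.
html = ('<div id="content">'
        '<div class="item"><div class="bd"><p>导演: A</p><p class="quote">q1</p></div></div>'
        '<div class="item"><div class="bd"><p>导演: B</p></div></div>'
        '</div>')
doc = pq(html)
for item in doc('div.item').items():
    # :eq(0) keeps only the first <p> inside this item's .bd block
    print(item.find('div.bd p:eq(0)').text())
# -> 导演: A
# -> 导演: B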
Screenshot of partial results (image omitted).
The complete results CSV can be downloaded from my CSDN resources.