简述
代码直接借助python的selenium库来控制浏览器。与requests和bs4相比,selenium允许你用高级得多的方法实现网页交互,但是因为它会启动Web浏览器,下载文件会比较慢,并且难以在后台运行。如果你不是想通过此程序来练习,那么可以直接跳过这篇文章。
思路上,先在父页面(列表页)获取10部电影或者电视剧的链接(link),然后逐个跳转到对应的详情页,进行信息爬取。
import os
import re
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
# Douban entry pages. URL and URLseries are not referenced by the code
# visible in this file; only URLmovie is opened by the main script.
URL = "https://www.douban.com/"
URLmovie = 'https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0'
URLseries = ''

# Location of the local chromedriver binary (machine-specific; adjust it).
webdri_path = r'C:\Users\86151\AppData\Local\Google\Chrome\Application\chromedriver.exe'

# Number of works saved during the current openURL() pass. openURL() resets
# it and getInfo() increments it; initialising it here keeps getInfo() from
# raising NameError when it is called on its own.
flag = 0

# Create the output tree ./douban/{movie,series}. exist_ok=True lets the
# script be re-run without crashing on directories left by a previous run.
os.makedirs(os.path.join(".", "douban", "movie"), exist_ok=True)
os.makedirs(os.path.join(".", "douban", "series"), exist_ok=True)
def getInfo(titleOfWork, aim):
    """Save the details of the work shown in the browser's current tab.

    Reads the detail page currently loaded in the module-level ``driver``.
    Works whose release year is earlier than 2018, or whose year text holds
    no digits, are skipped. Otherwise the directory
    ``./douban/<aim>/<year><title>/`` receives ``pic.png`` (a screenshot of
    the poster element) and ``info.txt``, and the module-level ``flag``
    counter is incremented by one.

    Args:
        titleOfWork: Title of the work; written to ``info.txt`` and, with
            non-word characters removed, used in the directory name.
        aim: Sub-directory of ``./douban`` to save into (the callers in this
            file pass ``"movie"`` or ``"series"``).
    """
    global flag

    releaseTime = driver.find_element(By.CLASS_NAME, 'year')
    year = re.sub(r"\D", "", releaseTime.text)
    # An empty string would make int() raise; treat it like "too old".
    if not year or int(year) < 2018:
        return

    fileName = year + re.sub(r"\W", "", titleOfWork)
    print(fileName)
    workDir = os.path.join(".", "douban", aim, fileName)
    # exist_ok: a directory left over from an earlier run must not abort
    # the whole scrape.
    os.makedirs(workDir, exist_ok=True)

    poster = driver.find_element(By.XPATH, '//*[@id="mainpic"]/a/img')
    poster.screenshot(os.path.join(workDir, "pic.png"))

    # Read every field before opening the file so that a missing element
    # does not leave a half-written info.txt behind.
    director = driver.find_element(By.CLASS_NAME, 'attrs').text
    starring = driver.find_element(By.CLASS_NAME, 'actor').text
    introduction = driver.find_element(By.CLASS_NAME, 'related-info').text
    pageUrl = driver.current_url

    # Explicit UTF-8: the text is Chinese and the platform default encoding
    # may not be able to represent every character. Mode "w" keeps a re-run
    # from appending a duplicate record to an existing file.
    infoPath = os.path.join(workDir, "info.txt")
    with open(infoPath, "w", encoding="utf-8") as movieFile:
        movieFile.write(titleOfWork + "\n")
        movieFile.write("上映日期:" + year + "\n")
        movieFile.write("导演:" + director + "\n")
        movieFile.write(starring + "\n")
        movieFile.write(introduction + "\n")
        movieFile.write("链接:" + pageUrl)

    flag += 1
def openURL(aim, Links):
    """Visit the well-rated works in ``Links`` and save up to ten of them.

    Resets the module-level ``flag`` counter to 0, then walks ``Links``. The
    last three characters of each link's text are parsed as its rating;
    links without a parsable rating, or rated below 7, are skipped. Every
    remaining link is opened in a new tab, handed to ``getInfo`` and the tab
    is closed again. The loop stops once ``flag`` reaches 10.

    Args:
        aim: Sub-directory name forwarded to ``getInfo``.
        Links: Iterable of link elements exposing ``.text`` and
            ``.get_attribute("href")``.
    """
    global flag
    flag = 0

    for link in Links:
        titleOfWork = link.text
        ratingText = titleOfWork[-3:]
        try:
            rating = float(ratingText) if "." in ratingText else None
        except ValueError:
            # Contains a dot but is not a number (e.g. part of the title).
            rating = None
        if rating is None or rating < 7:
            continue

        # Pass the URL as a script argument instead of formatting it into
        # the JavaScript source, so quotes in the href cannot break it.
        href = link.get_attribute("href")
        driver.execute_script("window.open(arguments[0], '_blank');", href)
        driver.switch_to.window(driver.window_handles[-1])
        try:
            getInfo(titleOfWork[:-3].strip(), aim)
        finally:
            # Always close the detail tab and return to the list tab, even
            # when getInfo fails, so the browser state stays consistent.
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

        if flag >= 10:
            break
# Main script: open the movie explore page, scrape the listed movies, then
# follow a navigation link and scrape the items listed there as series.
driver = None
try:
    # NOTE(review): passing the driver path positionally is the older
    # Selenium calling convention - confirm against the installed version.
    driver = webdriver.Chrome(webdri_path)
    driver.get(URLmovie)
    driver.implicitly_wait(3)
    driver.maximize_window()
    sleep(1)
    # Template for opening a URL in a new tab; read as a global by openURL().
    js = "window.open('{}','_blank');"
    # Presumably these two clicks pick a sort option and a tag on the
    # explore page - verify against the live page layout.
    driver.find_element(By.XPATH, '//*[@id="content"]/div/div[1]/div/div[2]/div[1]/form/div[3]/div[1]/label[2]/input').click()
    driver.find_elements(By.CSS_SELECTOR, '.tag-list label')[4].click()
    movieLinks = driver.find_elements(By.CLASS_NAME, "item")
    openURL("movie", movieLinks)
    # Presumably the third navigation entry leads to the TV-series listing.
    driver.find_element(By.XPATH, '//*[@id="db-nav-movie"]/div[2]/div/ul/li[3]/a').click()
    seriesLinks = driver.find_elements(By.CLASS_NAME, "item")
    openURL("series", seriesLinks)
except Exception as e:
    print(e)
finally:
    # Quit on failure as well, so a crashed run does not leave a browser
    # (and chromedriver process) behind.
    if driver is not None:
        driver.quit()
常见问题
- 如果不能运行,请先检查是否已安装webdriver。我用的是谷歌浏览器,所以使用的是谷歌的webdriver(chromedriver),请安装与你的浏览器及其版本相对应的webdriver。
- 有可能会跳转到一个要求你登录的页面,这是由于程序刚开始打开的页面并不是真正的豆瓣主页面。可以试着更换网络,或者增加一步跳转。
|