1. Basic Environment
1.1 Versions
Windows 10 + PyCharm 2019.3 + Python 3.7
1.2 Dependencies
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from selenium.webdriver.common.keys import Keys
import lxml
from lxml import etree
import os
import urllib.request
import json
import uuid
import random
from selenium.common.exceptions import NoSuchElementException
from fpdf import FPDF
from PIL import Image
1.3 Browser driver
Selenium drives a real browser, so you first need to download the matching browser driver. Any browser you like will do, but Chrome is the usual recommendation and is what I use here.
Download address: http://npm.taobao.org/mirrors/chromedriver/
Note: download the driver version that matches your installed browser version.
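To double-check that the driver you downloaded actually matches the installed Chrome, you can start a throwaway session and compare the two version strings (a minimal sketch; the chromedriver path is the one used later in this article, so adjust it to wherever you unpacked your own driver):
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r'D:\Google\Chrome\Application\chromedriver.exe')
caps = driver.capabilities
# browser version (the key name depends on the Selenium/Chrome combination)
print(caps.get('browserVersion', caps.get('version')))
# chromedriver version
print(caps['chrome']['chromedriverVersion'])
driver.quit()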
2. Page Analysis
Target page: http://dxs.moe.gov.cn/zx/hd/sxjm/sxjmlw/2020qgdxssxjmjslwzs/
Opening it, we can see the list of papers we want to crawl. Clicking into an entry, however, we find that the site presents each paper as a sequence of images rather than as a downloadable file. This is presumably an anti-crawler measure: even if we scrape the images, their readability drops sharply. No matter. Following the crawler principle that whatever can be seen can be crawled, we can still download the images and then use the third-party libraries PIL and FPDF to merge them into a PDF for comfortable reading. With the analysis done, let's move on to the code.
3. Code Walkthrough
3.1 Anti-anti-crawler setup
option = ChromeOptions()
option.add_argument('--headless')  # run Chrome without opening a visible window
option.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the "controlled by automated test software" banner
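A related option is often added alongside excludeSwitches; this is a hedged extra, not something the original script relies on:
# Optional: also stop Chrome from loading its automation extension
option.add_experimental_option('useAutomationExtension', False)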
3.2 Create the browser object
browser = webdriver.Chrome(options=option, executable_path=r'D:\Google\Chrome\Application\chromedriver.exe')  # point executable_path at your own chromedriver
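One caveat: executable_path works with the Selenium 3.x line this article was written against, but it is deprecated in Selenium 4. If you are on Selenium 4, the equivalent construction (a sketch, with the same driver-path assumption as above) looks like this:
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: the driver path is passed through a Service object
service = Service(r'D:\Google\Chrome\Application\chromedriver.exe')
browser = webdriver.Chrome(service=service, options=option)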
3.3 Collect the article URLs
url = "http://dxs.moe.gov.cn/zx/hd/sxjm/sxjmlw/2020qgdxssxjmjslwzs/"
browser.get(url)
Urls = []
try:
    # each paper entry is a link inside an element with class "three-tit"
    Class_urls = browser.find_elements_by_xpath('//*[@class="three-tit"]/h4/a')
    for temp_urls in Class_urls:
        Urls.append(temp_urls.get_attribute("href"))
except NoSuchElementException:
    print('Not found')
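Two remarks on this snippet. First, find_elements_by_xpath never raises NoSuchElementException; it simply returns an empty list when nothing matches, so the except branch is only a harmless safety net. Second, if the list occasionally comes back empty because the page has not finished rendering, the WebDriverWait and expected_conditions imports listed earlier can be used for an explicit wait (a hedged sketch, not part of the original script):
from selenium.webdriver.common.by import By

# wait up to 10 seconds for the article links to be present before collecting them
links = WebDriverWait(browser, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, '//*[@class="three-tit"]/h4/a'))
)
Urls = [a.get_attribute("href") for a in links]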
3.4 Collect the image URLs
for Article_url in Urls:
    Img_urls = []
    browser.get(Article_url)
    # the article title sits at a fixed position in the page layout
    Article_title = browser.find_element_by_xpath('/html/body/div[1]/div[3]/div[1]/div[2]').get_attribute("textContent")
    print(Article_title)
    try:
        # each page of the paper is an <img> inside the "imgslide-wra" container
        article_urls = browser.find_elements_by_xpath('//*[@class="imgslide-wra"]/img')
        for temp_urls in article_urls:
            Img_urls.append(temp_urls.get_attribute("src"))
    except NoSuchElementException:
        print('Not found')
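The scraped title later becomes part of a Windows file name, and a title containing characters such as \ / : * ? " < > | would make the download fail. A small hedged helper to strip them (safe_name is my own name, not part of the original script):
import re

def safe_name(title):
    # replace characters that Windows does not allow in file names
    return re.sub(r'[\\/:*?"<>|]', '_', title.strip())

Wherever the code below uses Article_title.strip(), safe_name(Article_title) can be substituted.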
3.5 Download the images
def download(url, name, path):
    # create the target directory on first use
    if not os.path.exists(path):
        os.makedirs(path)
    urllib.request.urlretrieve(url, path + name + '.jpg')
    print("%s downloaded" % name)
3.6 Merge the images into a PDF
def makePdf(pdfName, title, i):
    # use the last downloaded image to determine the page size
    cover = Image.open(path + title.strip() + str(i) + '.jpg')
    width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    # add every page image to the PDF in order
    for j in range(1, i + 1):
        pdf.add_page()
        pdf.image(path + title.strip() + str(j) + '.jpg', 0, 0)
    pdf.output(pdfName, "F")
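makePdf sizes every page from the last image, which is fine as long as all pages of a paper share the same dimensions. As an alternative (a sketch, not what this article uses), Pillow alone can write a multi-page PDF, reusing the same path, title and i variables:
pages = [Image.open(path + title.strip() + str(j) + '.jpg').convert('RGB')
         for j in range(1, i + 1)]
# save_all + append_images packs all page images into one PDF file
pages[0].save(pdfName, save_all=True, append_images=pages[1:])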
4. Results
5. Complete Code
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from selenium.webdriver.common.keys import Keys
import lxml
from lxml import etree
import os
import urllib.request
import json
import uuid
import random
from selenium.common.exceptions import NoSuchElementException
from fpdf import FPDF
from PIL import Image
path = 'D:/Desktop/国赛优秀论文/图片/'  # where the downloaded page images are stored
Origin_url = 'http://dxs.moe.gov.cn/'  # site root
def del_file(path):
    # recursively empty a directory (handy for clearing old images between runs)
    ls = os.listdir(path)
    for i in ls:
        c_path = os.path.join(path, i)
        if os.path.isdir(c_path):
            del_file(c_path)
        else:
            os.remove(c_path)
def makePdf(pdfName, title, i):
    cover = Image.open(path + title.strip() + str(i) + '.jpg')
    width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for j in range(1, i + 1):
        pdf.add_page()
        pdf.image(path + title.strip() + str(j) + '.jpg', 0, 0)
    pdf.output(pdfName, "F")
def download(url, name, path):
    if not os.path.exists(path):
        os.makedirs(path)
    urllib.request.urlretrieve(url, path + name + '.jpg')
    print("%s downloaded" % name)
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
browser = webdriver.Chrome(options=option, executable_path=r'D:\Google\Chrome\Application\chromedriver.exe')
url = "http://dxs.moe.gov.cn/zx/hd/sxjm/sxjmlw/2020qgdxssxjmjslwzs/"
browser.get(url)
Urls = []
try:
    Class_urls = browser.find_elements_by_xpath('//*[@class="three-tit"]/h4/a')
    for temp_urls in Class_urls:
        Urls.append(temp_urls.get_attribute("href"))
except NoSuchElementException:
    print('Not found')
for Article_url in Urls:
    Img_urls = []
    browser.get(Article_url)
    Article_title = browser.find_element_by_xpath('/html/body/div[1]/div[3]/div[1]/div[2]').get_attribute("textContent")
    print(Article_title)
    try:
        article_urls = browser.find_elements_by_xpath('//*[@class="imgslide-wra"]/img')
        for temp_urls in article_urls:
            Img_urls.append(temp_urls.get_attribute("src"))
    except NoSuchElementException:
        print('Not found')
    i = 0
    for src_url in Img_urls:
        i += 1
        download(src_url, Article_title.strip() + str(i), path)
    makePdf("D:/Desktop/国赛优秀论文/" + Article_title.strip() + '.pdf', Article_title.strip(), i)
    print(Article_title.strip() + " downloaded successfully")