在一些实际项目中,经常需要对网页进行截图,这里分享一个通过 scrapy + selenium 实现网页截图的方法。
首先我们需要安装selenium库
pip install selenium
使用 selenium 进行截图时,通常使用的方法是:
get_screenshot_as_file(截图保存路径)
from selenium import webdriver
from time import sleep
# Minimal demo: open a page in Chrome and save a screenshot of it.
driver = webdriver.Chrome()
try:
    driver.get('http://www.baidu.com/')
    # get_screenshot_as_file reports an unwritable target by returning False
    # rather than raising, so check the result instead of discarding it.
    if not driver.get_screenshot_as_file("./截图.png"):
        print("screenshot could not be written")
    sleep(2)
finally:
    # Always shut the browser down, even when navigation or the capture
    # raises, so no orphaned browser/driver process is left behind.
    driver.quit()
那么如何将 scrapy 与 selenium 结合使用呢?首先创建一个爬虫类,然后定义好登录帐号、密码以及各个页面元素的 XPath 路径等变量:
class BjobOrderRecordSpider(scrapy.Spider):
    """Spider configuration: credentials, page XPaths and output locations.

    The attributes are grouped by the page or resource they belong to.
    Attribute names ending in ``_eml`` hold XPath expressions.
    """

    name = 'bjob_record_spider'

    # --- Login page: URL, credentials and the form controls they go into ---
    login_url = 'http://xxx.cn/b2badmin/order/orderList'
    username = 'xxxx'
    password = 'xxx'
    username_input_eml = '//*[@id="__layout"]/div/div[1]/div/div/div/div[2]/div/div[2]/form[1]/div[1]/div/div[1]/input'
    password_input_eml = '//*[@id="__layout"]/div/div[1]/div/div/div/div[2]/div/div[2]/form[1]/div[2]/div/div[1]/input'
    login_eml = '//*[@id="__layout"]/div/div[1]/div/div/div/div[2]/div/div[2]/form[1]/div[3]/div/button'

    # --- Order list page: search input, search controls, detail links ---
    qgdid_eml = '/html/body/div/div/div[3]/section/div/div[1]/div/div[1]/div[1]/form/div[1]/div/div[2]/input'
    seach_eml = '//*[@id="app"]/div/div[3]/section/div/div[1]/div/div[1]/div[2]/button[1]'
    seach_q_eml = '/html/body/div/div/div[3]/section/div/div[2]/section/div[1]/div/div[1]/div/label[1]/span'
    # Two alternative locations of the detail link; the second is a fallback.
    detail_eml = '/html/body/div[1]/div/div[3]/section/div/div[2]/section/div[2]/div[1]/div[3]/table/tbody/tr/td[6]/div/span/span/span/span/div/span'
    detail_eml2 = '/html/body/div[1]/div/div[3]/section/div/div[2]/section/div[2]/div[1]/div[3]/table/tbody/tr/td[6]/div/span/span/span/span/div/div'

    # --- Excel input ---
    # NOTE(review): presumably a workbook path, sheet name and two column
    # headers; confirm against the code that reads them.
    load_execl_data = 'F:\\xxxxx.xlsx'
    sheet1 = '办公用品及低值易耗品'
    img_save_path_name = '文件名'
    img_save_order_sn_name = 'EC单号'

    # --- Output: root directory for screenshots (keeps its trailing backslash) ---
    shot_dir = 'F:\\xxxx\\网页截图\\xxxx\\'
然后再初始化一个 selenium 浏览器对象:
def __init__(self, *args, **kwargs):
    """Open the database pool and start the Selenium browser.

    Args:
        *args: Forwarded unchanged to the parent spider constructor.
        **kwargs: Forwarded to the parent spider constructor, except for the
            optional ``browser_name`` entry, which is consumed here.
            ``'Chrome'`` starts Chrome; any other value (default
            ``'Firefox'``) starts Firefox.
    """
    self.pool3 = self.mysql_connection3()

    # The browser choice used to be a hard-coded local ('Firefox'), which
    # made the Chrome branch unreachable. It is now an optional keyword.
    browser_name = kwargs.pop('browser_name', 'Firefox')

    if browser_name == 'Chrome':
        chrome_options = ChromeOptions()
        chrome_options.add_argument('--no-sandbox')
        # Chrome expects "--window-size=<width>,<height>".
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--start-maximized')
        # No quotes around the value: arguments are not passed through a
        # shell, so quotes would become part of the User-Agent header.
        chrome_options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/80.0.3987.149 Safari/537.36')
        chrome_options.add_argument('--incognito')
        # "options=" replaces the "chrome_options=" keyword removed in
        # Selenium 4.
        self.browser = webdriver.Chrome(options=chrome_options)
    else:
        firefox_options = FirefoxOptions()
        # The Chromium-only switches previously passed here are replaced
        # with Firefox's own window-size and private-browsing arguments.
        firefox_options.add_argument('--width=1920')
        firefox_options.add_argument('--height=1080')
        firefox_options.add_argument('-private')
        # "options=" replaces the "firefox_options=" keyword removed in
        # Selenium 4.
        self.browser = webdriver.Firefox(options=firefox_options)

    self.driver = None
    self.cookies = None
    super(BjobOrderRecordSpider, self).__init__(*args, **kwargs)
之后便是业务代码,如登录、获取订单列表、获取订单详情、截图等模块的封装:
def start_requests(self):
    """Log in through the Selenium browser, then run the screenshot job.

    Returns:
        list: Always empty. All work is done synchronously through the
        browser, but Scrapy iterates the return value, so an iterable
        (rather than ``None``) must be returned.
    """
    # Local import keeps this method self-contained; find_element(By.XPATH, ...)
    # replaces find_element_by_xpath, which was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    self.browser.get(self.login_url)

    # Fill each login field: (XPath of the input, text to type).
    credentials = (
        (self.username_input_eml, self.username),
        (self.password_input_eml, self.password),
    )
    for field_xpath, text in credentials:
        field = self.browser.find_element(By.XPATH, field_xpath)
        field.clear()
        field.send_keys(text)

    self.browser.find_element(By.XPATH, self.login_eml).click()
    # Fixed pause after submitting the login form, before continuing.
    time.sleep(2)

    self.getScreenShot()
    return []
def getScreenShot(self):
    """Search every selected order in the browser and screenshot its detail window.

    For each row returned by the query the method:

    1. skips rows whose ``win`` column is not ``"xx"``;
    2. makes sure the output directory exists and skips the row when the
       first screenshot file is already on disk;
    3. types the order id into the search box, runs the search and clicks
       the detail link (trying the fallback XPath if the first is absent);
    4. if at least two windows are open afterwards, screenshots the extra
       window(s) and switches back to the original window.
    """
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # The statement previously ended with a stray single quote, which is a
    # syntax error for the database. "or ()" tolerates an empty/None result.
    bjob_order_list = self.select_all3("select * from xxxxx", ()) or ()

    for i_num, loadd in enumerate(bjob_order_list, 1):
        print("截图进度:", i_num)
        typeState = str(loadd["type"])
        company_name = loadd["company_name"]
        qgdId_e = loadd["sc_order"]
        win = loadd["win"]
        if win != "xx":
            print("非xx %s" % qgdId_e)
            continue
        order_sn = loadd["img_name"]

        # Build the first screenshot path exactly the way shotToFile does
        # (same format, same tab/newline stripping) so that the "already
        # done" check looks at the file that is actually written.
        first_shot = '%s%s\\%s\\%s-1_01.png' % (
            self.shot_dir, company_name, typeState, order_sn)
        first_shot = first_shot.replace('\t', '').replace('\n', '').strip()
        # Everything before the last backslash is the directory of that file.
        target_dir = first_shot.rsplit('\\', 1)[0]

        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        if os.path.exists(first_shot):
            print("文件已经存在", first_shot)
            continue

        time.sleep(1)
        qgdId = self.browser.find_element(By.XPATH, self.qgdid_eml)
        qgdId.clear()
        qgdId.send_keys(str(qgdId_e).strip())
        self.clickElement(self.seach_eml)
        time.sleep(3)
        self.clickElement(self.seach_q_eml)
        time.sleep(1)

        # Click whichever detail link is present; skip the order if neither is.
        if self.isElementPresent(self.detail_eml):
            self.clickElement(self.detail_eml)
        elif self.isElementPresent(self.detail_eml2):
            self.clickElement(self.detail_eml2)
        else:
            continue
        time.sleep(3)

        # Remember the current window so it can be told apart from the others
        # and re-selected once the screenshots are taken.
        windows = self.browser.current_window_handle
        time.sleep(2)
        all_handles = self.browser.window_handles
        if len(all_handles) >= 2:
            self.screenshotByhandles(all_handles, windows, order_sn, company_name, typeState)
            self.browser.switch_to.window(windows)
def screenshotByhandles(self, all_handles, windows, order_sn, company_name, typeState, window_height=2048):
    """Screenshot both tabs of every window other than ``windows``.

    Args:
        all_handles: Window handles to walk through.
        windows: Handle of the window to leave alone.
        order_sn: Order identifier used in the screenshot file name.
        company_name: Passed through to ``shotToFile`` for the output path.
        typeState: Passed through to ``shotToFile`` for the output path.
        window_height: Height the window is resized to before capturing.
            Defaults to 2048, the value that used to be hard-coded.
    """
    shot_no = 0
    for handle in all_handles:
        if handle == windows:
            continue
        # Running number of the extra window; becomes part of the file name.
        shot_no += 1
        label = str(shot_no)

        self.browser.switch_to.window(handle)
        # Only the width is queried: the page height used to be fetched as
        # well but was never used, costing a WebDriver round trip per window.
        width = self.browser.execute_script("return document.documentElement.scrollWidth")
        time.sleep(1)
        self.browser.set_window_size(width, window_height)

        self.clickElement('//*[@id="tab-first"]')
        self.shotToFile(order_sn, label, company_name, typeState, 1)
        self.clickElement('//*[@id="tab-second"]')
        # shotToFile closes the current window when called with type 2.
        self.shotToFile(order_sn, label, company_name, typeState, 2)
def shotToFile(self, order_sn, s, company_name, typeState, type=1, max_retries=3, retry_delay=2):
    """Save a screenshot of the current window, retrying a bounded number of times.

    The file is written to
    ``<shot_dir><company_name>\\<typeState>\\<order_sn>-<s>_0<type>.png``
    after tabs and newlines are removed from the path and surrounding
    whitespace is stripped.

    Args:
        order_sn: Order identifier used in the file name.
        s: Window number used in the file name.
        company_name: First sub-directory below ``shot_dir``.
        typeState: Second sub-directory below ``shot_dir``.
        type: Tab number used in the file name; when it equals 2 the current
            window is closed afterwards. (The name shadows the builtin but
            is kept because callers may pass it by keyword.)
        max_retries: Maximum number of capture attempts.
        retry_delay: Seconds to wait between attempts.
    """
    pathscreenshot = '%s%s\\%s\\%s-%s_0%s.png' % (
        self.shot_dir, company_name, typeState, order_sn, s, str(type))
    pathscreenshot = pathscreenshot.replace('\t', '').replace('\n', '').strip()

    # A loop replaces the former recursive retry, which passed its arguments
    # in the wrong order (type, company_name, typeState) and recursed without
    # limit when the failure persisted.
    saved = False
    for attempt in range(1, max_retries + 1):
        try:
            # get_screenshot_as_file signals a write error by returning
            # False instead of raising, so treat False as a failed attempt.
            saved = self.browser.get_screenshot_as_file(pathscreenshot) is not False
        except Exception as e:
            print(e)
        if saved:
            break
        if attempt < max_retries:
            time.sleep(retry_delay)

    if not saved:
        print("screenshot not saved after %s attempts: %s" % (max_retries, pathscreenshot))

    if type == 2:
        # Close the window even if the capture failed; a window left open
        # would still be listed among the handles for the next order.
        try:
            self.browser.close()
        except Exception as e:
            print(e)
def isElementPresent(self, by):
    """Report whether an element matching the XPath ``by`` is in the current page.

    Args:
        by: XPath expression to look up.

    Returns:
        bool: ``True`` if at least one element matches, ``False`` otherwise.
    """
    # find_elements(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3. It returns an empty list when nothing matches,
    # so no exception handling is needed for the "absent" case.
    from selenium.webdriver.common.by import By

    return len(self.browser.find_elements(By.XPATH, by)) > 0
def loadExeclData(self, path, sheetname="Sheet1"):
    """Read data from one sheet of an Excel workbook through ``ExcelData``.

    Args:
        path: Passed to ``ExcelData`` as its first argument.
        sheetname: Passed to ``ExcelData`` as its second argument;
            defaults to ``"Sheet1"``.

    Returns:
        Whatever ``ExcelData.readExcel(1)`` returns.
    """
    # NOTE(review): the meaning of the literal 0 and 1 is defined by
    # ExcelData, which is not visible here; confirm before changing them.
    reader = ExcelData(path, sheetname, 0)
    return reader.readExcel(1)
|