import logging
import time
from scrapy.http import HtmlResponse
from scrapy.utils.python import global_object_name
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from gerapy_selenium.pretend import SCRIPTS as PRETEND_SCRIPTS
from gerapy_selenium.settings import *
import urllib.parse
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support import expected_conditions as EC
from twisted.internet.threads import deferToThread
from xvfbwrapper import Xvfb
from gerapy_selenium.downloadermiddlewares import SeleniumMiddleware
logger = logging.getLogger('gerapy.selenium')
class MyMiddleware(SeleniumMiddleware):
"""
Downloader middleware handling the requests with Selenium
"""
def _retry(self, request, reason, spider):
"""
get retry request
:param request:
:param reason:
:param spider:
:return:
"""
if not self.retry_enabled:
return
retries = request.meta.get('retry_times', 0) + 1
retry_times = self.max_retry_times
if 'max_retry_times' in request.meta:
retry_times = request.meta['max_retry_times']
stats = spider.crawler.stats
if retries <= retry_times:
logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
{'request': request, 'retries': retries, 'reason': reason},
extra={'spider': spider})
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
retryreq.priority = request.priority + self.priority_adjust
if isinstance(reason, Exception):
reason = global_object_name(reason.__class__)
stats.inc_value('retry/count')
stats.inc_value('retry/reason_count/%s' % reason)
return retryreq
else:
stats.inc_value('retry/max_reached')
logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
{'request': request, 'retries': retries, 'reason': reason},
extra={'spider': spider})
@classmethod
def from_crawler(cls, crawler):
"""
init the middleware
:param crawler:
:return:
"""
settings = crawler.settings
logging_level = settings.get('GERAPY_SELENIUM_LOGGING_LEVEL', GERAPY_SELENIUM_LOGGING_LEVEL)
logging.getLogger('selenium.webdriver.remote.remote_connection').setLevel(logging_level)
logging.getLogger('urllib3.connectionpool').setLevel(logging_level)
cls.window_width = settings.get('GERAPY_SELENIUM_WINDOW_WIDTH', GERAPY_SELENIUM_WINDOW_WIDTH)
cls.window_height = settings.get('GERAPY_SELENIUM_WINDOW_HEIGHT', GERAPY_SELENIUM_WINDOW_HEIGHT)
cls.headless = settings.get('GERAPY_SELENIUM_HEADLESS', GERAPY_SELENIUM_HEADLESS)
cls.ignore_https_errors = settings.get('GERAPY_SELENIUM_IGNORE_HTTPS_ERRORS',
GERAPY_SELENIUM_IGNORE_HTTPS_ERRORS)
cls.executable_path = settings.get('GERAPY_SELENIUM_EXECUTABLE_PATH', GERAPY_SELENIUM_EXECUTABLE_PATH)
cls.disable_extensions = settings.get('GERAPY_SELENIUM_DISABLE_EXTENSIONS',
GERAPY_SELENIUM_DISABLE_EXTENSIONS)
cls.hide_scrollbars = settings.get('GERAPY_SELENIUM_HIDE_SCROLLBARS', GERAPY_SELENIUM_HIDE_SCROLLBARS)
cls.mute_audio = settings.get('GERAPY_SELENIUM_MUTE_AUDIO', GERAPY_SELENIUM_MUTE_AUDIO)
cls.no_sandbox = settings.get('GERAPY_SELENIUM_NO_SANDBOX', GERAPY_SELENIUM_NO_SANDBOX)
cls.disable_setuid_sandbox = settings.get('GERAPY_SELENIUM_DISABLE_SETUID_SANDBOX',
GERAPY_SELENIUM_DISABLE_SETUID_SANDBOX)
cls.disable_gpu = settings.get('GERAPY_SELENIUM_DISABLE_GPU', GERAPY_SELENIUM_DISABLE_GPU)
cls.download_timeout = settings.get('GERAPY_SELENIUM_DOWNLOAD_TIMEOUT',
settings.get('DOWNLOAD_TIMEOUT', GERAPY_SELENIUM_DOWNLOAD_TIMEOUT))
cls.screenshot = settings.get('GERAPY_SELENIUM_SCREENSHOT', GERAPY_SELENIUM_SCREENSHOT)
cls.pretend = settings.get('GERAPY_SELENIUM_PRETEND', GERAPY_SELENIUM_PRETEND)
cls.sleep = settings.get('GERAPY_SELENIUM_SLEEP', GERAPY_SELENIUM_SLEEP)
cls.retry_enabled = settings.getbool('RETRY_ENABLED')
cls.max_retry_times = settings.getint('RETRY_TIMES')
cls.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
cls.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
return cls()
def _process_request(self, request, spider):
"""
        use Selenium to render the request and build an HtmlResponse
:param request:
:param spider:
:return:
"""
v_display = Xvfb(width=self.window_width, height=self.window_height)
v_display.start()
kwargs = {}
options = ChromeOptions()
kwargs['options'] = options
if self.headless:
options.add_argument('--headless')
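        # pretend mode: drop the switches that advertise the browser as automated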
if self.pretend:
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
if self.executable_path:
kwargs['executable_path'] = self.executable_path
if self.window_width and self.window_height:
options.add_argument(f'--window-size={self.window_width},{self.window_height}')
if self.disable_gpu:
options.add_argument('--disable-gpu')
if self.hide_scrollbars:
options.add_argument('--hide-scrollbars')
if self.ignore_https_errors:
options.add_argument('--ignore-certificate-errors')
if self.disable_extensions:
options.add_argument('--disable-extensions')
if self.mute_audio:
options.add_argument('--mute-audio')
if self.no_sandbox:
options.add_argument('--no-sandbox')
if self.disable_setuid_sandbox:
options.add_argument('--disable-setuid-sandbox')
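        # per-request overrides supplied through request.meta['selenium']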
selenium_meta = request.meta.get('selenium') or {}
logger.debug('selenium_meta %s', selenium_meta)
_proxy = request.meta.get('proxy')
if selenium_meta.get('proxy') is not None:
_proxy = selenium_meta.get('proxy')
if _proxy:
options.add_argument('--proxy-server=' + _proxy)
browser = webdriver.Chrome(**kwargs)
browser.set_window_size(self.window_width, self.window_height)
_pretend = self.pretend
if selenium_meta.get('pretend') is not None:
_pretend = selenium_meta.get('pretend')
if _pretend:
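            # inject the stealth scripts over CDP so they run before any page script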
for script in PRETEND_SCRIPTS:
browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': script
})
_timeout = self.download_timeout
if selenium_meta.get('timeout') is not None:
_timeout = selenium_meta.get('timeout')
browser.set_page_load_timeout(_timeout)
try:
browser.get(request.url)
browser.implicitly_wait(10)
        except TimeoutException:
            # quit() (not just close()) so the driver process dies, and stop the virtual display
            browser.quit()
            v_display.stop()
            return self._retry(request, 504, spider)
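        # copy cookies from the Scrapy request into the browser session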
parse_result = urllib.parse.urlsplit(request.url)
domain = parse_result.hostname
        _cookies = []
        if isinstance(request.cookies, dict):
            _cookies = [{'name': k, 'value': v, 'domain': domain}
                        for k, v in request.cookies.items()]
        else:
            # request.cookies is already a list of cookie dicts; fill in a missing domain
            _cookies = request.cookies or []
            for _cookie in _cookies:
                if isinstance(_cookie, dict) and 'domain' not in _cookie.keys():
                    _cookie['domain'] = domain
for _cookie in _cookies:
browser.add_cookie(_cookie)
if _cookies:
browser.refresh()
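        # optionally block until a CSS selector is present before reading the page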
if selenium_meta.get('wait_for'):
_wait_for = selenium_meta.get('wait_for')
try:
logger.debug('waiting for %s', _wait_for)
WebDriverWait(browser, _timeout).until(
EC.presence_of_element_located((By.CSS_SELECTOR, _wait_for))
)
            except TimeoutException:
                logger.error('error waiting for %s of %s', _wait_for, request.url)
                browser.quit()
                v_display.stop()
                return self._retry(request, 504, spider)
if selenium_meta.get('script'):
_script = selenium_meta.get('script')
logger.debug('evaluating %s', _script)
            browser.execute_script(_script)
_sleep = self.sleep
if selenium_meta.get('sleep') is not None:
_sleep = selenium_meta.get('sleep')
if _sleep is not None:
logger.debug('sleep for %ss', _sleep)
time.sleep(_sleep)
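        # grab the fully rendered DOM as the response body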
body = browser.page_source
_screenshot = self.screenshot
if selenium_meta.get('screenshot') is not None:
_screenshot = selenium_meta.get('screenshot')
screenshot_result = None
        if _screenshot:
screenshot_result = browser.get_screenshot_as_png()
        logger.debug('close selenium')
        # quit() terminates the driver process; close() would only close the window
        browser.quit()
v_display.stop()
response = HtmlResponse(
request.url,
status=200,
body=body,
encoding='utf-8',
request=request
)
if screenshot_result:
response.meta['screenshot'] = screenshot_result
return response
def process_request(self, request, spider):
"""
        process request using Selenium in a worker thread via deferToThread
:param request:
:param spider:
:return:
"""
logger.debug('processing request %s', request)
return deferToThread(self._process_request, request, spider)
def _spider_closed(self):
pass
def spider_closed(self):
"""
callback when spider closed
:return:
"""
return deferToThread(self._spider_closed)
The main change is adding xvfbwrapper: Xvfb is started when the browser is launched and stopped again when the browser is closed, so a non-headless browser can run on a machine without a display.
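For reference, a minimal sketch of how this middleware could be enabled in a Scrapy project; the module path myproject.middlewares and the priority value are assumptions, not part of the code above:

# settings.py -- minimal sketch; module path and priority are assumptions
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyMiddleware': 543,
}
# with Xvfb wrapping the browser, headless mode can even be turned off
GERAPY_SELENIUM_HEADLESS = False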