from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import requests
import urllib
from bs4 import BeautifulSoup
import re
import functools
from threading import Thread
def timeout(timeout):
    """Decorator factory: run the wrapped callable in a daemon thread and
    raise an Exception if it has not finished within `timeout` seconds.

    If the callable itself raises, that exception is re-raised in the
    caller's thread; otherwise its return value is passed through.
    """
    def decorate(func):
        @functools.wraps(func)
        def run_with_deadline(*args, **kwargs):
            # Pre-load the result slot with the timeout error; the worker
            # thread overwrites it on completion (value or raised exception).
            outcome = [Exception('function [%s] timeout [%s seconds] exceeded!' % (func.__name__, timeout))]

            def worker():
                try:
                    outcome[0] = func(*args, **kwargs)
                except Exception as exc:
                    outcome[0] = exc

            thread = Thread(target=worker)
            thread.daemon = True  # don't block interpreter exit on a stuck call
            try:
                thread.start()
                thread.join(timeout)
            except Exception as start_err:
                print('error starting thread')
                raise start_err
            if isinstance(outcome[0], BaseException):
                raise outcome[0]
            return outcome[0]
        return run_with_deadline
    return decorate
@timeout(5)
def save_img(count, img_src, picpath):
    """Download one image from `img_src` into `picpath` as bing_924_<count>.jpg.

    Returns early (no download) when the target file already exists, so the
    script can resume an interrupted run. The whole call is bounded by the
    surrounding @timeout(5), and requests itself uses a (3s connect, 7s read)
    timeout.

    Raises requests.HTTPError on a non-2xx response, or the decorator's
    timeout Exception when the 5-second deadline is exceeded.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'
    }
    filename = os.path.join(picpath, 'bing_924_{}.jpg'.format(count))
    if os.path.exists(filename):
        return  # already downloaded in a previous run
    r = requests.get(img_src, headers=headers, timeout=(3, 7))
    # Bug fix: without this, a 404/403 HTML error page was silently
    # written to disk as a .jpg.
    r.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(r.content)  # `with` closes the file; the old explicit close() was redundant
    print(filename, 'saved..')
# --- script configuration -------------------------------------------------
url = 'https://cn.bing.com/images'  # Bing image search entry point
options = webdriver.ChromeOptions()
# Suppress the noisy "DevTools listening ..." logging from chromedriver.
options.add_experimental_option('excludeSwitches', ['enable-logging'])
pos = 0  # scroll offset in pixels, advanced while paging through results
picpath = '../gylj/imgs_924'  # download target directory (created if missing)
os.makedirs(picpath, exist_ok=True)
# Continue numbering from existing files so reruns don't overwrite downloads.
count = len(os.listdir(picpath))
error_count = 0  # NOTE(review): not used anywhere in this file — dead or for a later chunk?
# Search keywords, one browser session each.
# NOTE(review): all entries are empty strings — presumably the real keywords
# were removed; fill these in before running.
keys = ['','','','']
# Main loop: one Chrome session per keyword — search, filter by medium size,
# scroll to trigger lazy loading, then parse image URLs out of the page source.
for key in keys:
    print(f'processing {key} 中....')
    # Bug fix: reset the scroll offset for every fresh results page. It
    # previously carried over between keywords, so the second keyword's first
    # scroll jumped straight to the bottom and skipped the incremental
    # lazy-load scrolling.
    pos = 0
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        browser.maximize_window()
        # NOTE(review): find_element_by_* is the Selenium 3 API, removed in
        # Selenium 4 — migrate to find_element(By.ID, ...) when upgrading.
        browser.find_element_by_id("sb_form_q").clear()
        browser.find_element_by_id("sb_form_q").send_keys(key, Keys.ENTER)
        browser.implicitly_wait(2)
        browser.find_element_by_class_name("fltIdtTit").click()  # open the filter bar
        browser.implicitly_wait(1)
        browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/ul/li[1]/span/span').click()  # open "image size"
        browser.implicitly_wait(1)
        browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/ul/li[1]/div/div/a[3]/span').click()  # pick "medium"
        browser.implicitly_wait(1)
        # Scroll 21 times, 5000px per step, to make the page lazy-load more results.
        for i in range(21):
            print(f'页面下拉一次,一共下拉 {i+1} 次')
            js = 'var q=document.documentElement.scrollTop={}'.format(pos)
            pos += 5000
            browser.execute_script(js)
            time.sleep(2)  # give the newly revealed images time to load
        allhtml = browser.page_source  # capture the fully-expanded page
    finally:
        # Bug fix: always release the browser, even when a selector raises —
        # previously a failed lookup leaked the Chrome process.
        browser.quit()
    bs = BeautifulSoup(allhtml, 'lxml')
    list01 = bs.find_all('a', class_='iusc')  # each result anchor carries image metadata
    # The anchor's m-attribute JSON holds the original image URL under "murl".
    src_list = re.findall(r'\"murl\"\:\"(.*?)\"\,\"', str(list01))
    print(len(src_list))
    for img_url in src_list:
        try:
            save_img(count, img_url, picpath)
            count += 1
        except Exception:
            # Best-effort: skip images that time out or fail to download.
            # (Bug fix: the old bare `except:` also swallowed KeyboardInterrupt.)
            continue
|