I've spent a whole day trying to wrap a multi-threaded crawler into functions and call them, but it never produces any output, while the step-by-step (unwrapped) crawler scrapes fine, just slowly. Could someone explain how defining and calling crawler functions is actually supposed to work, and what the idea behind it is?
The code is below. The step-by-step part before the function definitions can scrape (that is the only approach I really know); the function-wrapped crawler has never worked for me.
from multiprocessing.dummy import Pool
import requests
from lxml import etree
import PySimpleGUI as sg
import time
import random
# path = str(sg.popup_get_folder('Choose a folder to save to'))
rooturl='https://wallroom.io'
headers = {
    'authority': 'wallroom.io',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
start_time=time.time()
# Site home page
response=requests.get(rooturl,headers=headers)
response.encoding='utf-8'
parsel=etree.HTML(response.text)
pagelist = parsel.xpath('//div[@class="image-list"]/div/a/@href')
for page in pagelist:
    url = rooturl + page

# Image detail page
# This step is slow and blocking, so I added multithreading here
# Problem: nothing gets scraped at this step; the function does not seem to receive the url built above
def get_detail(url):
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    parsel = etree.HTML(response.text)
    imgurl = parsel.xpath('/html/body/nav/div[3]/a/@href')
    name = parsel.xpath('/html/body/nav/div[3]/span/b[1]/text()')
    downloadurl = [rooturl + img for img in imgurl]
    dic = {
        'name': name,
        'url': downloadurl
    }
    print(dic)
# Download-image step
# Problem: nothing gets scraped here either; this function does not seem to receive the dic built above
def save_img(dic):
    url = dic['url']
    name = dic['name']
    print('Downloading -----', name)
    data = requests.get(url, headers=headers).content
    filepath = 'C:/Users/DWX/Desktop/111/' + str(name) + '.jpeg'
    with open(filepath, 'wb') as 变量名:
        变量名.write(data)
    print('Downloaded -----', name, '----- done')
pool = Pool(6)
# Note: at this point url is a single string (the last detail-page URL from the loop), so imap
# calls get_detail once per character of that string rather than once per page, and save_img
# is never called anywhere.
pool.imap(get_detail, url)
pool.close()
pool.join()
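
For comparison, here is a minimal sketch of how this kind of crawler is usually restructured so the thread pool actually has something to map over: build the full list of detail-page URLs first, define the per-URL function at top level so it returns its result, and then iterate pool.imap over that list. The XPath expressions, headers, and save path are copied from the question and assumed to still match the site; the urls list, the guards around empty XPath results, and the main-block layout are my additions, not the original author's code.

from multiprocessing.dummy import Pool  # thread pool with the multiprocessing.Pool API
import requests
from lxml import etree

rooturl = 'https://wallroom.io'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}

def get_detail(url):
    """Scrape one detail page and return its name and image download URL."""
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    parsel = etree.HTML(response.text)
    imgurl = parsel.xpath('/html/body/nav/div[3]/a/@href')        # XPath copied from the question
    name = parsel.xpath('/html/body/nav/div[3]/span/b[1]/text()')
    return {
        'name': name[0] if name else 'unnamed',
        'url': rooturl + imgurl[0] if imgurl else None,
    }

def save_img(dic):
    """Download one image described by the dict returned from get_detail."""
    if not dic['url']:
        return
    print('Downloading -----', dic['name'])
    data = requests.get(dic['url'], headers=headers).content
    filepath = 'C:/Users/DWX/Desktop/111/' + str(dic['name']) + '.jpeg'  # save path from the question
    with open(filepath, 'wb') as f:
        f.write(data)
    print('Downloaded -----', dic['name'], '----- done')

if __name__ == '__main__':
    # Step 1: collect ALL detail-page URLs into a list (single request, cheap)
    response = requests.get(rooturl, headers=headers)
    response.encoding = 'utf-8'
    parsel = etree.HTML(response.text)
    pagelist = parsel.xpath('//div[@class="image-list"]/div/a/@href')
    urls = [rooturl + page for page in pagelist]

    # Step 2: map the slow, blocking per-URL work over that list with a thread pool.
    # imap needs an iterable of URLs; passing a single string would iterate its characters.
    pool = Pool(6)
    for dic in pool.imap(get_detail, urls):
        save_img(dic)  # consume results as they arrive
    pool.close()
    pool.join()

The important differences from the question's version: imap receives a list of URLs rather than one string, get_detail returns its data instead of only printing it, and the imap iterator is actually consumed, so any exception raised inside a worker thread surfaces immediately instead of disappearing silently.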