Python爬取表情包
任务:爬取某斗图网站的 223 页表情包。
一、获取图片URL
# Sample URL of the first listing page; the number before ".html" is the
# 1-based page index.
url = "https://aidotu.com/search/0-0-0-1.html"
def getURLs(pages: int = 223, base_url: str = "https://aidotu.com/search/0-0-0-") -> list:
    """Build the URLs of the listing pages to crawl.

    Args:
        pages: Number of listing pages to generate. Defaults to 223, the
            page count this script was written for. Values below 1 yield
            an empty list.
        base_url: URL prefix placed directly before the page number.

    Returns:
        A list of ``pages`` strings of the form
        ``<base_url><page>.html`` with ``page`` running from 1 to
        ``pages`` inclusive.
    """
    # Page numbers on the site are 1-based, hence range(1, pages + 1).
    return [base_url + str(page) + ".html" for page in range(1, pages + 1)]
二、发送请求获取图片数据并写入文件
import requests
from fake_useragent import UserAgent
from parsel import Selector
def download(url, save_dir="/Users/yanzhuang/Downloads/img/", timeout=10):
    """Download every image found on one listing page into ``save_dir``.

    The page is fetched with a random User-Agent, each
    ``div.layui-col-sm3.layui-col-xs6`` entry is inspected for an
    ``<a><img>`` element, and the image bytes are written to
    ``<save_dir>/<alt text>.<extension>``.

    Args:
        url: Listing-page URL to scrape.
        save_dir: Directory the images are written to. It is created if
            it does not exist yet.
        timeout: Seconds to wait on each HTTP request before giving up,
            so a stalled connection cannot block a worker forever.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. Failures of individual images are reported on stdout
            and skipped instead of aborting the rest of the page.
    """
    # Function-scope imports keep the block self-contained.
    import os
    from urllib.parse import urljoin

    headers = {"User-Agent": UserAgent().random}
    res = requests.get(url=url, headers=headers, timeout=timeout)
    select = Selector(res.text)
    result_list = select.xpath("//div[@class='layui-col-sm3 layui-col-xs6']")

    os.makedirs(save_dir, exist_ok=True)

    for item in result_list:
        img_url = item.xpath("./a/img/@src").extract_first()
        img_title = item.xpath("./a/img/@alt").extract_first()
        # extract_first() returns None when the attribute is absent; such an
        # entry cannot be named or fetched, so skip it instead of crashing.
        if not img_url or img_title is None:
            continue

        img_name = img_title + "." + img_url.split(".")[-1]
        # A path separator inside the name would point outside save_dir
        # (or at a non-existent sub-directory), so neutralise it.
        for sep in {"/", os.sep}:
            img_name = img_name.replace(sep, "_")

        # urljoin resolves protocol-relative ("//host/x.jpg"), site-relative
        # and already-absolute src values against the page URL.
        full_img_url = urljoin(url, img_url)
        try:
            img_res = requests.get(full_img_url, headers=headers, timeout=timeout)
            # Do not save an HTTP error page as if it were an image.
            img_res.raise_for_status()
        except requests.RequestException as exc:
            print(img_name + "===== 下载失败: " + str(exc))
            continue

        with open(os.path.join(save_dir, img_name), "wb") as f:
            f.write(img_res.content)
        print(img_name + "===== 下载成功!")
三、线程池
- 由于数据量较大,需要使用多线程来提高效率,本次任务采用线程池 ThreadPoolExecutor 来解决。
from concurrent.futures import ThreadPoolExecutor
if __name__ == "__main__":
    # Entry point: crawl every listing page concurrently with 8 workers.
    urls = getURLs()

    # Leaving the with-block waits for all submitted pages to finish and
    # shuts the pool down.
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = [(page_url, pool.submit(download, page_url)) for page_url in urls]

    # An exception raised inside a worker is stored on its Future and would
    # otherwise vanish silently, so report every page that failed.
    for page_url, future in futures:
        error = future.exception()
        if error is not None:
            print(page_url + "===== 下载失败: " + repr(error))
|