Introduction
Use a crawler to download cosplay photos from a website to the local disk, then stitch the images together into PDF albums. It is a simple application; read the code carefully and you will easily find the parts you need.
Take a look at the final result first, then go through the code at your own pace if you are interested; anyone with a little background will pick it up after one read.
Technologies used for this Genshin cosplay scraper (a sketch of the assumed project layout follows the list):
- Scrapy crawler framework
- PIL for image processing
- os for file handling
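The files below follow the standard layout that scrapy startproject generates; the project name photos18_com is a placeholder of mine, not something stated in the original post:

photos18_com/
├── scrapy.cfg
└── photos18_com/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── spider.py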
Scrapy Implementation
items.py
import scrapy


class Photos18ComItem(scrapy.Item):
    title = scrapy.Field()    # album title
    url = scrapy.Field()      # detail page URL
    time = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
    source = scrapy.Field()   # category label, e.g. 'COSPLAY'
middlewares.py
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from scrapy.utils.project import get_project_settings
import random
import sys

sys.path.append('.')
settings = get_project_settings()


class RotateUserAgentMiddleware(UserAgentMiddleware):
    def process_request(self, request, spider):
        # Use the request's own URL as the Referer header
        referer = request.url
        if referer:
            request.headers["Referer"] = referer
        # Pick a random User-Agent from the pool defined in settings.py
        USER_AGENT_LIST = settings.get('USER_AGENT_LIST')
        user_agent = random.choice(USER_AGENT_LIST)
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)
            print(f"User-Agent: {user_agent}")


class MyProxyMiddleware(object):
    def process_request(self, request, spider):
        # Route requests through a local proxy (fill in your own port)
        request.meta['proxy'] = 'http://127.0.0.1:xxxxx'
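The two middlewares only take effect after they are registered in settings.py. A minimal sketch, assuming the project module is named photos18_com (adjust the dotted paths to your own project name):

DOWNLOADER_MIDDLEWARES = {
    # replace the built-in User-Agent middleware with the rotating one
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'photos18_com.middlewares.RotateUserAgentMiddleware': 543,
    # enable this only if a local proxy is actually running
    # 'photos18_com.middlewares.MyProxyMiddleware': 544,
}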
pipelines.py
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class Photos18ComPipeline(object):  # class name is illustrative
    def __init__(self):
        # Read the MongoDB connection info from settings.py
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DATABASE"]
        sheetname = settings["MONGODB_TABLE"]
        username = settings["MONGODB_USER"]
        password = settings["MONGODB_PASSWORD"]
        client = pymongo.MongoClient(host=host, port=port, username=username, password=password)
        mydb = client[dbname]
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        # Store each scraped item as one MongoDB document
        data = dict(item)
        self.post.insert_one(data)
        return item
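Likewise, the pipeline has to be enabled in settings.py before items are written to MongoDB; a sketch using the class shown above and the same assumed module name:

ITEM_PIPELINES = {
    'photos18_com.pipelines.Photos18ComPipeline': 300,
}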
settings.py
MONGODB_HOST = "localhost"
MONGODB_PORT = 27017
MONGODB_DATABASE = "PicData"
MONGODB_TABLE = "PicData"
MONGODB_USER = ""
MONGODB_PASSWORD = ""
USER_AGENT_LIST = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
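A couple of extra settings are worth considering for a crawl that walks 100 list pages; these are my own additions, not part of the original configuration:

DOWNLOAD_DELAY = 0.5        # small pause between requests so the site is not hammered
CONCURRENT_REQUESTS = 8     # keep concurrency modest for the same reason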
spider.py
import os
import time

import requests
import scrapy
from urllib import parse

from ..items import Photos18ComItem  # assumes the default Scrapy project layout


class AllDataSpider(scrapy.Spider):
    name = 'all_data'
    allowed_domains = []
    # (url template, category label) pairs; {} is replaced by the page number
    start_urls = [
        ['http://www.photos18.com/?category_id=8&page={}&per-page=100', 'COSPLAY'],
    ]

    def start_requests(self):
        # Walk list pages 1-100 for every (url, source) pair
        for data in self.start_urls:
            url = data[0]
            source = data[1]
            for num in range(1, 101):
                yield scrapy.Request(
                    url=url.format(num),
                    meta={
                        'url': url,
                        'source': source,
                    },
                    callback=self.parse
                )

    def parse(self, response):
        # Grab the title and link of every album card on the list page
        Item_title = response.xpath('//div[@class="card"]/div[@class="card-body p-2"]/a/text()').extract()
        Item_url = response.xpath('//div[@class="card"]/div[@class="card-body p-2"]/a/@href').extract()
        for each in range(len(Item_title)):
            item = Photos18ComItem()
            item['title'] = Item_title[each].strip()
            item['url'] = parse.urljoin(response.url, Item_url[each])
            item['source'] = response.meta["source"]
            yield scrapy.Request(item['url'], callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta["item"]
        # One folder per album under ./data/
        folder = os.getcwd() + "/data/" + item['title']
        if not os.path.exists(folder):
            os.makedirs(folder)
        # The images are lazy-loaded, so the real URL sits in data-src
        Item_Img = response.xpath('//div[@class="my-2 imgHolder"]/img/@data-src').extract()
        for idx, url in enumerate(Item_Img):
            url_ = parse.urljoin(item['url'], url).split("?")[0]
            print(url_)
            html = requests.get(url_)
            # a timestamp alone collides within one second, so append the index
            with open(folder + "/" + str(int(time.time())) + "_" + str(idx).zfill(3) + ".jpg", "wb") as f:
                f.write(html.content)
        yield item
After starting the spider, all the data is scraped to the local disk.
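The spider is normally started from the project root with scrapy crawl all_data. If you prefer launching it from a Python script instead, a minimal sketch using Scrapy's own API:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# run the spider named 'all_data' with the project's settings.py applied
process = CrawlerProcess(get_project_settings())
process.crawl('all_data')
process.start()  # blocks until the crawl finishes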
Merging the Images into a PDF
import os

from PIL import Image


def combine2Pdf(folderPath, pdfFilePath):
    # Collect every jpg in the folder, sorted by file name
    files = os.listdir(folderPath)
    jpgFiles = []
    sources = []
    for file in files:
        if 'jpg' in file:
            jpgFiles.append(folderPath + file)
    jpgFiles.sort()
    # The first image becomes the base page, the rest are appended to it
    output = Image.open(jpgFiles[0]).convert("RGB")
    jpgFiles.pop(0)
    for file in jpgFiles:
        jpgFile = Image.open(file)
        if jpgFile.mode != "RGB":
            # PDF pages must be RGB, so convert any other mode
            jpgFile = jpgFile.convert("RGB")
        sources.append(jpgFile)
    output.save(pdfFilePath, "pdf", save_all=True, append_images=sources)


file_dir = "./data/"
os.makedirs("./pdf", exist_ok=True)  # make sure the output folder exists
for file in os.listdir(file_dir):
    folder = "./data/" + file + "/"
    pdfFile = "./pdf/" + file + ".pdf"
    try:
        combine2Pdf(folder, pdfFile)
    except Exception:
        print(file, "contains a broken image")
Now all the images have been merged into PDF files, which makes them much easier to browse.