pipeline
-
Image Pipeline(爬取图片) # settings.py
IMAGES_STORE = './images'  # directory where the images pipeline saves downloaded files
# pipelines.py
from scrapy import Request
from scrapy.exceptions import DropItem
# Correct spelling: the module is scrapy.pipelines.images and the base
# class is ImagesPipeline (the original note misspelled both).
from scrapy.pipelines.images import ImagesPipeline


class ImagePipline(ImagesPipeline):
    """Download the image referenced by item['url'] and drop items whose
    download failed, so only successfully stored images reach later
    pipelines."""

    def get_media_requests(self, item, info):
        # Receive the item produced by the spider and turn its URL into a
        # download Request for the media pipeline.
        # (Fixed: the original yielded undefined name `Requests`.)
        yield Request(item['url'])

    def file_path(self, request, response=None, info=None):
        # Name the saved file after the last path segment of the URL,
        # e.g. https://host/a/b/foo.jpg -> foo.jpg
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # Called when all media requests for the item finished.
        # `results` is a list of (ok, info_dict) tuples, one per request.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            # No successful download: discard the item.
            raise DropItem('Image Download Failed')
        return item
-
MysqlPipline(MySQL数据库) # settings.py
# MySQL connection settings read by MysqlPipline.from_crawler()
MYSQL_HOST='localhost'
MYSQL_DATABASE= 'database'  # target database name
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
# pipelines.py
import pymysql


class MysqlPipline():
    """Persist each scraped item as one row in a MySQL table.

    Connection parameters are pulled from settings.py by from_crawler();
    the connection is opened/closed with the spider's lifecycle.
    """

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    # Fetch the MYSQL_* parameters from settings.py.
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    # Called automatically when the spider opens.
    def open_spider(self, spider):
        # Use keyword arguments (the positional order changed across pymysql
        # versions) and the MySQL charset name 'utf8mb4' — the original
        # 'utf-8' is not a valid MySQL charset and fails at connect time.
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8mb4',
            port=self.port,
        )
        self.cursor = self.db.cursor()

    # Called automatically when the spider closes.
    def close_spider(self, spider):
        self.db.close()

    # process_item is the required hook; the pipeline calls it for every item.
    def process_item(self, item, spider):
        data = dict(item)
        keys = ', '.join(data.keys())
        placeholders = ', '.join(['%s'] * len(data))
        # Target table: item.table if the Item subclass declares one,
        # otherwise fall back to the spider's name.
        table = getattr(item, 'table', spider.name)
        # Values go through driver-side %s placeholders (escaped by pymysql);
        # only identifiers are interpolated into the statement text.
        # (Fixed: the original executed an empty sql string and called the
        # misspelled data.vaules().)
        sql = 'insert into %s (%s) values (%s)' % (table, keys, placeholders)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except Exception:
            # Roll back the failed statement so the connection stays usable,
            # then let Scrapy log the error.
            self.db.rollback()
            raise
        return item
|