1. Reusing existing cookies: override the start_requests method in the spider file under spiders, then extract data in the callback
import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    # Override start_requests so the very first request carries our cookies
    def start_requests(self):
        cookies = """pgv_pvi=8991404032;
        rpdid=kmkkkilsxxdosoqkoxqww;
        CURRENT_QUALITY=64;
        bsource=search_360;
        innersign=1"""
        # Turn the raw Cookie header string into the dict scrapy.Request expects
        cookies = {i.split('=')[0].strip(): i.split('=')[1].strip()
                   for i in cookies.split(';')}
        # Build the request; pass the method itself, not self.parse()
        yield scrapy.Request(self.start_urls[0], callback=self.parse,
                             cookies=cookies)

    def parse(self, response):
        print(response.text)
        print(response.xpath('//div'))  # add .extract() to get the matched strings
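The dict comprehension above does the real work: it converts the string copied from the browser's Cookie header into the dict that scrapy.Request expects. A minimal standalone sketch of the same conversion (the cookie values here are placeholders, not a real session), using split('=', 1) so values that themselves contain '=' survive:

raw = "pgv_pvi=8991404032; rpdid=kmkkkilsxxdosoqkoxqww; innersign=1"
cookies = {i.split('=', 1)[0].strip(): i.split('=', 1)[1].strip()
           for i in raw.split(';')}
print(cookies)
# {'pgv_pvi': '8991404032', 'rpdid': 'kmkkkilsxxdosoqkoxqww', 'innersign': '1'}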
2. POST login with scrapy.FormRequest
import re

import scrapy

class GithubSpider(scrapy.Spider):
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # Pull the hidden form fields out of the login page
        authenticity_token = response.xpath(
            "//input[@name='authenticity_token']/@value").extract_first()
        utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
        commit = response.xpath("//input[@name='commit']/@value").extract_first()
        post_data = dict(
            login="noobpythoner",
            password="zhoudawei123",
            authenticity_token=authenticity_token,
            utf8=utf8,
            commit=commit,
        )
        yield scrapy.FormRequest(
            "https://github.com/session",
            formdata=post_data,
            callback=self.after_login)

    def after_login(self, response):
        # Process the page returned after logging in
        # print(re.findall("noobpythoner|NoobPythoner", response.body.decode()))
        pass
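FormRequest is essentially a convenience wrapper: it urlencodes the formdata dict into the request body and sets the Content-Type header for you. A hedged sketch of the equivalent hand-built POST, assuming the same field names and URL as above (not verified against GitHub's current login flow):

import scrapy
from urllib.parse import urlencode

class GithubRawPostSpider(scrapy.Spider):
    # Hypothetical variant of the spider above, without FormRequest
    name = 'github_raw'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        post_data = {
            "login": "noobpythoner",
            "password": "zhoudawei123",
            "authenticity_token": response.xpath(
                "//input[@name='authenticity_token']/@value").extract_first(),
        }
        # Equivalent to scrapy.FormRequest(..., formdata=post_data, ...)
        yield scrapy.Request(
            "https://github.com/session",
            method="POST",
            body=urlencode(post_data),
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            callback=self.after_login,
        )

    def after_login(self, response):
        pass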
3. If the form has an action attribute, FormRequest.from_response can also find the form automatically
import re

import scrapy

class Github2Spider(scrapy.Spider):
    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,  # finds the form in the response automatically; with
                       # multiple forms, extra arguments can locate the right one
            formdata={"login": "noobpythoner", "password": "daWei1231"},
            callback=self.after_login
        )

    def after_login(self, response):
        # Process the page returned after logging in
        print(re.findall("noobpythoner|NoobPythoner", response.body.decode()))
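When a page contains more than one form, from_response accepts locating arguments such as formname, formid, formnumber, formxpath, or formcss. A hedged sketch for inside parse() (the XPath is an assumption about the login page's markup, not taken from the original example):

        yield scrapy.FormRequest.from_response(
            response,
            # any one of the locating arguments narrows the search to one form
            formxpath="//form[@action='/session']",
            formdata={"login": "noobpythoner", "password": "daWei1231"},
            callback=self.after_login,
        )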