# Spider section (spider module)
import scrapy
from lxml import etree
from ..items import SssItem
class TianQiSpider(scrapy.Spider):
    """Crawl travel-article titles and links from www.tianqijun.com.

    ``parse`` scrapes each paginated listing page and follows every article
    link; ``parser_1`` scrapes the headline from the article detail page.
    Scraped titles are also appended to local text files as a debugging aid.
    """
    name = 'TianQi'
    allowed_domains = ['www.tianqijun.com']
    # Listing pages are paginated as index_1.html .. index_46.html.
    start_urls = [
        'https://www.tianqijun.com/lvyou/index_' + str(i) + '.html'
        for i in range(1, 47)
    ]

    def parse(self, response, **kwargs):
        """Yield one item per article on a listing page and follow its link."""
        dom = etree.HTML(response.text)
        titles = dom.xpath("//div[@class='list']/a/@title")
        hrefs = dom.xpath("//div[@class='list']/a/@href")
        # zip() pairs titles with hrefs safely; the original index loop could
        # raise IndexError when the two node lists have different lengths.
        for title, href in zip(titles, hrefs):
            item = SssItem()
            item['title'] = title
            item['href'] = href
            print(item['title'])
            # encoding='utf-8' so Chinese titles are written correctly
            # regardless of the platform's default encoding (e.g. Windows).
            with open('天奇生活测试.txt', 'a+', encoding='utf-8') as f:
                f.write(item['title'] + '\n')
            yield item
            # response.urljoin handles relative and root-relative hrefs; the
            # original string concatenation produced a double slash when the
            # href already starts with '/'.
            yield scrapy.Request(url=response.urljoin(item['href']),
                                 callback=self.parser_1)

    def parser_1(self, response):
        """Yield one item per headline found on an article detail page."""
        dom_1 = etree.HTML(response.text)
        for heading in dom_1.xpath("//div[@class='detailTitle']/h2/text()"):
            item = SssItem()
            item['title_z'] = heading
            print(item['title_z'])
            with open('天奇生活子链接测试.txt', 'a+', encoding='utf-8') as f1:
                f1.write(item['title_z'] + '\n')
            yield item
# items.py configuration
import scrapy
class SssItem(scrapy.Item):
    """Container for the fields scraped by TianQiSpider."""
    # Article title from a listing page (the <a> tag's @title attribute).
    title=scrapy.Field()
    # Article link from a listing page (the <a> tag's @href attribute).
    href=scrapy.Field()
    # Headline text scraped from the article's detail page.
    title_z=scrapy.Field()
# pipelines.py configuration
from itemadapter import ItemAdapter
class SssPipeline:
    """Default pass-through item pipeline; items are not modified."""
    def process_item(self,item,spider):
        # No processing is done yet; returning the item keeps it flowing
        # to any subsequently configured pipelines.
        return item
# settings.py configuration
# Scrapy project identity and module discovery.
BOT_NAME = 'SSS'
SPIDER_MODULES = ['SSS.spiders']
NEWSPIDER_MODULE = 'SSS.spiders'

# Present a desktop Chrome user agent so responses match a normal browser.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# The site's robots.txt is deliberately ignored for this crawl.
ROBOTSTXT_OBEY = False

# Keep concurrency low to avoid hammering the target site.
CONCURRENT_REQUESTS = 2
# Launch-script configuration
from scrapy import cmdline

# Guarded entry point: start the 'TianQi' spider only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    cmdline.execute(['scrapy', 'crawl', 'TianQi'])