def get_body(url):
    """Fetch one poem detail page and extract its text fields.

    This version only pulls the fields out of the page; nothing is returned
    or stored.

    Args:
        url: Absolute URL of the detail page.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    respond = requests.get(url, timeout=10)
    html = parsel.Selector(respond.text)
    title = html.css('body > div > div > div > div > div > h3::text').get()
    origin = html.css('body > div > div > div> div > div> p > a::text').get()
    # Some poems have no note or story. A selector that matches nothing gives
    # None from .get() and [] from .getall() instead of raising, so no
    # try/except is needed here.
    note = html.css('body > div > div > div> div:nth-child(2) > div > p:nth-child(4)::text').get()
    stories = html.css('body > div > div > div > div:nth-child(3) > div > p::text').getall()
    story = "\n".join(stories)  # join the list into a single string
    # NOTE(review): this is the same selector as the story above -- confirm
    # that the appreciation section really lives in the same node.
    appreciates = html.css('body > div > div > div > div:nth-child(3) > div > p::text').getall()
    appreciate = '\n'.join(appreciates)
def get_url(url):
    """Collect the detail-page links on one listing page and process each.

    Every link found is printed and then handed to ``get_body``.

    Args:
        url: URL of a listing page.
    """
    from urllib.parse import urljoin

    # A timeout keeps one stalled connection from hanging the whole crawl.
    respond = requests.get(url, timeout=10)
    html = parsel.Selector(respond.text)
    hrefs = html.css('body > div > div > div > div > div > ul > li > a::attr(href)').getall()
    for href in hrefs:
        # urljoin yields the same result as plain concatenation for
        # site-relative hrefs ('/x'), and also copes with hrefs that are
        # already absolute or lack the leading slash.
        url_link = urljoin('https://www.xungushici.com', href)
        print(url_link)
        get_body(url_link)
接下来再分析每一页的网址:页码直接接在 https://www.xungushici.com/mingjus/p 之后,
不难得出分页规律,据此编写翻页函数:
def get_page():
    """Crawl listing pages 1 and 2 of the site's 'mingjus' section.

    Each listing-page URL is built by appending the page number to
    ``.../mingjus/p`` and is passed to ``get_url``.
    """
    for i in range(1, 3):  # range end is exclusive: pages 1 and 2
        page_url = 'https://www.xungushici.com/mingjus/p%d' % i
        get_url(page_url)
完整代码如下:
# -*- coding: utf-8 -*-
# @Time : 2021/7/3 21:02
# @File : 尝试爬取寻古诗词网.py
# @Software : PyCharm
import requests
import parsel
def get_page():
    """Crawl listing pages 1 and 2 of the site's 'mingjus' section.

    Each listing-page URL is built by appending the page number to
    ``.../mingjus/p`` and is passed to ``get_url``.
    """
    for i in range(1, 3):  # range end is exclusive: pages 1 and 2
        page_url = 'https://www.xungushici.com/mingjus/p%d' % i
        get_url(page_url)
def get_url(url):
    """Collect the detail-page links on one listing page and process each.

    Every link found is printed and then handed to ``get_body``.

    Args:
        url: URL of a listing page.
    """
    from urllib.parse import urljoin

    # A timeout keeps one stalled connection from hanging the whole crawl.
    respond = requests.get(url, timeout=10)
    html = parsel.Selector(respond.text)
    hrefs = html.css('body > div > div > div > div > div > ul > li > a::attr(href)').getall()
    for href in hrefs:
        # urljoin yields the same result as plain concatenation for
        # site-relative hrefs ('/x'), and also copes with hrefs that are
        # already absolute or lack the leading slash.
        url_link = urljoin('https://www.xungushici.com', href)
        print(url_link)
        get_body(url_link)
def get_body(url):
    """Fetch one poem detail page and append its contents to '<title>.txt'.

    The file is opened in append mode with UTF-8 encoding, in the current
    working directory. The note and story sections are optional and are
    written only when the page provides them.

    Args:
        url: Absolute URL of the detail page.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    respond = requests.get(url, timeout=10)
    html = parsel.Selector(respond.text)

    title = html.css('body > div > div > div > div > div > h3::text').get()
    if title is None:
        # The title doubles as the file name; without it there is nowhere
        # to write, so report the page and move on instead of crashing.
        print('skipped (no title found): ' + url)
        return

    # .get() gives None when nothing matches; fall back to an empty string
    # so the concatenation below cannot raise TypeError.
    origin = html.css('body > div > div > div> div > div> p > a::text').get() or ''
    note = html.css('body > div > div > div> div:nth-child(2) > div > p:nth-child(4)::text').get()
    stories = html.css('body > div > div > div > div:nth-child(3) > div > p::text').getall()
    # NOTE(review): this is the same selector as the story above -- confirm
    # that the appreciation section really lives in the same node.
    appreciates = html.css('body > div > div > div > div:nth-child(3) > div > p::text').getall()

    # The with-statement closes the file; no explicit close() is needed.
    with open(title + '.txt', mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write('出自于:' + '\n' + origin + '\n')
        # Some poems have no note or story. Each section is checked on its
        # own so that a missing note no longer drops the story as well.
        if note is not None:
            f.write('注释:' + '\n' + note + '\n')
        if stories:
            f.write('故事:' + '\n' + '\n'.join(stories) + '\n')
        f.write('赏析:' + '\n' + '\n'.join(appreciates) + '\n')


# Script entry point: start the crawl.
get_page()