[Python知识库] 期末爬虫项目：爬取豆瓣图书

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> Python知识库 -> 期末爬虫项目：爬取豆瓣图书 -> 正文阅读

[Python知识库]期末爬虫项目：爬取豆瓣图书

在这次项目中主要关于网页爬取方面，通过requests、bs4、re、time、lxml、random等模块对豆瓣图书进行爬取。

首先通过请求模块获取页面的html：

def get_one_page(url):
    """Download a page and return its HTML text.

    A ``user-agent`` header value is picked at random from a small pool for
    every request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    # NOTE(review): these entries look like fragments of a single browser
    # User-Agent string rather than complete values -- confirm the intended pool.
    head = [
        'Mozilla/5.0',
        'AppleWebKit/537.36',
        'Chrome/96.0.4664.110',
        'Mobile Safari/537.36 Edg/96.0.1054.62',
    ]
    # random.choice cannot index past the end of the pool, unlike a
    # hard-coded randint() range that must be kept in sync with the list.
    headers = {'user-agent': random.choice(head)}
    try:
        # Bound the wait so a stalled connection cannot hang the crawler;
        # a timeout is reported as a RequestException subclass.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

然后通过re模块获取图书信息:

def get_request_res(pattern_text, html):
    """Extract a text field from ``html`` with a regular expression.

    The pattern is compiled with ``re.S`` so ``.`` also matches newlines.
    Only the first match is used: it is cut at the first ``<`` and its first
    character is dropped.

    Args:
        pattern_text: Regular expression, expected to contain one capturing
            group around the wanted text.
        html: Text to search.

    Returns:
        The trimmed first match, or the string ``'NULL'`` when the pattern
        does not match.
    """
    pattern = re.compile(pattern_text, re.S)
    res = pattern.findall(html)
    if not res:
        return 'NULL'
    first = res[0]
    if isinstance(first, tuple):
        # findall() yields tuples when the pattern has several groups;
        # use the first group so the slicing below still works.
        first = first[0]
    # Keep what precedes the first tag, then drop the leading character
    # (presumably the space that follows the label in the page -- TODO confirm).
    return first.split('<', 1)[0][1:]

其中，split()：通过指定分隔符对字符串进行切片处理

接着通过bs4模块获取图书信息：

def get_bares(selector, html):
    """Return the text of the first element matching a CSS selector.

    Args:
        selector: CSS selector handed to ``soup.select``.
        html: Markup to parse with the ``lxml`` parser.

    Returns:
        ``res[0].string`` for the first matching element, which may be
        ``None`` (bs4 reports ``None`` when a tag does not wrap exactly one
        string), or the string ``'NULL'`` when nothing matches.
    """
    soup = BeautifulSoup(html, 'lxml')
    res = soup.select(selector)
    if not res:
        return 'NULL'
    return res[0].string


# parse_one_page calls this helper as get_bs_res / get_ba_res; expose both
# spellings so those calls resolve while the original name keeps working.
get_bs_res = get_bares
get_ba_res = get_bares
# Get other info with the bs4 module
def get_ba_img_res(selector, html):
    """Return the markup of the first element matching a CSS selector.

    Args:
        selector: CSS selector handed to ``soup.select``.
        html: Markup to parse with the ``lxml`` parser.

    Returns:
        ``str()`` of the first matching element (the tag itself, not just its
        text), or the string ``'NULL'`` when nothing matches.
    """
    soup = BeautifulSoup(html, 'lxml')
    res = soup.select(selector)
    # select() returns a list, so emptiness is the only "not found" signal.
    if not res:
        return 'NULL'
    return str(res[0])


# parse_one_page calls this helper as get_bs_img_res; expose that spelling
# while the original name keeps working.
get_bs_img_res = get_ba_img_res

其中：soup.select():筛选元素，可通过标签名查找，类名查找，ID名查找，组合查找，属性查找。返回类型是list。

len()函数：返回对象中项目的数量。当对象是字符串时，len()函数返回字符串中的字符数。

最后结合bs4模块和re模块解析html中的有用信息（书籍名字，作者，出版社，出版年等等）以便存入Excel表中：

def parse_one_page(html):
    """Collect the fields of one book detail page into a dict.

    Args:
        html: HTML text of the book page.

    Returns:
        A dict with the keys ``book_name``, ``Author``, ``publisher``,
        ``publish_time``, ``ISBN``, ``img_src``, ``book-intro``,
        ``author_intro``, ``Score``, ``comments`` and ``5_stars`` down to
        ``1_stars``. Fields that cannot be located hold the string ``'NULL'``;
        fields read through ``get_bs_res`` may also be ``None``.
    """
    book_info = {}

    book_info['book_name'] = get_bs_res('div>h1>span', html)

    # Try the generic selector first, then the one anchored on #info.
    author = get_bs_res('div>span:nth-child(1)>a', html)
    if author is None or author == 'NULL':
        author = get_bs_res('#info>a:nth-child(2)', html)
    if author is None:
        # Neither selector produced a plain string; avoid calling
        # .replace() on None below.
        author = 'NULL'
    book_info['Author'] = author.replace(' ', '').replace('\n', '')

    # NOTE(review): the published patterns were garbled (missing '>' and a
    # ';' for ':'). These assume each label is closed by </span> and the
    # value runs up to <br/>, which matches how get_request_res trims its
    # match -- confirm against the live page markup.
    book_info['publisher'] = get_request_res('出版社:</span>(.*?)<br/>', html)
    book_info['publish_time'] = get_request_res('出版年:</span>(.*?)<br/>', html)
    book_info['ISBN'] = get_request_res('ISBN:</span>(.*?)<br/>', html)

    # The cover URL is pulled out of the serialized <img> tag.
    img_label = get_bs_img_res('#mainpic>a>img', html)
    img = re.findall('src="(.*?)"', img_label, re.S)
    book_info['img_src'] = img[0] if img else 'NULL'

    book_info['book-intro'] = get_bs_res(
        '#link-report > div:nth-child(1) > div > p', html)

    book_info['author_intro'] = get_bs_res(
        '#content>div>div.article>div.related_info>div:nth-child(4)>div>div>p',
        html)

    grade = get_bs_res('div>div.rating_self.clearfix>strong', html)
    if grade is None or grade == 'NULL' or len(grade) <= 1:
        # Element missing, or it holds at most one character: no score.
        book_info['Score'] = 'NULL'
    else:
        # Drop the first character (presumably leading whitespace -- TODO confirm).
        book_info['Score'] = grade[1:]

    book_info['comments'] = get_bs_res(
        '#interest_sectl>div>div.rating_self.clearfix>div>div.rating_sum>span>a>span',
        html)

    # The 5-star to 1-star entries are read from every fourth child span,
    # starting at the fifth (5, 9, 13, 17, 21).
    for stars, child in zip(range(5, 0, -1), range(5, 22, 4)):
        book_info['%d_stars' % stars] = get_bs_res(
            '#interest_sectl>div>span:nth-child(%d)' % child, html)

    return book_info