本次项目主要涉及网页爬取:通过 requests、bs4、re、time、lxml、random 等模块对豆瓣图书页面进行爬取。
首先通过请求模块获取页面的html:
def get_one_page(url):
    """Download ``url`` and return the response body as text.

    A ``user-agent`` header value is picked at random from a small pool for
    every request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text when the server answers with HTTP 200; ``None`` for any
        other status code or when ``requests`` raises a request error
        (connection failure, timeout, ...).
    """
    # The original list was missing commas, so adjacent literals were silently
    # concatenated and indexing with randint(0, 2) could run past the end.
    # random.choice always stays inside the list.
    user_agents = [
        'Mozilla/5.0',
        'AppleWebKit/537.36',
        'Chrome/96.0.4664.110',
        'Mobile Safari/537.36 Edg/96.0.1054.62',
    ]
    headers = {'user-agent': random.choice(user_agents)}
    try:
        # A timeout keeps one stalled connection from blocking the crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException:
        return None
    # Raw body is echoed to stdout, exactly as the original code did.
    print(response.text)
    if response.status_code == 200:
        return response.text
    return None
然后通过re模块获取图书信息:
def get_request_res(pattern_text, html):
    """Return the first regex capture of ``pattern_text`` found in ``html``.

    The pattern is applied with ``re.S`` so that ``.`` also matches newlines.
    The first match is cut at its first ``<`` (dropping any trailing markup)
    and its first character is removed.
    NOTE(review): the ``[1:]`` slice presumably strips the space that follows
    the label in the page markup; confirm against real pages.

    Args:
        pattern_text: Regular expression containing exactly one capture group.
        html: Text to search.

    Returns:
        The cleaned-up first capture, or the string ``'NULL'`` when the
        pattern does not match.
    """
    matches = re.findall(pattern_text, html, re.S)
    if not matches:
        return 'NULL'
    # str.split slices the string on the given separator; keep the text
    # before the first '<' and drop its leading character.
    return matches[0].split('<', 1)[0][1:]
其中,split():通过指定分隔符对字符串进行切片处理
接着通过bs4模块获取图书信息:
def get_bs_res(selector, html):
    """Return the string of the first element matching a CSS selector.

    Args:
        selector: CSS selector passed to ``soup.select``.
        html: Markup to parse (parsed with the ``lxml`` parser).

    Returns:
        ``'NULL'`` when nothing matches; otherwise the ``.string`` of the
        first matching element. ``.string`` can itself be ``None``, so
        callers should be prepared for that value.
    """
    soup = BeautifulSoup(html, 'lxml')
    res = soup.select(selector)
    # select() returns a list, so an empty result is simply falsy.
    if not res:
        return 'NULL'
    return res[0].string


def get_bs_img_res(selector, html):
    """Return the markup of the first element matching a CSS selector.

    Args:
        selector: CSS selector passed to ``soup.select``.
        html: Markup to parse (parsed with the ``lxml`` parser).

    Returns:
        ``str()`` of the first matching element, or ``'NULL'`` when nothing
        matches.
    """
    soup = BeautifulSoup(html, 'lxml')
    res = soup.select(selector)
    if not res:
        return 'NULL'
    return str(res[0])


# The helpers were originally defined under these misspelled names; keep them
# available so existing references continue to resolve.
get_bares = get_bs_res
get_ba_img_res = get_bs_img_res
其中:soup.select():筛选元素,可通过标签名查找,类名查找,ID名查找,组合查找,属性查找。返回类型是list。
len() 函数:返回对象中项目的数量。当对象是字符串时,len() 函数返回字符串中的字符数。
最后结合 bs4 与 re 模块解析 html 中的有用信息(书名、作者、出版社、出版年等),以便存入 Excel 表中:
def parse_one_page(html):
    """Extract the fields of one book page into a dictionary.

    Args:
        html: Markup of a single book detail page.

    Returns:
        A dict with the keys ``book_name``, ``Author``, ``publisher``,
        ``publish_time``, ``ISBN``, ``img_src``, ``book-intro``,
        ``author_intro``, ``Score``, ``comments`` and ``5_stars`` ...
        ``1_stars``. Fields that cannot be located hold the string ``'NULL'``
        (selector-based fields may also be ``None`` when the matched element
        has no single string).
    """
    book_info = {}
    book_info['book_name'] = get_bs_res('div>h1>span', html)

    # Try the primary author selector first, then the '#info' fallback.
    author = get_bs_res('div>span:nth-child(1)>a', html)
    if author is None:
        author = get_bs_res('#info>a:nth-child(2)', html)
    if author is None:
        # Neither selector produced text; avoid calling .replace on None.
        author = 'NULL'
    book_info['Author'] = author.replace(" ", "").replace("\n", "")

    # Label/value pairs are pulled out with regular expressions.
    book_info['publisher'] = get_request_res(u'出版社:</span>(.*?)<br/>', html)
    book_info['publish_time'] = get_request_res(u'出版年:</span>(.*?)<br/>', html)
    book_info['ISBN'] = get_request_res(u'ISBN:</span>(.*?)<br/>', html)

    # Cover image: take the src attribute out of the <img> markup.
    img_label = get_bs_img_res('#mainpic>a>img', html)
    img = re.findall('src="(.*?)"', img_label, re.S)
    # Both outcomes are stored under one key; the original used 'img-src' on
    # success and 'img_src' on failure.
    book_info['img_src'] = img[0] if img else 'NULL'

    book_info['book-intro'] = get_bs_res(
        '#link-report > div:nth-child(1)>div>p', html)
    book_info['author_intro'] = get_bs_res(
        '#content>div>div.article>div.related_info>div:nth-child(4)>div>div>p',
        html)

    grade = get_bs_res('div>div.rating_self.clearfix>strong', html)
    if grade is None or grade == 'NULL' or len(grade) <= 1:
        # No usable rating text (also keeps 'NULL' from being sliced to 'ULL').
        book_info['Score'] = 'NULL'
    else:
        # Drop the leading character of the rating text, as before.
        book_info['Score'] = grade[1:]

    book_info['comments'] = get_bs_res(
        '#interest_sectl>div>div.rating_self.clearfix>div>div.rating_sum>span>a>span',
        html)

    # Star-rating figures sit at fixed child positions under #interest_sectl.
    star_positions = (
        ('5_stars', 5),
        ('4_stars', 9),
        ('3_stars', 13),
        ('2_stars', 17),
        ('1_stars', 21),
    )
    for key, position in star_positions:
        book_info[key] = get_bs_res(
            '#interest_sectl>div>span:nth-child({})'.format(position), html)

    return book_info
代码运行效果(由于本机的 requests 库出现问题,以下为在班上其他同学电脑上运行的部分截图):
?
?
|