本文爬取了豆瓣图书Top250的全部书籍信息和封面图片,并在此基础上做了一个简单的功能界面。
- 首先是用到的一些库:其中sys、os和time是Python自带的标准库,无需安装;其余的(PyQt5、requests、lxml、pandas)都需要自行安装。
from PyQt5 import QtCore, QtGui, QtWidgets
import requests
from lxml import etree
import sys
import time
import os
import pandas as pd
from PyQt5.QtWidgets import QApplication -
- UI界面是基于Qt Designer做的,效果如图。
- 主要代码解释:headers用来模拟浏览器的请求头,降低请求被网站拒绝的概率。网页分页用一个for循环处理:先用requests获取每一页的HTML源码,再用lxml库的etree.HTML()函数把HTML字符串解析成_Element对象(注意etree.HTML()接收的是字符串,也就是response.text),然后用xpath把书籍信息和图片地址提取出来。因为有些书籍没有简介,所以加了一个if判断。价格和评分的查询各用一个槽函数:先读取输入的书名,再按书名匹配书籍,取出对应的价格或评分。其中mark变量(评分查询里叫sign)用来标记已经匹配到书籍,以便跳出外层循环。
- 数据的存储和功能的实现(原文此处应为截图,图片未能保留)。
最后上代码:
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'spider.ui'
#
# Created by: PyQt5 UI code generator 5.15.4
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again. Do not edit this file unless you know what you are doing.
from PyQt5 import QtCore, QtGui, QtWidgets
import requests
from lxml import etree
import sys
import time
import os
import pandas as pd
from PyQt5.QtWidgets import QApplication
class Ui_Form(object):
    """Qt form for querying and downloading the Douban "Top 250 books" list.

    The widget layout was generated by pyuic5 from ``spider.ui``; three slots
    were added by hand:

    * ``jiage``    -- look up the price of the book typed into ``lineEdit``.
    * ``pingfeng`` -- look up the rating of that book.
    * ``xiazai``   -- save every book's data to ``book.csv`` and every cover
      image to a ``bookposter`` folder.

    All three slots share one scraper (``_iter_books``), so the parsing rules
    live in a single place.
    """

    # The listing is paginated through the ``start`` query parameter.
    BASE_URL = 'https://book.douban.com/top250?start='
    PAGE_COUNT = 10
    PAGE_SIZE = 25
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # Browser-like User-Agent sent with every request.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0'}
    # Directory that receives book.csv and bookposter/. ``None`` means the
    # current working directory at the time ``xiazai`` runs.
    OUTPUT_DIR = None

    def setupUi(self, Form):
        """Create the child widgets on ``Form`` and wire the button slots."""
        Form.setObjectName("Form")
        Form.resize(400, 300)

        # Book-title input row.
        self.label = QtWidgets.QLabel(Form)
        self.label.setGeometry(QtCore.QRect(60, 40, 71, 20))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(Form)
        self.lineEdit.setGeometry(QtCore.QRect(130, 40, 113, 20))
        self.lineEdit.setObjectName("lineEdit")

        # "Query price" / "query rating" buttons.
        self.pushButton = QtWidgets.QPushButton(Form)
        self.pushButton.setGeometry(QtCore.QRect(70, 80, 75, 23))
        self.pushButton.setObjectName("pushButton")
        self.pushButton_2 = QtWidgets.QPushButton(Form)
        self.pushButton_2.setGeometry(QtCore.QRect(230, 80, 75, 23))
        self.pushButton_2.setObjectName("pushButton_2")

        # Price result field.
        self.label_2 = QtWidgets.QLabel(Form)
        self.label_2.setGeometry(QtCore.QRect(30, 130, 31, 16))
        self.label_2.setObjectName("label_2")
        self.lineEdit_2 = QtWidgets.QLineEdit(Form)
        self.lineEdit_2.setGeometry(QtCore.QRect(60, 130, 113, 20))
        self.lineEdit_2.setObjectName("lineEdit_2")

        # Rating result field.
        self.label_3 = QtWidgets.QLabel(Form)
        self.label_3.setGeometry(QtCore.QRect(190, 130, 31, 16))
        self.label_3.setObjectName("label_3")
        self.lineEdit_3 = QtWidgets.QLineEdit(Form)
        self.lineEdit_3.setGeometry(QtCore.QRect(220, 130, 113, 20))
        self.lineEdit_3.setObjectName("lineEdit_3")

        # "Download everything" button and the status/hint box.
        self.pushButton_3 = QtWidgets.QPushButton(Form)
        self.pushButton_3.setGeometry(QtCore.QRect(130, 190, 111, 23))
        self.pushButton_3.setObjectName("pushButton_3")
        self.textEdit = QtWidgets.QTextEdit(Form)
        self.textEdit.setGeometry(QtCore.QRect(240, 250, 151, 31))
        self.textEdit.setObjectName("textEdit")

        self.retranslateUi(Form)
        self.pushButton.clicked.connect(self.jiage)
        self.pushButton_2.clicked.connect(self.pingfeng)
        self.pushButton_3.clicked.connect(self.xiazai)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Set the user-visible texts of the window and its widgets."""
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "Form"))
        self.label.setText(_translate("Form", "请输入书名"))
        self.pushButton.setText(_translate("Form", "查询价格"))
        self.pushButton_2.setText(_translate("Form", "查询评分"))
        self.label_2.setText(_translate("Form", "价格"))
        self.label_3.setText(_translate("Form", "评分"))
        self.pushButton_3.setText(_translate("Form", "下载所有信息和图片"))
        self.textEdit.setHtml(_translate("Form",
                                         "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n"
                                         "<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\n"
                                         "p, li { white-space: pre-wrap; }\n"
                                         "</style></head><body style=\" font-family:\'SimSun\'; font-size:9pt; font-weight:400; font-style:normal;\">\n"
                                         "<p style=\" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;\"><span style=\" font-size:10pt; font-style:italic; color:#7f7c7f;\">提示框:</span></p></body></html>"))

    # ------------------------------------------------------------------
    # Scraping helpers shared by the three slots
    # ------------------------------------------------------------------
    @staticmethod
    def _first(values, default=''):
        """Return the first item of ``values``, or ``default`` when it is empty."""
        return values[0] if values else default

    @staticmethod
    def _split_info(info):
        """Split a '/'-separated publication line into its useful fields.

        Args:
            info: Text such as ``"author / translator / press / date / price"``.

        Returns:
            A ``(author, press_time, price)`` tuple of stripped strings:
            the first field, the second-to-last field and the last field.
            ``press_time`` is ``''`` when the line has fewer than two fields.
        """
        parts = [part.strip() for part in info.split('/')]
        press_time = parts[-2] if len(parts) >= 2 else ''
        return parts[0], press_time, parts[-1]

    @staticmethod
    def _safe_filename(title):
        """Turn a book title into a name that is safe to use as a file name.

        Path separators, characters Windows forbids in file names and control
        characters are replaced with ``_``; an empty result becomes
        ``'untitled'``.
        """
        cleaned = ''.join(
            '_' if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch for ch in title
        ).strip(' .')
        return cleaned or 'untitled'

    @classmethod
    def _parse_table(cls, table):
        """Extract one book from a listing ``<table>`` element.

        Returns:
            ``(book, imgurl)`` where ``book`` is a dict with the keys
            ``title``, ``author``, ``price``, ``press_time``, ``rating_score``
            and ``produce`` (the quote line, ``None`` when the book has none)
            and ``imgurl`` is the cover URL or ``None``. Returns ``None`` when
            the table carries no title, i.e. it is not a book entry.
        """
        title = cls._first(table.xpath(".//td[@valign='top']//a/@title"), None)
        if title is None:
            return None
        info = cls._first(table.xpath(".//td[@valign='top']//p[1]/text()"))
        author, press_time, price = cls._split_info(info)
        book = {
            'title': "{}".format(title),
            'author': author,
            'price': price,
            'press_time': press_time,
            'rating_score': cls._first(table.xpath(".//span[@class='rating_nums']/text()")),
            # xpath() always returns a list, so take its first item (or None)
            # instead of storing the list itself.
            'produce': cls._first(table.xpath(".//p[@class='quote']/span/text()"), None),
        }
        imgurl = cls._first(table.xpath(".//a/img/@src"), None)
        return book, imgurl

    def _iter_books(self, delay=0):
        """Yield ``(book, imgurl)`` for every book, fetching pages lazily.

        Args:
            delay: Seconds to sleep before each page after the first.

        Raises:
            requests.RequestException: On a network failure, a timeout or a
                non-success HTTP status.
        """
        for page in range(self.PAGE_COUNT):
            if page and delay:
                time.sleep(delay)
            url = self.BASE_URL + str(page * self.PAGE_SIZE)
            response = requests.get(url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            document = etree.HTML(response.text)
            if document is None:
                # Nothing parseable came back for this page.
                continue
            for table in document.xpath("//div[@class='indent']//table"):
                parsed = self._parse_table(table)
                if parsed is not None:
                    yield parsed

    def _lookup(self, field, output, done_message):
        """Find the book named in ``lineEdit`` and show one of its fields.

        Args:
            field: Key of the book dict to display (``'price'`` etc.).
            output: Line edit that receives the value, or ``"none"`` when the
                book is not found or the request fails.
            done_message: Text shown in the hint box on success.
        """
        wanted = self.lineEdit.text().strip()
        if not wanted:
            output.setText("none")
            self.textEdit.setText("请输入书名。")
            return
        self.textEdit.setText("加载中,请稍等...")
        # Let Qt repaint the hint before the blocking network calls start.
        QApplication.processEvents()
        try:
            # Returning on the first match abandons the generator, so no
            # further pages are requested.
            for book, _imgurl in self._iter_books(delay=1):
                if book['title'] == wanted:
                    output.setText("{}".format(book[field]))
                    self.textEdit.setText(done_message)
                    return
        except requests.RequestException as exc:
            output.setText("none")
            self.textEdit.setText("查询失败:{}".format(exc))
            return
        output.setText("none")
        self.textEdit.setText("未找到该书。")

    # Price lookup
    def jiage(self):
        """Slot: show the price of the entered book in ``lineEdit_2``."""
        self._lookup('price', self.lineEdit_2, "查询完成。")

    # Rating lookup
    def pingfeng(self):
        """Slot: show the rating of the entered book in ``lineEdit_3``."""
        self._lookup('rating_score', self.lineEdit_3, "查询完毕。")

    # Download
    def xiazai(self):
        """Slot: save all book data to ``book.csv`` and all covers as JPEGs.

        Files are written under ``OUTPUT_DIR`` (the current working directory
        when it is ``None``); covers go to its ``bookposter`` sub-folder, one
        ``<title>.jpg`` per book. A cover that cannot be downloaded is skipped
        and counted in the final hint instead of aborting the whole run.
        """
        self.textEdit.setText("下载中,请稍等...")
        QApplication.processEvents()
        try:
            entries = list(self._iter_books())
        except requests.RequestException as exc:
            self.textEdit.setText("下载失败:{}".format(exc))
            return
        if not entries:
            self.textEdit.setText("下载失败:未获取到任何书籍。")
            return

        base_dir = self.OUTPUT_DIR or os.getcwd()
        poster_dir = os.path.join(base_dir, 'bookposter')
        os.makedirs(poster_dir, exist_ok=True)

        # Write the table first so the data survives even if images fail.
        bookdata = pd.DataFrame([book for book, _imgurl in entries])
        bookdata.to_csv(os.path.join(base_dir, 'book.csv'), index=False, encoding='utf-8-sig')

        failed = 0
        for book, imgurl in entries:
            if not imgurl:
                failed += 1
                continue
            try:
                reply = requests.get(imgurl, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
                reply.raise_for_status()
            except requests.RequestException:
                failed += 1
                continue
            poster_path = os.path.join(poster_dir, self._safe_filename(book['title']) + '.jpg')
            # .content is the raw bytes of the response body.
            with open(poster_path, 'wb') as f:
                f.write(reply.content)

        if failed:
            self.textEdit.setText("下载完毕,{}张图片下载失败,请查看文件。".format(failed))
        else:
            self.textEdit.setText("下载完毕,请查看文件。")
if __name__ == "__main__":
    # Script entry point: create the Qt application, build the form on a
    # plain QWidget, show it, and hand control to the Qt event loop. The
    # event loop's return value becomes the process exit code.
    app = QtWidgets.QApplication(sys.argv)
    widget = QtWidgets.QWidget()
    ui = Ui_Form()
    ui.setupUi(widget)
    widget.show()
    sys.exit(app.exec_())
|