[开发工具] 基于pycharm的简单爬虫

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> 开发工具 -> 基于pycharm的简单爬虫 -> 正文阅读

[开发工具]基于pycharm的简单爬虫

爬取的是豆瓣书籍top250的所有书籍信息和图片，然后做了一个小小的功能界面

首先是用到的一些库：其中sys、os和time是Python标准库自带的，不需要安装；其他库（PyQt5、requests、lxml、pandas）都需要自己用pip安装

from PyQt5 import QtCore, QtGui, QtWidgets
import requests
from lxml import etree
import sys
import time
import os
import pandas as pd
from PyQt5.QtWidgets import QApplication

UI界面是基于qtdesigner做的，效果如图
主要代码解释：headers用来模拟浏览器的请求头，降低被网站识别为爬虫而拒绝访问的可能。对于网页的分页我使用了一个for循环，利用requests获取网页的HTML源码，再用lxml库里的etree.HTML()函数将HTML字符串转换成_Element对象（注意etree.HTML()接收的是字符串，即response.text，而不是response对象本身），然后用xpath把书籍的信息和图片地址抓取出来。因为有些书籍没有简介，所以加了一个if判断。对于价格和评分的查询，使用了两个槽函数：先读取输入框里的书名，然后逐页根据书名匹配书籍，匹配到之后再取出价格或评分。其中mark变量（评分查询里叫sign）用来标识已经匹配到书籍，以便跳出外层的分页循环。
数据的存储和功能的实现

?

?

?

最后上代码

# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'spider.ui'
#
# Created by: PyQt5 UI code generator 5.15.4
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets
import requests
from lxml import etree
import sys
import time
import os
import pandas as pd
from PyQt5.QtWidgets import QApplication

class Ui_Form(object):
    """Qt form for querying and downloading the Douban "Top 250 books" list.

    The widget layout (``setupUi`` / ``retranslateUi``) follows the output of
    ``pyuic5`` for ``spider.ui``.  Three slots are wired to the buttons:

    * ``jiage``    - look up the price of the book named in ``lineEdit``.
    * ``pingfeng`` - look up the rating of the book named in ``lineEdit``.
    * ``xiazai``   - download every cover image and write ``book.csv``.

    All network work runs synchronously inside the slots, so the window does
    not repaint while a request is in flight.
    """

    # The listing is fetched as BASE_URL + offset, PAGE_SIZE books per page.
    BASE_URL = 'https://book.douban.com/top250?start='
    PAGE_COUNT = 10
    PAGE_SIZE = 25
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0'}
    # Seconds to wait for a response before giving up instead of hanging.
    REQUEST_TIMEOUT = 10
    # Preferred output directory; the current working directory is used
    # instead when this path does not exist on the running machine.
    SAVE_DIR = r'C:\Users\hp\PycharmProjects\pythonProject7'
    # Characters replaced by "_" when a book title is used as a file name.
    _FORBIDDEN_FILENAME_CHARS = '\\/:*?"<>|'

    def setupUi(self, Form):
        """Create the child widgets on ``Form`` and connect the button slots."""
        Form.setObjectName("Form")
        Form.resize(400, 300)
        self.label = QtWidgets.QLabel(Form)
        self.label.setGeometry(QtCore.QRect(60, 40, 71, 20))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(Form)
        self.lineEdit.setGeometry(QtCore.QRect(130, 40, 113, 20))
        self.lineEdit.setObjectName("lineEdit")
        self.pushButton = QtWidgets.QPushButton(Form)
        self.pushButton.setGeometry(QtCore.QRect(70, 80, 75, 23))
        self.pushButton.setObjectName("pushButton")
        self.pushButton_2 = QtWidgets.QPushButton(Form)
        self.pushButton_2.setGeometry(QtCore.QRect(230, 80, 75, 23))
        self.pushButton_2.setObjectName("pushButton_2")
        self.label_2 = QtWidgets.QLabel(Form)
        self.label_2.setGeometry(QtCore.QRect(30, 130, 31, 16))
        self.label_2.setObjectName("label_2")
        self.lineEdit_2 = QtWidgets.QLineEdit(Form)
        self.lineEdit_2.setGeometry(QtCore.QRect(60, 130, 113, 20))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.label_3 = QtWidgets.QLabel(Form)
        self.label_3.setGeometry(QtCore.QRect(190, 130, 31, 16))
        self.label_3.setObjectName("label_3")
        self.lineEdit_3 = QtWidgets.QLineEdit(Form)
        self.lineEdit_3.setGeometry(QtCore.QRect(220, 130, 113, 20))
        self.lineEdit_3.setObjectName("lineEdit_3")
        self.pushButton_3 = QtWidgets.QPushButton(Form)
        self.pushButton_3.setGeometry(QtCore.QRect(130, 190, 111, 23))
        self.pushButton_3.setObjectName("pushButton_3")
        self.textEdit = QtWidgets.QTextEdit(Form)
        self.textEdit.setGeometry(QtCore.QRect(240, 250, 151, 31))
        self.textEdit.setObjectName("textEdit")

        self.retranslateUi(Form)
        self.pushButton.clicked.connect(self.jiage)
        self.pushButton_2.clicked.connect(self.pingfeng)
        self.pushButton_3.clicked.connect(self.xiazai)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Set the user-visible texts of the form and its widgets."""
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "Form"))
        self.label.setText(_translate("Form", "请输入书名"))
        self.pushButton.setText(_translate("Form", "查询价格"))
        self.pushButton_2.setText(_translate("Form", "查询评分"))
        self.label_2.setText(_translate("Form", "价格"))
        self.label_3.setText(_translate("Form", "评分"))
        self.pushButton_3.setText(_translate("Form", "下载所有信息和图片"))
        self.textEdit.setHtml(_translate("Form",
                                         "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n"
                                         "<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\n"
                                         "p, li { white-space: pre-wrap; }\n"
                                         "</style></head><body style=\" font-family:\'SimSun\'; font-size:9pt; font-weight:400; font-style:normal;\">\n"
                                         "<p style=\" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;\"><span style=\" font-size:10pt; font-style:italic; color:#7f7c7f;\">提示框：</span></p></body></html>"))

    # ------------------------------------------------------------------
    # Pure helpers (no Qt, no network)
    # ------------------------------------------------------------------
    @staticmethod
    def _first(nodes, default=None):
        """Return the first item of ``nodes``, or ``default`` when it is empty."""
        return nodes[0] if nodes else default

    @staticmethod
    def _parse_info(info):
        """Split a "/"-separated info line into ``(author, press_time, price)``.

        The author is the first field, the price the last one and the press
        time the one before the last.  Fields are returned exactly as they
        appear between the slashes (surrounding whitespace is kept).  When
        the line has fewer than two fields the press time is ``''``.
        """
        parts = info.split('/')
        press_time = parts[-2] if len(parts) >= 2 else ''
        return parts[0], press_time, parts[-1]

    @classmethod
    def _safe_filename(cls, title):
        """Return ``title`` with path-hostile characters replaced by ``_``."""
        cleaned = ''.join(
            '_' if ch in cls._FORBIDDEN_FILENAME_CHARS else ch for ch in title
        )
        return cleaned or 'untitled'

    # ------------------------------------------------------------------
    # Scraping helpers
    # ------------------------------------------------------------------
    def _fetch_tables(self, page):
        """Download listing page number ``page`` and return its book tables.

        Raises ``requests.RequestException`` on timeouts, connection errors
        and non-success HTTP status codes.
        """
        url = self.BASE_URL + str(page * self.PAGE_SIZE)
        response = requests.get(url, headers=self.HEADERS,
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        tree = etree.HTML(response.text)
        if tree is None:
            return []
        return tree.xpath("//div[@class='indent']//table")

    def _find_book(self, name):
        """Return the table whose title attribute equals ``name``, else None.

        Pages are scanned in order with a one-second pause between requests;
        scanning stops at the first match.
        """
        for page in range(self.PAGE_COUNT):
            if page:
                time.sleep(1)
            for table in self._fetch_tables(page):
                title = self._first(table.xpath(".//td[@valign='top']//a/@title"))
                if title is not None and name == "{}".format(title):
                    return table
        return None

    @classmethod
    def _price_of(cls, table):
        """Return the price field of a book table, or None when it is absent."""
        info = cls._first(table.xpath(".//td[@valign='top']//p[1]/text()"))
        return None if info is None else cls._parse_info(info)[2]

    @classmethod
    def _score_of(cls, table):
        """Return the rating text of a book table, or None when it is absent."""
        return cls._first(table.xpath(".//span[@class='rating_nums']/text()"))

    def _run_lookup(self, extract, output, done_message):
        """Search for the book named in ``lineEdit`` and show one of its values.

        ``extract`` maps the matching table to the value (or None), ``output``
        is the line edit receiving the value and ``done_message`` is written
        to the hint box on success.  ``output`` shows "none" when nothing was
        found or the request failed, and the hint box always leaves the
        "loading" state.
        """
        name = self.lineEdit.text()
        self.textEdit.setText("加载中，请稍等...")
        QApplication.processEvents()  # repaint before the blocking requests
        try:
            table = self._find_book(name)
        except requests.RequestException:
            output.setText("none")
            self.textEdit.setText("网络请求失败，请稍后重试。")
            return
        value = extract(table) if table is not None else None
        if value is None:
            output.setText("none")
            self.textEdit.setText("未找到相关信息。")
            return
        output.setText("{}".format(value))
        self.textEdit.setText(done_message)

    # Slot: price lookup
    def jiage(self):
        """Show the price of the entered book in ``lineEdit_2``."""
        self._run_lookup(self._price_of, self.lineEdit_2, "查询完成。")

    # Slot: rating lookup
    def pingfeng(self):
        """Show the rating of the entered book in ``lineEdit_3``."""
        self._run_lookup(self._score_of, self.lineEdit_3, "查询完毕。")

    def _collect_books(self):
        """Scrape every listing page into a list of ``(book, image_url)``.

        ``book`` is the dict written to the CSV; ``image_url`` is None when
        the table has no cover image.  Tables without a title are skipped.
        """
        records = []
        for page in range(self.PAGE_COUNT):
            for table in self._fetch_tables(page):
                title = self._first(table.xpath(".//td[@valign='top']//a/@title"))
                if title is None:
                    continue
                info = self._first(table.xpath(".//td[@valign='top']//p[1]/text()"), '')
                author, press_time, price = self._parse_info(info)
                # xpath() always returns a list, so "no quote" is an empty
                # list rather than None; store the quote text or None.
                produce = self._first(table.xpath(".//p[@class='quote']/span/text()"))
                # NOTE: the 'authon' key (sic) is kept because it is the
                # existing CSV column name.
                book = {'title': title, 'authon': author, 'price': price,
                        'press_time': press_time,
                        'rating_score': self._score_of(table),
                        'produce': produce}
                records.append((book, self._first(table.xpath(".//a/img/@src"))))
        return records

    def _output_dir(self):
        """Return ``SAVE_DIR`` if it exists, else the current working directory."""
        return self.SAVE_DIR if os.path.isdir(self.SAVE_DIR) else os.getcwd()

    # Slot: bulk download
    def xiazai(self):
        """Download all cover images and write the book data to ``book.csv``.

        Images go to ``<output dir>/bookposter/<title>.jpg`` and the CSV to
        ``<output dir>/book.csv``.  However many books were actually scraped
        are saved; a network or file-system error is reported in the hint box
        instead of propagating out of the slot.
        """
        self.textEdit.setText("下载中，请稍等...")
        QApplication.processEvents()  # repaint before the blocking requests
        try:
            records = self._collect_books()
            base_dir = self._output_dir()
            poster_dir = os.path.join(base_dir, 'bookposter')
            os.makedirs(poster_dir, exist_ok=True)
            for book, img_url in records:
                if img_url is None:
                    continue
                # .content is the raw bytes of the image.
                img = requests.request('GET', img_url,
                                       timeout=self.REQUEST_TIMEOUT).content
                target = os.path.join(poster_dir,
                                      self._safe_filename(book['title']) + '.jpg')
                with open(target, 'wb') as f:
                    f.write(img)
            bookdata = pd.DataFrame([book for book, _ in records])
            bookdata.to_csv(os.path.join(base_dir, 'book.csv'),
                            index=False, encoding='utf-8-sig')
        except (requests.RequestException, OSError):
            self.textEdit.setText("下载失败，请检查网络或保存目录后重试。")
            return

        self.textEdit.setText("下载完毕，请查看文件。")



if __name__ == "__main__":
    # Script entry point: build the Qt application, attach the generated
    # form to a plain QWidget, show it and hand control to the event loop.
    application = QtWidgets.QApplication(sys.argv)
    window = QtWidgets.QWidget()
    form = Ui_Form()
    form.setupUi(window)
    window.show()
    # exec_() blocks until the window closes; its return value becomes the
    # process exit status.
    exit_code = application.exec_()
    sys.exit(exit_code)

开发工具最新文章

Postman接口测试之Mock快速入门

ASCII码空格替换查表_最全ASCII码对照表0-2

如何使用 ssh 建立 socks 代理

Typora配合PicGo阿里云图床配置

SoapUI、Jmeter、Postman三种接口测试工具的

github用相对路径显示图片_GitHub 中 readm

Windows编译g2o及其g2o viewer

解决jupyter notebook无法连接/ jupyter连接

Git恢复到之前版本

VScode常用快捷键

加:2021-12-23 15:56:13 更:2021-12-23 15:57:42

360图书馆购物三丰科技阅读网日历万年历 2025年12日历

-2025/12/8 13:47:07-

图片自动播放器
↓图片自动播放器↓

TxT小说阅读器
↓语音阅读,小说下载,古典文学↓

一键清除垃圾
↓轻轻一点,清除系统垃圾↓

图片批量下载器
↓批量下载图片,美女图库↓

网站联系: qq:121756557 email:121756557@qq.com IT数码