This program has been tested and runs quite stably: it warns you when there is no network connection, and scrapes as soon as the network is up, at most 50 pages per run, selectable as needed. The downside is that the code is rather long-winded and hard to follow; I may be the only one who can read it, so please bear with my limited skill. I wrote it to practise putting everything together in a real project. The interface is fairly clean and friendly, and the whole program is wrapped in exception handling. There are certainly places that could be improved, and I welcome corrections and suggestions from fellow readers.
Here is what the interface looks like (screenshot omitted).
The code follows (the proxy-IP API is not included):
from tkinter import *
import requests
from lxml import html
import random
import xlwt
import time
from retry import retry
from tkinter.filedialog import askdirectory
import threading
import tkinter.messagebox  # message-box pop-ups
root = Tk()
import os
'''
File path selection
'''
xVariable = tkinter.IntVar()
def selectPath():
    path_ = askdirectory(title="路径选择")
    if path_ == "":
        path.get()  # dialog cancelled: keep the current path
    else:
        path_ = path_.replace("/", "\\")  # Windows paths use backslashes
        path.set(path_)


def openPath():
    dir = os.path.dirname(path.get() + "\\")
    os.system('start ' + dir)  # "start <dir>" opens the folder in Explorer


path = StringVar()
path.set(os.path.abspath("."))
text = Listbox(root, font=('微软雅黑', 15), width=52, height=13)
text.grid_configure(row=1, columnspan=6)
sc = Scrollbar(root, command=text.yview)
sc.grid_configure(row=1, column=7, sticky=S + E + N)
sc1 = Scrollbar(root, orient="horizontal", command=text.xview,)
sc1.grid_configure(row=2, columnspan=6, sticky=EW)
text.config(yscrollcommand=sc.set)
text.config(xscrollcommand=sc1.set)
def download():
    try:
        # Quick connectivity check: any failure here falls through to the
        # bare except at the bottom of this function.
        response = requests.get("http://www.xiongmaodaili.cn/").status_code
        if response == 200:
            lujing["state"] = DISABLED
            spbox["state"] = DISABLED
            button["text"] = "正在爬取"
            button["state"] = DISABLED
            save_path = path.get()
            page = xVariable.get()

            def get_ip():
                try:
                    url1 = '代理IP的地址,这里删除了...'  # proxy API URL removed from the post
                    response = requests.get(url1)
                    if response.status_code == 200:
                        # An empty body or a body starting with '{' (a JSON error)
                        # means no usable proxy yet: wait and ask again.
                        while response.text == "" or response.text[0] == '{':
                            time.sleep(2)
                            response = requests.get(url1)
                        # The API returns a multi-line string; split it into
                        # a list of "ip:port" entries.
                        return [x for x in response.text.split('\r')]
                    else:
                        text.insert(END, '请求失败')
                except Exception as e:
                    text.insert(END, e)
            @retry(tries=4, delay=1, backoff=1, jitter=0, max_delay=1)
            def my_request(url):
                requests.adapters.DEFAULT_RETRIES = 15
                s = requests.session()
                s.keep_alive = False  # close old connections so they do not pile up
                global r
                try:
                    ips = get_ip()
                    proxy = {'https': ips[0]}
                    text.insert(END, proxy)
                    r = requests.get(url, headers=head, proxies=proxy, verify=False, timeout=5)
                    r.encoding = 'utf-8'
                except BaseException:
                    # BaseException is deliberately broad here; narrow it if you need to.
                    text.insert(END, "请求失败,开始重试")
                    ips = get_ip()
                    proxy = {'https': ips[0]}
                    text.insert(END, proxy)
                    r = requests.get(url, headers=head, proxies=proxy, verify=False, timeout=5)
                    r.encoding = 'utf-8'
                return r

            global r
            head = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
                "accept": "application/json, text/plain, */*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9"}
            work_book = xlwt.Workbook(encoding="utf-8")
            sheet = work_book.add_sheet("巴州二手房信息")
            sheet.write(0, 0, "标题")
            sheet.write(0, 1, "房型")
            sheet.write(0, 2, "房子大小(㎡)")
            sheet.write(0, 3, "小区名称")
            sheet.write(0, 4, "区域1")
            sheet.write(0, 5, "区域2")
            sheet.write(0, 6, "地址")
            sheet.write(0, 7, "总价(万元)")
            sheet.write(0, 8, "单价(元/㎡)")
            row_num = 1
            z = 0  # pages scraped successfully
            v = 0  # pages attempted
            # The same XPath is used twice below, so build it once: it collects the
            # listing title, the layout spans, the size, the community name, the
            # address parts and the two prices for every listing on the page.
            xpath_expr = ("//div[@class='property-content-title']/h3/text()"
                          "|//p[@class='property-content-info-comm-name']/text()"
                          "|//p[@class='property-content-info-comm-address']//span/text()"
                          "|//span[@class='property-price-total-num']/text()"
                          "|//p[@class='property-price-average']/text()"
                          "|//p[@class='property-content-info-text'][1]/text()"
                          "|//p[@class='property-content-info-text property-content-info-attribute']//span//text()")
            for i in range(1, page + 1):  # 58.com list pages are numbered from 1
                v += 1
                try:
                    url = "https://bygl.58.com/ershoufang/p" + str(i) + "/"
                    text.insert(END, url)
                    text.see(END)
                    text.update()
                    my_request(url)
                except Exception as e:
                    text.insert(END, e)
                    text.insert(END, "第" + str(i) + "页出错!")
                    text.insert(END, "--------------------------")
                    continue
                else:
                    preview_html = html.fromstring(r.text)
                    list_title = [str(x) for x in preview_html.xpath(xpath_expr)]
                    while not list_title:
                        # An empty result usually means the proxy was blocked:
                        # fetch the page once more with a fresh proxy and re-parse.
                        text.insert(END, "列表为空,重新获取代理IP:")
                        my_request(url)
                        preview_html = html.fromstring(r.text)
                        list_title = [str(x) for x in preview_html.xpath(xpath_expr)]
                        time.sleep(random.random() * 2)
                        break  # only one extra attempt per page
                    text.insert(END, "成功爬取第" + str(i) + "页数据")
                    z += 1
                    text.insert(END, "抓取成功率:{:.2%}".format(z / v))
                    text.insert(END, "----------------------------")
                    text.see(END)
                    text.update()
                    # Each listing contributes 14 consecutive items to list_title,
                    # so walk the list in blocks of 14.
                    for j in range(0, len(list_title), 14):
                        biaoti = list_title[j]            # listing title
                        house_type = (list_title[j + 1] + list_title[j + 2] +
                                      list_title[j + 3] + list_title[j + 4] +
                                      list_title[j + 5] + list_title[j + 6])
                        size = list_title[j + 7].strip()  # floor area
                        title = list_title[j + 8]         # community name
                        area1 = list_title[j + 9]
                        area2 = list_title[j + 10]
                        area3 = list_title[j + 11]
                        totalnum = list_title[j + 12]
                        avg = list_title[j + 13]
                        sheet.write(row_num, 0, biaoti)
                        sheet.write(row_num, 1, house_type)
                        sheet.write(row_num, 2, size)
                        sheet.write(row_num, 3, title)
                        sheet.write(row_num, 4, area1)
                        sheet.write(row_num, 5, area2)
                        sheet.write(row_num, 6, area3)
                        sheet.write(row_num, 7, totalnum)
                        sheet.write(row_num, 8, avg)
                        row_num += 1
                    time.sleep(3)
            # Build the output file name; if it already exists, bump a "(k)" suffix
            # until a free name is found, then save the workbook.
            file_name = save_path + "\\" + time.strftime("%Y-%m-%d") + "共" + str(page) + "页数据.xls"
            if os.path.exists(file_name):
                k = 1
                name, suffix = file_name.split(".")
                name = name + "(" + str(k) + ")"
                file_name = name + "." + suffix
                while os.path.exists(file_name):
                    name2 = name.split("(")
                    name1 = name2[1].split(")")
                    k = int(name1[0]) + k
                    file_name = name2[0] + "(" + str(k) + ")" + "." + suffix
            work_book.save(file_name)
            lujing["state"] = ACTIVE
            text.insert(END, "文件生成成功:" + file_name)
            text.insert(END, "\n")
            tkinter.messagebox.showinfo("提示", "文件生成成功:" + file_name)
            button["text"] = "开始爬虫"
            button["state"] = ACTIVE
            spbox["state"] = "readonly"
    except:
        # Anything that failed above (most likely no network) lands here.
        text.insert(END, "网络异常,请检查后重试")
        text.see(END)
def spiderok():
    page = xVariable.get()
    if tkinter.messagebox.askokcancel("", "确定爬取巴州房产" + str(page) + "页的数据\n并保存至" + path.get(), default="cancel"):
        download()


def thread_it(func):
    # Run the scraper in a daemon thread so the Tk window stays responsive.
    t = threading.Thread(target=func)
    t.daemon = True
    t.start()


root.title("巴音郭楞蒙古自治州58同城二手房数据爬取")
root.geometry("820x600+600+230")
def tuichu():
    if tkinter.messagebox.askokcancel(title="提示", message="您确定要退出吗?", default="cancel"):
        root.quit()


root.protocol("WM_DELETE_WINDOW", tuichu)
button = Button(root, text="开始爬虫", font=("微软雅黑", 10), command=lambda: thread_it(spiderok))
button.grid(row=3, sticky=W)
lb = Label(root, text="页数选择:")
lb.grid(row=4, column=0, sticky=W)
spbox = Spinbox(root, from_=1, to=50, increment=1, justify="center", textvariable=xVariable, state="readonly", width=8)
spbox.grid(row=4, column=1, sticky=W)
button1 = Button(root, text="退出", padx=10, font=("微软雅黑", 10), command=tuichu)
button1.grid(row=3, column=4, sticky=E)
Label(root, text="存储路径:").grid(row=5, column=0, sticky=W)
Entry(root, textvariable=path, state="readonly").grid(row=5, column=1, ipadx=170)
lujing = Button(root, text="更改路径", command=selectPath)
lujing.grid(row=5, column=3, sticky=E)
Button(root, text="显示路径", command=openPath).grid(row=5, column=4, sticky=E)
root.update()
root.mainloop()
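
A note on the request logic: my_request pairs the retry decorator with a proxy fetched for each attempt. Stripped of the GUI, the pattern looks roughly like the sketch below; TEST_URL and get_proxy() are made-up placeholders standing in for the real target page and the proxy API that I removed from the post:

# Minimal sketch of the retry-plus-proxy pattern used in my_request above.
# TEST_URL and get_proxy() are placeholders, not real endpoints.
import random
import requests
from retry import retry

TEST_URL = "https://example.com/"
HEAD = {"User-Agent": "Mozilla/5.0"}


def get_proxy():
    # Stand-in for the real proxy API call.
    return random.choice(["1.2.3.4:8080", "5.6.7.8:8080"])


@retry(tries=4, delay=1)
def fetch(url):
    # Each attempt picks a proxy; any exception makes @retry call fetch again,
    # so a failed proxy gets replaced on the next try.
    proxy = {"https": get_proxy()}
    resp = requests.get(url, headers=HEAD, proxies=proxy, timeout=5, verify=False)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    return resp

# resp = fetch(TEST_URL)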
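
The duplicate-filename handling could also be pulled out into a small helper that keeps raising the counter until the name is free. A minimal sketch; the helper name unique_name is mine, not part of the original program:

import os


def unique_name(file_name):
    # Append (1), (2), ... before the extension until the path is unused.
    name, suffix = os.path.splitext(file_name)
    k = 1
    while os.path.exists(file_name):
        file_name = "{}({}){}".format(name, k, suffix)
        k += 1
    return file_name

# Inside download() the save step would then shrink to:
# work_book.save(unique_name(file_name))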