[Python知识库] 案例： USNews 世界大学榜单 Python selenium 实践

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> Python知识库 -> 案例： USNews 世界大学榜单 Python selenium 实践 -> 正文阅读

[Python知识库]案例： USNews 世界大学榜单 Python selenium 实践

如果你是新手，通过阅读此案例，可以参考解决的问题及习得的技巧：

selenium：

1、判断元素是否存在

2、懒加载，控制台执行js，页面滑动最下方

3、按钮因遮挡导致不可点击时，强制点击

4、隐藏自动化测试标签和静默执行

5、获取当前加载页面的源码

pandas：

1、保存excel时不替换原有文件，新增sheet保存

2、DataFrame 添加字典数据时，默认列名字典顺序排序，保存加columns固定顺序

css： 类名中存在空格（即元素带有多个类名）时，选择器中用 . 代替空格

print： 打印同类信息时，在当前行打印，不换行

源码：

import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Reference timestamp taken at import time, used for elapsed-time reporting.
# time.clock() was removed in Python 3.8, so fall back to time.perf_counter()
# on interpreters that no longer provide it.
start = time.clock() if hasattr(time, 'clock') else time.perf_counter()


# CSS hooks for the US News listing page.
# NOTE(review): these class names look auto-generated, so they presumably
# change when the site is redeployed -- confirm before each run.
_COUNT_SELECTOR = '.filter-bar__CountContainer-sc-1glfoa-5.kFwGjm'
_LOAD_MORE_SELECTOR = '.button__ButtonStyled-sc-1vhaw8r-1.kDQStt.pager__ButtonStyled-sc-1i8e93j-1.dypUdv.type-secondary.size-large'
_HEADING_CLASS = 'Heading__HeadingStyled-sc-1w5xk2o-0-h2 heunUr Heading-sc-1w5xk2o-1 cRrhAX md-mb2'
_CARD_CLASS = 'DetailCardGlobalUniversities__TextContainer-sc-1v60hm5-3 fInsHn'
_LOCATION_CLASS = 'Paragraph-sc-1iyax29-0 pyUjv'
_STAT_CLASS = 'QuickStatHug__Description-hb1bl8-1 eXguFl'
_RANK_CLASS = 'RankList__Rank-sc-2xewen-2 fxzjOx ranked has-badge'

# Output column order; fixed here so the sheet layout never depends on dict order.
_COLUMNS = ['排名', '院校', '国家', '评分', '注册', '网址']

_PAGE_LOAD_SECONDS = 2       # pause after the initial page load
_SCROLL_PAUSE_SECONDS = 4    # pause after each scroll so lazy content can load
_MAX_STALLED_ROUNDS = 5      # give up after this many rounds without new cards


def _load_all_cards(browser, expected_total):
    """Scroll and click "load more" until *expected_total* cards are present.

    Stops early (with a message) when the number of loaded cards has not grown
    for ``_MAX_STALLED_ROUNDS`` consecutive rounds, so a changed page layout or
    a mis-click cannot keep the loop running forever.
    """
    previous_count = -1
    stalled_rounds = 0
    while True:
        # Scrolling to the bottom triggers the page's lazy loading.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(_SCROLL_PAUSE_SECONDS)

        soup = BeautifulSoup(browser.page_source, 'lxml')
        loaded = len(soup.find_all("h2", class_=_HEADING_CLASS))
        print(f'\r当前已加载{loaded}条数据,共需加载{expected_total}条', end='')
        if loaded >= expected_total:
            return

        stalled_rounds = 0 if loaded > previous_count else stalled_rounds + 1
        previous_count = loaded
        if stalled_rounds >= _MAX_STALLED_ROUNDS:
            print(f'\nStopped loading: no new entries after {_MAX_STALLED_ROUNDS} attempts.')
            return

        # find_elements returns an empty list instead of raising when the
        # button is absent, which doubles as the existence check.
        buttons = browser.find_elements(By.CSS_SELECTOR, _LOAD_MORE_SELECTOR)
        if buttons:
            try:
                # When an overlay blocks a plain click, move to the element and
                # click there; this may occasionally hit an advertisement.
                webdriver.ActionChains(browser).move_to_element(buttons[0]).click(buttons[0]).perform()
            except WebDriverException as exc:
                print('Error:', exc)


def _parse_rankings(html):
    """Extract one dict per university card from the listing page *html*."""
    soup = BeautifulSoup(html, 'lxml')
    rows = []
    for card in soup.find_all('div', class_=_CARD_CLASS):
        anchor = card.find('h2').find('a')
        stats = card.find_all("dd", class_=_STAT_CLASS)
        # Some cards carry no rank element; check the node before reading
        # .text so those rows get 'N/A' instead of raising AttributeError.
        rank_node = card.find("div", class_=_RANK_CLASS)
        rank = rank_node.text.replace('#', '') if rank_node is not None else ''
        rows.append({
            '排名': rank or 'N/A',
            '院校': anchor.text,
            '国家': card.find("p", class_=_LOCATION_CLASS).text,
            '评分': stats[0].text,
            '注册': stats[1].text,
            '网址': anchor['href'],
        })
    return rows


def _save_sheet(path, sheet_name, rows):
    """Write *rows* to sheet *sheet_name* of the workbook at *path*."""
    frame = pd.DataFrame(rows, columns=_COLUMNS)
    # The existence check must happen before the writer is created, because
    # opening a writer in 'w' mode creates/truncates the file.
    if os.path.exists(path):
        # Append to the existing workbook; a sheet with the same name is replaced.
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_options = {'mode': 'w'}
    with pd.ExcelWriter(path, engine='openpyxl', **writer_options) as writer:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)


def get_rankings(path, URL, title):
    """Scrape one US News ranking listing and store it as an Excel sheet.

    Opens *URL* in Chrome, loads the complete list by scrolling and clicking
    the "load more" button, parses every university card and writes the result
    to the sheet *title* of the workbook at *path*.  An existing workbook is
    kept and only the sheet *title* is added or replaced (appending requires
    pandas 1.3+ for ``if_sheet_exists``).

    Args:
        path: Path of the ``.xlsx`` workbook to create or extend.
        URL: Address of the ranking listing page.
        title: Name of the worksheet to write.

    Raises:
        NoSuchElementException: If the school counter is not found on the page.
        ValueError: If the school counter text is not a number.
    """
    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Hide the "controlled by automated test software" banner.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Optional: add the '--headless' argument here to run without a window.
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(URL)
        time.sleep(_PAGE_LOAD_SECONDS)
        count_text = browser.find_element(By.CSS_SELECTOR, _COUNT_SELECTOR).text
        expected_total = int(count_text.replace(' schools', '').replace(',', ''))
        _load_all_cards(browser, expected_total)
        html = browser.page_source
    finally:
        # quit() ends the whole driver session, even when scraping failed.
        browser.quit()

    _save_sheet(path, title, _parse_rankings(html))


def check_element_exists(driver, condition, element):
    """Return whether *element* can be located on the driver's current page.

    Args:
        driver: Selenium WebDriver (or any object with a compatible
            ``find_element(by, value)`` method).
        condition: Locator strategy: ``'class'``, ``'id'``, ``'xpath'`` or
            ``'css'``.
        element: Locator value matching *condition* (class name, id, XPath
            expression or CSS selector).

    Returns:
        True if the element was found.  False if it is absent, or if the
        lookup failed with another WebDriver error (which is printed).

    Raises:
        ValueError: If *condition* is not one of the supported strategies.
    """
    locators = {
        'class': By.CLASS_NAME,
        'id': By.ID,
        'xpath': By.XPATH,
        'css': By.CSS_SELECTOR,
    }
    if condition not in locators:
        # An unknown strategy used to fall through and report True.
        raise ValueError(f'unsupported condition: {condition!r}')

    try:
        driver.find_element(locators[condition], element)
    except NoSuchElementException:
        # Absence is the expected negative answer, not an error.
        return False
    except WebDriverException as e:
        print('\n寻找元素出错:', e)
        return False
    return True


if __name__ == '__main__':
    # Workbook that receives one sheet per scraped region.
    path = r'../DataCache/22USNews_demo.xlsx'
    # Sheet name -> listing URL.  Uncomment an entry to scrape that region too.
    page_urls = {
        # 'world': 'https://www.usnews.com/education/best-global-universities/search',
        # 'africa': 'https://www.usnews.com/education/best-global-universities/africa',
        # 'asia': 'https://www.usnews.com/education/best-global-universities/asia',
        'australia-new-zealand': 'https://www.usnews.com/education/best-global-universities/australia-new-zealand',
        # 'europe': 'https://www.usnews.com/education/best-global-universities/europe',
        # 'latin-america': 'https://www.usnews.com/education/best-global-universities/latin-america',
    }

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # wall-clock replacement intended for measuring elapsed time.
    run_started = time.perf_counter()
    for urlkey, url in page_urls.items():
        # Time each region separately so "Elapsed" is per region, not cumulative.
        region_started = time.perf_counter()
        get_rankings(path, url, urlkey)
        print(f'\nElapsed:{round(time.perf_counter() - region_started, 2)} Seconds for: {urlkey}')

    print(f"Total time: {round(time.perf_counter() - run_started, 2)} seconds.")

Python知识库最新文章

Python中String模块

【Python】 14-CVS文件操作

python的panda库读写文件

使用Nordic的nrf52840实现蓝牙DFU过程

【Python学习记录】numpy数组用法整理

Python学习笔记

python字符串和列表

python如何从txt文件中解析出有效的数据

Python编程从入门到实践自学/3.1-3.2

python变量

加:2022-02-28 15:25:00 更:2022-02-28 15:27:40

360图书馆购物三丰科技阅读网日历万年历 2026年2日历

-2026/2/25 0:49:06-

图片自动播放器
↓图片自动播放器↓

TxT小说阅读器
↓语音阅读,小说下载,古典文学↓

一键清除垃圾
↓轻轻一点,清除系统垃圾↓

图片批量下载器
↓批量下载图片,美女图库↓

网站联系: qq:121756557 email:121756557@qq.com IT数码