如果你是新手,通过阅读此案例,可以参考解决的问题及习得的技巧:
selenium:
1、判断元素是否存在
2、懒加载,控制台执行js,页面滑动最下方
3、按钮因遮挡导致不可点击时,强制点击
4、隐藏 自动化测试标签 和 静默执行
5、获取当前加载页面的源码
pandas:
1、保存excel时 不替换原有文件,新增sheet保存
2、DataFrame 添加字典数据时,默认列名字典顺序排序,保存加columns固定顺序
css: 类名存在空格时,用 . 代替空格
print: 打印同类信息时,在当前行打印,不换行
源码:
import os
import pandas as pd
from bs4 import BeautifulSoup
import time
from openpyxl import load_workbook
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Reference timestamp captured when the module is loaded.
# time.clock() was removed in Python 3.8, so fall back to time.perf_counter()
# when it is missing. On older interpreters the original clock is kept so that
# any code subtracting `start` from a later time.clock() reading still compares
# values taken from the same clock.
start = getattr(time, 'clock', time.perf_counter)()
def get_rankings(path, URL, title):
    """Scrape one ranking list page and store its rows as a sheet of an Excel file.

    Opens ``URL`` in Chrome, keeps scrolling / clicking the "load more" button
    until the number of rows announced by the page has been loaded, parses the
    school cards out of the final page source and writes them to ``path``.

    The CSS class names used here are hard-coded for the page layout this
    script targets; they will need updating if the site changes its markup.

    Args:
        path: Excel workbook to write. If it already exists the data is added
            as an additional sheet instead of replacing the file.
        URL: Address of the ranking page to scrape.
        title: Sheet name for the scraped rows.

    Raises:
        ValueError: If the school count shown on the page cannot be parsed as
            an integer.
    """
    # Imported locally so this function does not rely on the file's import block.
    from selenium.webdriver.common.by import By

    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Hide the "controlled by automated test software" banner.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # options.add_argument('--headless')  # run without a visible browser window
    # options.add_argument("user-agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'")
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(URL)
        time.sleep(2)
        count_text = browser.find_element(
            By.CSS_SELECTOR, ".filter-bar__CountContainer-sc-1glfoa-5.kFwGjm").text
        shcools = count_text.replace(' schools', '').replace(',', '')
        _load_all_rows(browser, shcools)
        soup = BeautifulSoup(browser.page_source, 'lxml')
    finally:
        # Always release the browser and its driver process, even on failure.
        browser.quit()

    _append_sheet(path, title, _parse_rows(soup))


def _load_all_rows(browser, expected_text, max_stalled_rounds=5):
    """Scroll down and click "load more" until all expected rows are present.

    Stops early when ``max_stalled_rounds`` consecutive rounds bring no new
    rows, so a page that never reaches the announced count cannot hang the
    script forever.
    """
    from selenium.webdriver.common.by import By

    # Parsed once, outside the retry loop, so a malformed count fails fast
    # instead of being swallowed and retried endlessly.
    expected = int(expected_text)
    button_selector = '.button__ButtonStyled-sc-1vhaw8r-1.kDQStt.pager__ButtonStyled-sc-1i8e93j-1.dypUdv.type-secondary.size-large'
    loaded = 0
    stalled_rounds = 0
    while loaded < expected and stalled_rounds < max_stalled_rounds:
        before = loaded
        try:
            # Jump to the bottom of the page to trigger lazy loading.
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(4)
            soup = BeautifulSoup(browser.page_source, 'lxml')
            loaded = len(soup.find_all(
                "h2",
                class_="Heading__HeadingStyled-sc-1w5xk2o-0-h2 heunUr Heading-sc-1w5xk2o-1 cRrhAX md-mb2"))
            print(f'\r当前已加载{loaded}条数据,共需加载{expected_text}条', end='')
            if loaded < expected and check_element_exists(browser, 'css', button_selector):
                button = browser.find_element(By.CSS_SELECTOR, button_selector)
                # Move to the button before clicking because it can be covered
                # by other elements; this may occasionally hit an ad instead.
                webdriver.ActionChains(browser).move_to_element(button).click(button).perform()
        except Exception as e:
            # Best-effort retry: a failed round is reported and counted as stalled.
            print('Error:', e)
        stalled_rounds = 0 if loaded > before else stalled_rounds + 1
    if loaded < expected:
        print(f'\nStopped after {max_stalled_rounds} rounds without new rows: '
              f'{loaded} of {expected} loaded.')


def _parse_rows(soup):
    """Turn every school card found in the parsed page into a row dictionary."""
    rows = []
    for card in soup.find_all(
            'div', class_='DetailCardGlobalUniversities__TextContainer-sc-1v60hm5-3 fInsHn'):
        anchor = card.find('h2').find('a')
        stats = card.find_all("dd", class_="QuickStatHug__Description-hb1bl8-1 eXguFl")
        # Some cards have no rank badge; report those as 'N/A' instead of crashing.
        rank_tag = card.find("div", class_="RankList__Rank-sc-2xewen-2 fxzjOx ranked has-badge")
        rank = rank_tag.text.replace('#', '') if rank_tag is not None else ''
        rows.append({
            '排名': rank or 'N/A',
            '院校': anchor.text,
            '国家': card.find("p", class_="Paragraph-sc-1iyax29-0 pyUjv").text,
            '评分': stats[0].text,
            '注册': stats[1].text,
            '网址': anchor['href'],
        })
    return rows


def _append_sheet(path, title, rows):
    """Write the rows to a sheet named ``title``, keeping any existing workbook."""
    if not rows:
        print(f'\nNo rows scraped for {title}; nothing written to {path}.')
        return
    if os.path.exists(path):
        # Append to the existing workbook; if the sheet name is already taken,
        # a new sheet is created rather than overwriting the old data.
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'new'}
    else:
        writer_kwargs = {'mode': 'w'}
    frame = pd.DataFrame(rows)
    # The context manager saves and closes the workbook, also on errors.
    with pd.ExcelWriter(path, engine='openpyxl', **writer_kwargs) as writer:
        # Passing columns explicitly pins the column order to the dict key order.
        frame.to_excel(writer, sheet_name=title, index=False, columns=list(rows[0]))
def check_element_exists(driver, condition, element):
    """Report whether an element can be located on the driver's current page.

    Args:
        driver: Object exposing ``find_element(by, value)`` (a Selenium
            WebDriver in this script).
        condition: Lookup kind: ``'class'``, ``'id'``, ``'xpath'`` or ``'css'``.
        element: The class name, id, XPath expression or CSS selector to find.

    Returns:
        True if the lookup succeeded, False if it raised (the error is printed).

    Raises:
        ValueError: If ``condition`` is not one of the supported lookup kinds.
    """
    # Locator strategy strings understood by WebDriver.find_element; they are
    # the values of Selenium's By.CLASS_NAME, By.ID, By.XPATH and By.CSS_SELECTOR.
    strategies = {
        'class': 'class name',
        'id': 'id',
        'xpath': 'xpath',
        'css': 'css selector',
    }
    if condition not in strategies:
        # Previously an unknown condition skipped the search and reported True.
        raise ValueError(f'unsupported condition: {condition!r}')
    try:
        driver.find_element(strategies[condition], element)
    except Exception as e:
        # Best-effort probe: any lookup failure means "not available".
        print('\n寻找元素出错:', e)
        return False
    return True
# Script entry point: scrape every enabled region and report how long it took.
if __name__ == '__main__':
    path = r'../DataCache/22USNews_demo.xlsx'
    # Region name -> ranking page; the region name is also used as the sheet
    # name. Uncomment an entry to include that region in the run.
    page_urls = {
        # 'world': 'https://www.usnews.com/education/best-global-universities/search',
        # 'africa': 'https://www.usnews.com/education/best-global-universities/africa',
        # 'asia': 'https://www.usnews.com/education/best-global-universities/asia',
        'australia-new-zealand': 'https://www.usnews.com/education/best-global-universities/australia-new-zealand',
        # 'europe': 'https://www.usnews.com/education/best-global-universities/europe',
        # 'latin-america': 'https://www.usnews.com/education/best-global-universities/latin-america',
    }
    # time.clock() was removed in Python 3.8; perf_counter() measures wall time.
    run_start = time.perf_counter()
    for urlkey, url in page_urls.items():
        # Timed per URL so "Elapsed" reports this region only, not the whole run.
        url_start = time.perf_counter()
        get_rankings(path, url, urlkey)
        print(f'\nElapsed:{round(time.perf_counter() - url_start, 2)} Seconds for: {urlkey}')
    print(f"Total time: {round(time.perf_counter() - run_start, 2)} seconds.")
|