0x00 前言
好久没敲代码,刚好通过森森发的活提升一下
webdriver以chrome为例
为什么使用selenium?
通过requests只能爬取原网页代码 通过selenium才可以爬取执行 js 后的网页数据,实现“所见即所得”
以All Articles - PortSwigger Research 为例
审查元素时经过渲染更为美观,a标签均处于
下
但直接查看源代码我们就找不到(requests同理) 后面会发现源代码中的a位置比较杂乱无章不方便定位
环境准备
1.下载驱动
https://chromedriver.storage.googleapis.com/index.html
根据自己浏览器版本选择合适驱动,放置于python目录下
EG:
2.安装库
pip install selenium
3.python3即可
本文是3.9
元素定位
参考文章
如何使用Xpath定位元素(史上最清晰的讲解)_qq_43022048的博客-CSDN博客_xpath定位元素
EG:
明确我们要定位的目标,每个a标签中的href
位于div class="infinitescroller cmsinfinitescroller" /div class="pagePlaceholder tilelayoutengine-page fourcolumns"
折叠收缩一下更为直观
经过测试第二层div columns会变动
懒一点直接从大的开始定位 因为class_name独特
走相对路径
//div[@class='infinitescroller cmsinfinitescroller']/div/a
浅析一下:
第一层通过class直接定位大div,/div到第二层div(这里没必要进行属性描述,因为三个二层div我们都要,如果针对特定某个div可以在后面加上索引)
最后直接定位a(备注同第二层div)
PS:新版selenium已经废弃了find_element_by_* 命令,直接用find_element
(个人觉得XPATH最好用,比较无脑)
Selenium3自动化测试【21】find_element定位元素_探索之家的技术博客_51CTO博客
贴script
# --- Script 1 setup: collect article links from the research index page ---
import tempfile, os.path as path
import time
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# Listing page with all PortSwigger research articles.
URL='https://portswigger.net/research/articles'
# Throwaway driver log in a fresh temp directory.
log_path = path.join(tempfile.mkdtemp(), 'ghostdriver.log')
# NOTE(review): hard-coded local chromedriver path — adjust per machine.
s = Service(executable_path=r"C:\Users\87308\AppData\Local\Programs\Python\Python39\chromedriver.exe")
# NOTE(review): service_log_path was deprecated and later removed in Selenium >= 4.10 — confirm installed version.
driver = webdriver.Chrome(service=s, service_log_path=log_path)
def get_a(url, driver=driver):
    """Open `url` and return every article <a> element in the infinite-scroller grid.

    Relies on the implicit wait so JS-rendered anchors have time to appear.
    """
    driver.get(url)
    driver.implicitly_wait(10)
    anchor_xpath = "//div[@class='infinitescroller cmsinfinitescroller']/div/a"
    return driver.find_elements(By.XPATH, anchor_xpath)
r = get_a(URL)
try:
    # BUG FIX: the original reopened the output file (and slept 3 s) once per
    # link; open it a single time and write all hrefs.  The sleep guarded
    # nothing — get_attribute talks to the local browser, not the website.
    with open(r'./portswigger_researchurl.txt', 'a+', encoding='utf-8') as f:
        for anchor in r:
            href = anchor.get_attribute("href")
            if href:  # anchors without an href would have crashed the "+" concat
                f.write(href + '\n')
finally:
    # Always release the browser, even if scraping fails partway through.
    driver.quit()
0x01 script
根据链接爬取文章写入数据库文件
import tempfile, os.path as path
import time
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import sqlite3
import traceback
import logging
import pymysql
# --- Script 2 configuration: scrape each saved URL into a SQLite database ---
DATABASE = 'portswigger_research.db'  # SQLite file, created on first run
TABLE_NAME = 'portswigger_research'
# Throwaway driver log in a fresh temp directory.
log_path = path.join(tempfile.mkdtemp(), 'ghostdriver.log')
# NOTE(review): hard-coded local chromedriver path — adjust per machine.
s = Service(executable_path=r"C:\Users\87308\AppData\Local\Programs\Python\Python39\chromedriver.exe")
options = webdriver.ChromeOptions()
# Skip image downloads to speed up page loads (we only need src URLs, not pixels).
options.add_argument('blink-settings=imagesEnabled=false')
# NOTE(review): service_log_path was deprecated and later removed in Selenium >= 4.10 — confirm installed version.
driver = webdriver.Chrome(service=s, service_log_path=log_path, options=options)
# NOTE(review): constant name has a typo ("CRAETE") — kept as-is because it is referenced in __main__ below.
CRAETE_TABLE_SQL = 'CREATE TABLE {0} (id integer primary key autoincrement, title text ,content text, url text, author text, publish_time text, update_time text );'
# WARNING(review): values are spliced into the SQL via str.format — SQL-injection-prone
# and breaks on quotes; prefer sqlite3 "?" parameter placeholders.
INSERT_DATA = "INSERT INTO {0} (title,content,url,author,publish_time,update_time) VALUES ('{1}','{2}', '{3}', '{4}', '{5}', '{6}');"
def get_content(url, driver=driver):
    """Scrape a single PortSwigger research article.

    Returns a 6-tuple: (title, content, url, author, publish_time, update_time).
    `content` is the body paragraphs + linked code snippets + image URLs, with
    single quotes stripped (legacy workaround for the quote-splicing INSERT).
    """
    driver.get(url)
    driver.implicitly_wait(10)
    title = driver.find_element(By.XPATH, "//div[@class='section theme-navy-1']/h1").text
    author = driver.find_element(By.XPATH, "//div[@class='section theme-navy-1']/div[@class='callout-individual-profile']/div/h3").text
    p_time = driver.find_element(By.XPATH, "//ul[@class='publication-list']/li[1]/p")
    # NOTE(review): slicing assumes a fixed-length label prefix on these
    # strings (e.g. "Published: ") — confirm against the live page markup.
    publish_time = p_time.text[11:]
    u_time = driver.find_element(By.XPATH, "//ul[@class='publication-list']/li[2]/p")
    update_time = u_time.text[12:]
    text_part = driver.find_elements(By.XPATH, "//div[@class='section theme-navy-1']/p")
    # str.join instead of the original quadratic += concatenation.
    research_text = '\n'.join(p.text for p in text_part).strip()
    code_part = driver.find_elements(By.XPATH, "//code/a")
    research_code = ''.join(c.text + '\n' for c in code_part)
    img_part = driver.find_elements(By.XPATH, "//img")
    # BUG FIX: get_attribute("src") can return None, which crashed the old "+" concat.
    srcs = (i.get_attribute("src") for i in img_part)
    research_img = ''.join(src + '\n' for src in srcs if src)
    research_content = research_text + research_code + research_img
    # Strip single quotes so the legacy string-spliced INSERT does not break.
    research_content = research_content.replace("'", "")
    title = title.replace("'", "")
    return title, research_content, url, author, publish_time, update_time
def InsertData(title, research_content, url, author, publish_time, update_time):
    """Insert one scraped article row into the SQLite table.

    Returns 1 on success, 0 on failure (errors are printed and logged, not raised).
    """
    # BUG FIX: use "?" parameter placeholders instead of the str.format-spliced
    # INSERT_DATA template — the old approach was SQL-injection-prone and broke
    # on any quote character the caller's replace("'","") hack missed.
    sql = ("INSERT INTO {0} (title,content,url,author,publish_time,update_time) "
           "VALUES (?,?,?,?,?,?);").format(TABLE_NAME)
    params = (title, research_content, url, author, publish_time, update_time)
    conn = None
    try:
        conn = sqlite3.connect(DATABASE)
        with conn:  # commits on success, rolls back on error
            conn.execute(sql, params)
        print("Insert successfully!")
        return 1
    except Exception:
        print(sql)
        print(traceback.format_exc())
        logging.debug(sql)
        logging.debug(traceback.format_exc())
        return 0
    finally:
        # BUG FIX: the connection used to leak when execute() raised.
        if conn is not None:
            conn.close()
def CreateTable(database, table, sql):
    """Create `table` in the SQLite file `database` from the template `sql`.

    `sql` must contain a single `{0}` placeholder for the table name.
    Errors (e.g. table already exists) are printed/logged, never raised.
    """
    create_sql = sql.format(table)
    conn = None  # BUG FIX: was unbound when connect() raised, crashing the finally block
    try:
        conn = sqlite3.connect(database)
        conn.execute(create_sql)
    except Exception:
        # Show the actual statement that failed (was printing the raw template).
        print(create_sql)
        print(traceback.format_exc())
        logging.debug(create_sql)
        logging.debug(traceback.format_exc())
    finally:
        if conn is not None:
            conn.close()
if __name__ == "__main__":
    CreateTable(DATABASE, TABLE_NAME, CRAETE_TABLE_SQL)
    with open('./portswigger_researchurl.txt', 'r', encoding='utf-8') as f:
        # BUG FIX: the original while-loop never advanced to the next line when
        # get_content() raised, so one bad URL looped forever.  Iterating the
        # file directly always makes progress.
        for line in f:
            url = line.strip()
            if not url:
                continue
            try:
                title, research_content, url, author, publish_time, update_time = get_content(url)
                InsertData(title, research_content, url, author, publish_time, update_time)
                time.sleep(5)  # throttle requests to the site
            except Exception as e:
                print(url + ' error!\n')
                # BUG FIX: Exception has no .value attribute — print the exception itself.
                print('My exception occurred, value:', e)
    driver.quit()  # release the browser when the run completes