Scraping Smart-Contract Transaction Records with Selenium
2. Method:
- Add explicit waits for the target elements (see the minimal sketch after this list).
- Use Selenium to scrape the records and classify each address as a contract address or an externally owned address (rows showing the contract icon are contract addresses).
- Do a simple first pass of processing on the scraped data to standardize its format.
- Append the data to a txt file.
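Both scripts below rely on the same explicit-wait pattern. Here is a minimal, self-contained sketch of that pattern; the URL and CSS selector are illustrative placeholders, not the exact ones used in codeA/codeB:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get('https://example.com')  # placeholder URL
try:
    # Block for up to 20 seconds until the element exists in the DOM.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'table')))
except TimeoutException:
    print('Element did not appear within the timeout.')
finally:
    driver.quit()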
3. Source code (Python)
Project directory structure (see the assumed layout below):
- Data scraped by codeA is saved to A.txt.
- Data scraped by codeB is saved to B.txt.
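The original does not spell out the layout; the following tree is an assumption inferred from the relative paths ('../data/A.txt', '../data/B.txt') used in both scripts:

project/
├── code/
│   ├── codeA.py
│   └── codeB.py
└── data/
    ├── A.txt
    └── B.txt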
# codeA: scrape the accounts list and classify each address.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import time

url = 'https://eth.btc.com/accounts'
opt = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=opt)
driver.get(url)

pageSum = 2  # number of list pages to scrape
all_list = []
table_css = ('#root > div > section > section > main > div > div > div.page-container > '
             'div:nth-child(2) > div > div > div > div > div > div > table')

for q in range(1, pageSum + 1):
    print('Scraping page ' + str(q) + '...')
    try:
        # Explicit wait: block until the accounts table is present in the DOM.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, table_css)))
    except TimeoutException:
        print('Table element not found! Stopped at page', q)
        break
    element = driver.find_element(By.CSS_SELECTOR, table_css)
    tr_content = element.find_elements(By.TAG_NAME, 'tr')
    for tr in tr_content:
        td_list = tr.find_elements(By.TAG_NAME, 'td')
        if len(td_list) == 0:
            continue  # skip the header row, which has <th> cells only
        # Rows that carry the contract icon are contract addresses;
        # all others are externally owned addresses.
        icon = tr.find_elements(By.CLASS_NAME, 'contract-tx-icon')
        addr_type = 'contract address' if len(icon) != 0 else 'external address'
        tempList = [addr_type]
        for td in td_list:
            # Collect any link targets first, then the cell text.
            for u in td.find_elements(By.TAG_NAME, 'a'):
                tempList.append(u.get_attribute('href'))
            tdText = str(td.text).strip().replace(',', '')
            if len(tdText) > 0:
                tempList.append(tdText)
        # Normalize to a fixed column count by dropping redundant fields
        # (guarded length checks avoid an IndexError on short rows).
        if len(tempList) == 8:
            del tempList[4]
        if len(tempList) > 5:
            del tempList[5]
        with open('../data/A.txt', 'a+', encoding='utf-8') as f:
            f.write(','.join(tempList))
            f.write('\n')
        all_list.append(tempList)
    print('Page ' + str(q) + ' done!')
    # Click the "next page" arrow of the pagination widget.
    driver.find_element(
        By.XPATH,
        '/html/body/div/div/section/section/main/div/div/div[2]/div[3]/div/ul/li[2]/div[3]/i'
    ).click()
    time.sleep(0.5)

print('All done!', pageSum, 'pages in total')
for line in all_list:
    print(line)
driver.quit()
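codeA ends here. codeB, below, reads A.txt back in and visits each account page to scrape its most recent transaction records. It depends on the line format codeA produced: the third comma-separated field of every A.txt line (index 2) is the account link captured above.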
# codeB: visit each account URL from A.txt and scrape its recent transactions.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import time

# The third comma-separated field of each A.txt line is the account URL.
urls_in_A = []
with open('../data/A.txt', 'r', encoding='utf-8') as f:
    for line in f:
        sp = line.strip().split(',')
        urls_in_A.append(sp[2])

opt = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=opt)
recordsMaxNum = 10  # cap of transaction records kept per account
all_list = []
t_css = ('#accountCopyContainer > div.margin-top-md > div > '
         'div.ant-tabs-content.ant-tabs-content-animated.ant-tabs-top-content > '
         'div.ant-tabs-tabpane.ant-tabs-tabpane-active > div.account-txns > '
         'div.ant-table-wrapper > div > div > div > div > div > table')

for urlA in urls_in_A:
    driver.get(urlA)
    print('Scraping...', urlA)
    try:
        # Explicit wait: block until the first transaction row is present.
        WebDriverWait(driver, 20).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, t_css + ' > tbody > tr:nth-child(1)')))
    except TimeoutException:
        print('Table element not found! Skipping', urlA)
        continue
    element = driver.find_element(By.CSS_SELECTOR, t_css)
    tr_content = element.find_elements(By.TAG_NAME, 'tr')
    all_ct = 0
    for tr in tr_content:
        if all_ct >= recordsMaxNum:
            break
        tempList = [urlA]
        td_list = tr.find_elements(By.TAG_NAME, 'td')
        for temp_ct, td in enumerate(td_list, start=1):
            tdText = str(td.text).strip().replace(',', '')
            if len(tdText) > 0:
                tempList.append(tdText)
            # Columns 4 and 5 hold the sender/receiver addresses;
            # tag each one as a contract or external address.
            if temp_ct == 4 or temp_ct == 5:
                icon = td.find_elements(By.CLASS_NAME, 'contract-tx-icon')
                addr_type = 'contract address' if len(icon) != 0 else 'external address'
                tempList.append(addr_type)
        if len(tempList) > 1:  # header rows yield no <td> cells and are skipped
            with open('../data/B.txt', 'a+', encoding='utf-8') as f:
                f.write(','.join(tempList))
                f.write('\n')
            all_list.append(tempList)
            all_ct += 1
    print('Done!', urlA)
    time.sleep(0.25)

print('All done!', len(all_list), 'records in total')
for line in all_list:
    print(line)
driver.quit()
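One optional tweak: both scripts create a ChromeOptions object but never configure it. If the scrape should run without opening a browser window, Chrome's headless mode can be enabled before constructing the driver (on Chrome versions before 109, use plain '--headless'):

opt = webdriver.ChromeOptions()
opt.add_argument('--headless=new')  # run Chrome without a visible window
driver = webdriver.Chrome(options=opt)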