import time

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
def login():
    """Open the fixed Douyin search page and start the manual login flow.

    Uses the module-level ``driver``. The function does not log in by
    itself: it clicks the login button and then waits so that a human can
    scan the QR code.
    """
    driver.get(
        'https://www.douyin.com/search/%E5%A4%AE%E8%A7%86%E7%96%AB%E8%8B%97?publish_time=0&sort_type=1&source=normal_search&type=video')
    driver.maximize_window()
    time.sleep(10)  # fixed wait for the page to render
    # ``find_element_by_css_selector`` was removed in Selenium 4.3;
    # ``find_element(By.CSS_SELECTOR, ...)`` works on Selenium 3 and 4.
    login_button = driver.find_element(
        By.CSS_SELECTOR,
        '#_285c63f4da53bd5cedc023b4fdd71412-scss > button')  # login button
    login_button.click()
    time.sleep(15)  # time window for scanning the QR code by hand
def Transfer(driver, times=3, pause=2):
    """Scroll the page down several times (the script's "page turning").

    Scrolling is best-effort: an ordinary error is reported and swallowed,
    but ``KeyboardInterrupt`` / ``SystemExit`` are no longer suppressed.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        times: Number of scroll steps to issue. Defaults to 3.
        pause: Seconds to sleep between two consecutive scroll steps.
            Nothing is slept after the last step. Defaults to 2.

    Returns:
        A fixed status string. It is returned whether or not scrolling
        succeeded, exactly as before.
    """
    try:
        for step in range(times):
            # Scroll down by one full document height.
            driver.execute_script("window.scrollBy(0,document.body.scrollHeight)", "")
            if step < times - 1:
                time.sleep(pause)
    except Exception as exc:
        print(f"Transfer: scrolling failed: {exc!r}")
    return "Transfer successfully \n"
def get_news(i):
    """Print publisher, title and popularity of the i-th result, then create its table.

    Reads the module-level ``driver``. The step is best-effort: if any of
    the three elements cannot be read, the failure is reported and no table
    is created; if table creation fails, that is reported as well. Neither
    failure is raised to the caller.

    Args:
        i: 1-based position of the item in the search-result list; it is
            also passed to ``creat_table`` as the table suffix.
    """
    # Prefix shared by all three selectors: the i-th <li> of the result list.
    item = (
        '#root > div > div:nth-child(2) > div > '
        'div.cdfb2696b192e707e529d93274fe5752-scss > div:nth-child(2) > ul > '
        f'li:nth-child({i}) > div'
    )
    link = 'a.caa4fd3df2607e91340989a2e41628d8-scss.a074d7a61356015feb31633ad4c45f49-scss'
    try:
        # ``find_element_by_css_selector`` was removed in Selenium 4.3.
        news_user = driver.find_element(
            By.CSS_SELECTOR,
            f'{item} > div > a > p > span > span > span > span > span').text
        news_title = driver.find_element(
            By.CSS_SELECTOR,
            f'{item} > {link}._9c976841beef15a22bcd1540d1e84c02-scss'
            ' > p > span > span > span > span > span').text
        news_hot = driver.find_element(
            By.CSS_SELECTOR,
            f'{item} > {link}.b388acfeaeef33f0122af9c4f71a93c9-scss > div > '
            'div._1a451682eeb2d4df81bf8e1ada549729-scss._825ee6a2309eb4a4d9f707ef61be8629-scss'
            ' > div > div > span').text
    except Exception as exc:
        print(f"get_news({i}): could not read news item: {exc!r}")
        return

    print(news_user)
    print(news_title)
    print(news_hot)

    try:
        creat_table(table=i)
    except Exception as exc:
        print(f"get_news({i}): could not create table: {exc!r}")
def get_comment(i):
    """Open the i-th news item, store its comments, then close its window.

    Reads the module-level ``driver`` and ``page``. The item is clicked via
    JavaScript, the newest window handle is selected, the page is scrolled
    to load comments, and each comment (text + like count) is written to
    table ``comment_{i}`` through ``insert_data``. Finally the current
    window is closed and the last remaining handle is selected.

    Args:
        i: 1-based position of the item in the search-result list; also the
            suffix of the target table.

    Raises:
        Whatever the driver raises if the item cannot be located or clicked;
        that step is not guarded (unchanged from the original).
    """
    # Clickable area of the i-th search result.
    item = (
        '#root > div > div:nth-child(2) > div > '
        'div.cdfb2696b192e707e529d93274fe5752-scss > div:nth-child(2) > ul > '
        f'li:nth-child({i}) > div'
    )
    # Left column of the detail page, shared by the selectors below.
    left = (
        '#root > div > div:nth-child(2) > '
        'div._97d705de994bee2e76ad8876a1648171-scss > '
        'div.leftContainer._20bc24e2255076f4dbc27d9fe1a241f3-scss'
    )

    # ``find_element_by_css_selector`` was removed in Selenium 4.3.
    news = driver.find_element(
        By.CSS_SELECTOR,
        f'{item} > a.caa4fd3df2607e91340989a2e41628d8-scss'
        '.a074d7a61356015feb31633ad4c45f49-scss'
        '.b388acfeaeef33f0122af9c4f71a93c9-scss > div > '
        'div._1a451682eeb2d4df81bf8e1ada549729-scss._825ee6a2309eb4a4d9f707ef61be8629-scss')
    # Click through JavaScript so overlapping elements cannot intercept it.
    driver.execute_script("arguments[0].click();", news)
    driver.switch_to.window(driver.window_handles[-1])  # newest window
    Transfer(driver)
    time.sleep(3)

    # Publication time is informational only; failure must not stop the crawl.
    try:
        # NOTE(review): ``nth-child({i})`` uses the search-list index on the
        # detail page; this looks unintended (a fixed index seems more
        # likely) -- confirm against the live page before changing.
        news_time = driver.find_element(
            By.CSS_SELECTOR,
            f'{left} > div:nth-child({i}) > '
            'div._517025479ee2ccbf2bc6480838cedb09-scss > div > div > '
            'div._3f5a4457e19c10aae6f40f4448fd9cb6-scss > span').text
        print(news_time)
    except Exception as exc:
        print(f"get_comment({i}): could not read publish time: {exc!r}")

    # Scroll ``page + 2`` more times so additional comments are loaded.
    for _ in range(int(page) + 2):
        Transfer(driver)

    written = 0
    try:
        # nth-child is 1-based; the upper bound is inclusive so that exactly
        # page * 100 comments are attempted (the original stopped one short).
        for e in range(1, int(page) * 100 + 1):
            body = (
                f'{left} > div._6ee298c56864097ec4c572364713fe94-scss > div > '
                f'div:nth-child(3) > div:nth-child({e}) > div > '
                'div.c7ee22de401c856152e3646bffd656a3-scss'
            )
            news_comment_content = driver.find_element(
                By.CSS_SELECTOR, f'{body} > p > span').text
            news_comment_like = driver.find_element(
                By.CSS_SELECTOR,
                f'{body} > div.c2de27b4f28bd4bb0cdbcabadabe8bf3-scss > div > p > span').text
            insert_data(id=sum_id(), comment=news_comment_content,
                        like=news_comment_like, table=i)
            written += 1
    except Exception as exc:
        # The first missing comment element (or a database error) ends the
        # loop, as before -- but it is now reported instead of hidden.
        print(f"get_comment({i}): stopped after {written} comments: {exc!r}")

    print(f"成功写入到数据库comment_{i}中")
    driver.close()
    driver.switch_to.window(driver.window_handles[-1])  # back to remaining window
def creat_table(table):
    """Create table ``comment_{table}`` in database ``pythondb1`` if it is missing.

    The table has columns ``id`` (primary key), ``comment`` (utf8mb4 text)
    and ``like``. The cursor and connection are now always closed; the
    original left both open on every call.

    Args:
        table: Suffix appended to ``comment_`` to form the table name. It is
            interpolated into the SQL text, so it must be a trusted value
            (the script passes an integer index).
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    conn = pymysql.connect(host='localhost', user='root', password='123456')
    try:
        conn.select_db('pythondb1')
        sql = f"""CREATE TABLE IF NOT EXISTS `comment_{table}` (
`id` INT(20) NOT NULL ,
`comment` text character set utf8mb4 default NULL,
`like` varchar(255) NOT NULL,
primary key (id)
) DEFAULT CHARSET=utf8 AUTO_INCREMENT=0"""
        with conn.cursor() as cur:
            cur.execute(sql)
    finally:
        conn.close()
    print("数据库建表成功")
def sum_id():
    """Increment the module-level comment counter and return its new value.

    The counter is the global named ``sum`` (which shadows the builtin of
    the same name); it must be initialised before the first call.

    Returns:
        The counter value after the increment.
    """
    global sum
    sum += 1
    return sum
def insert_data(id, comment, like, table):
    """Insert one comment row into table ``comment_{table}`` of ``pythondb1``.

    A new connection is opened per call and is now closed even when the
    insert or commit fails (previously it leaked on error).

    Args:
        id: Primary-key value for the row.
        comment: Comment text.
        like: Like count, stored as given.
        table: Suffix appended to ``comment_`` to form the table name. It is
            interpolated into the SQL text, so it must be a trusted value;
            the row values themselves are passed as query parameters.
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    conn = pymysql.connect(host='localhost', user='root', password='123456')
    try:
        conn.select_db('pythondb1')
        sql = f"insert into comment_{table} values(%s,%s,%s)"
        with conn.cursor() as cur:
            cur.execute(sql, (id, comment, like))
        conn.commit()
    finally:
        conn.close()
def run(start=1, stop=100):
    """Process search results ``start`` .. ``stop - 1`` one after another.

    For every index the news summary is handled by ``get_news`` and then the
    comments by ``get_comment``. An exception raised by either call aborts
    the loop.

    Args:
        start: First 1-based result index to process. Defaults to 1.
        stop: Index at which to stop (exclusive). Defaults to 100, i.e. the
            original fixed range of 99 items.
    """
    for i in range(start, stop):
        get_news(i=i)
        get_comment(i=i)
if __name__ == '__main__':
    # Module-level state read by get_comment() and sum_id().
    page = 30  # drives the scroll count and the per-item comment limit
    sum = 0  # running comment id; the name (shadowing the builtin) is what sum_id() expects

    driver = webdriver.Chrome()
    try:
        login()  # open the search page and wait for the manual QR login
        Transfer(driver)  # scroll so that result items are loaded
        Transfer(driver)
        run()
    finally:
        # quit() ends the whole browser session, also when the crawl fails;
        # close() only closed the current window and was skipped on errors.
        driver.quit()
|