import requests
import time
from lxml import etree
import time
import pymysql
def craw(url):
    """Scrape one news list page and store the articles it links to.

    Downloads the list page at ``url``, reads the per-entry fields found
    under the ``#wp_news_w6`` element (link title, "part" text, "dt" text,
    click text and detail link), then downloads every linked detail page to
    collect its paragraph text and image sources.  The collected columns are
    handed to ``to_mysql``.

    Args:
        url: Address of the list page to scrape.

    Raises:
        requests.RequestException: If a page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    from urllib.parse import urljoin

    site_root = 'http://www.cswu.cn'
    # Without a timeout a stalled server would block the crawl forever.
    timeout = 10
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }

    list_markup = requests.get(url=url, headers=headers, timeout=timeout).text
    list_page = etree.HTML(list_markup)
    titles = list_page.xpath('//*[@id="wp_news_w6"]/ul/li/div[1]/span[2]/a/@title')
    parts = list_page.xpath('//*[@id="wp_news_w6"]/ul/li/span[1]/text()')
    dts = list_page.xpath('//*[@id="wp_news_w6"]/ul/li/div[2]/span/text()')
    clicks = list_page.xpath('//*[@id="wp_news_w6"]/ul/li/span[2]/span/text()')
    hrefs = list_page.xpath('//*[@id="wp_news_w6"]/ul/li/div[1]/span[2]/a/@href')

    contents, img_urls = [], []
    for href in hrefs:
        # urljoin resolves root-relative hrefs exactly like the former string
        # concatenation, and also copes with absolute or slash-less hrefs.
        detail_url = urljoin(site_root, href)
        detail_markup = requests.get(
            url=detail_url, headers=headers, timeout=timeout
        ).text
        detail_page = etree.HTML(detail_markup)

        # Article content: every text node inside the body paragraphs.
        paragraphs = detail_page.xpath(
            '//*[@id="container2"]/div/div/div/div/div/div/p//text()'
        )
        contents.append(''.join(paragraphs))

        # Image sources of one article are stored as a comma-separated string.
        img_hrefs = detail_page.xpath(
            '/html/body/div[2]/div/div/div/div/div/div/p/img/@src'
        )
        img_urls.append(','.join(img_hrefs))

    to_mysql(titles, parts, dts, clicks, contents, img_urls)
def to_mysql(titles, parts, dts, clicks, contents, img_urls):
    """Insert the scraped columns into the ``news`` table.

    The six sequences are combined position by position with ``zip``, so the
    number of rows written equals the length of the shortest sequence.

    Args:
        titles: Values for the ``title`` column.
        parts: Values for the ``part`` column.
        dts: Values for the ``dt`` column.
        clicks: Values for the ``click`` column; each is converted with
            ``int``.
        contents: Values for the ``content`` column.
        img_urls: Values for the ``img`` column.

    Raises:
        ValueError: If a click value cannot be converted to ``int``.
        pymysql.MySQLError: If connecting or inserting fails; nothing is
            committed in that case.
    """
    rows = [
        (title, part, dt, int(click), content, img_url)
        for title, part, dt, click, content, img_url
        in zip(titles, parts, dts, clicks, contents, img_urls)
    ]
    if not rows:
        return

    # Placeholders let the driver escape every value.  Scraped text routinely
    # contains quotes, which broke the former string-built statement and made
    # it injectable.  Titles are therefore stored verbatim instead of having
    # their single quotes stripped.
    sql = (
        'insert into news(title,part,dt,click,content,img) '
        'values(%s,%s,%s,%s,%s,%s)'
    )

    db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='news', charset='utf8')
    try:
        with db.cursor() as cursor:
            cursor.executemany(sql, rows)
        # Single commit: either every row of the page is stored or none is.
        db.commit()
    finally:
        # Release the connection even when an insert fails.
        db.close()
if __name__ == '__main__':
    # Crawl list pages 1 and 2 in order, pausing three seconds after each one.
    for page_no in (1, 2):
        page_url = f'https://www.cswu.cn/_s3/34/list{page_no}.htm'
        craw(page_url)
        time.sleep(3)
|