# Extract data with regular expressions
import requests
import re
def get_one_page(url, page=None):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        page: Not used by this function. It is accepted, and optional, so
            that both one-argument and two-argument calls work.

    Returns:
        The response text when the server answers with HTTP status 200,
        otherwise an empty string.

    Any exception raised by ``requests.get`` (connection failure, timeout)
    propagates to the caller.
    """
    headers = {
        # Browser-like User-Agent sent with the request.
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
        ),
    }
    # Fetch the caller's URL; the timeout keeps a stalled server from
    # blocking the script indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return ''
def parse_one_page(html_txt):
    """Extract title and counters from the first parseable entry of a page.

    An entry is the text between ``<div class="day">`` and the next
    occurrence of ``编辑``. Within an entry, the three parenthesised
    numbers are read, in order of appearance, as the comment, read and
    digg counts; the title is the content of the entry's ``<span>`` when
    there is exactly one, otherwise an empty string.

    Args:
        html_txt: HTML source of the page; may be empty.

    Returns:
        ``(title, read_count, com_count, digg_count)`` with every element a
        string, taken from the first entry that contains exactly three
        parenthesised numbers, or ``None`` when no entry qualifies.
    """
    # Drop newlines so that ``.`` in the patterns can span a whole entry.
    html_str = html_txt.replace('\n', '')
    blogs = re.findall(r'<div class="day">(.*?)编辑', html_str)
    print(blogs)
    for blog in blogs:
        counts = re.findall(r'\((\d+)\)', blog)
        # An entry without exactly three counters cannot be unpacked;
        # skip it instead of raising ValueError.
        if len(counts) != 3:
            continue
        com_count, read_count, digg_count = counts
        spans = re.findall(r'<span>(.*?)</span>', blog)
        title = spans[0] if len(spans) == 1 else ''
        return (title, read_count, com_count, digg_count)
    return None
if __name__ == '__main__':
    # Fetch list pages 1 through 14, parse each one and save the result.
    base_url = 'https://www.cnblogs.com/pinard/default.html?page='
    for page in range(1, 15):
        url = base_url + str(page)
        html_txt = get_one_page(url, page)
        cont = parse_one_page(html_txt)
        # NOTE(review): save2txt is not defined in the visible part of this
        # file -- confirm it exists. Pages that yielded nothing are skipped
        # on the assumption that save2txt expects a result tuple.
        if cont is not None:
            save2txt(cont)