[Python知识库] 基于python实现垂直爬虫系统

开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁

-> Python知识库 -> 基于python实现垂直爬虫系统 -> 正文阅读

[Python知识库]基于python实现垂直爬虫系统

html_downloader

from urllib import request

def download(url):
    """Fetch the raw body of *url*.

    Args:
        url: Address passed to ``request.urlopen``. ``None`` is accepted
            and short-circuits without any network access.

    Returns:
        Whatever ``response.read()`` yields, or ``None`` when *url* is
        ``None`` or the response code is not 200.

    Exceptions raised by ``request.urlopen`` are not caught here.
    """
    if url is None:
        return None

    # The context manager closes the response on every path, including
    # the early return below and a read() that raises.
    with request.urlopen(url) as response:
        if response.getcode() != 200:
            return None
        return response.read()

html_outputer

# Rows gathered through collect_data(), kept in the order they were added.
data_list = []

def collect_data(data):
    """Append *data* to the module-level ``data_list``."""
    data_list.append(data)

def output_html():
    """Write every row in ``data_list`` to ``output.html`` as an HTML table.

    Each row must provide the keys ``url``, ``title``, ``datetime`` and
    ``visitcount``; they become the four cells of one table row, in that
    order. The file is created in the current working directory and
    overwritten if it already exists.

    Raises:
        KeyError: If a row lacks one of the four keys.
    """
    # Imported locally so this function does not depend on the module's
    # import block being edited.
    from html import escape

    columns = ('url', 'title', 'datetime', 'visitcount')

    # Explicit UTF-8 keeps non-ASCII titles independent of the platform's
    # locale encoding; ``with`` closes the file even if a row is malformed.
    with open('output.html', 'w', encoding='utf-8') as fout:
        fout.write('<html>')
        # Declare the encoding so browsers decode the file the way it was
        # written.
        fout.write('<head><meta charset="utf-8"></head>')
        fout.write('<body>')
        fout.write('<table>')

        for dataitem in data_list:
            fout.write('<tr>')
            for column in columns:
                # Cell values come from scraped pages; escape them so that
                # '<', '>' and '&' cannot break the generated markup.
                fout.write('<td>%s</td>' % escape(str(dataitem[column])))
            fout.write('</tr>')

        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')

html_parser

import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    """Collect the absolute URLs of article links found in *soup*.

    An ``<a>`` tag qualifies when its ``href`` matches the pattern
    ``/<digits>/<digits>/<word>/page.htm``. Each matching href is resolved
    against *page_url* with ``urljoin``.

    Returns:
        A set of the resolved URLs (empty when nothing matches).
    """
    article_href = re.compile(r"/\d+/\d+/\w+/page\.htm")
    return {
        urljoin(page_url, anchor['href'])
        for anchor in soup.find_all('a', href=article_href)
    }

def get_new_data(page_url, soup):
    """Extract the article fields from a parsed page.

    Args:
        page_url: URL of the page; stored unchanged under ``'url'``.
        soup: Parsed document offering ``find(name, class_=...)`` whose
            results expose ``get_text()``.

    Returns:
        An empty dict when the page has no ``<h1 class="arti-title">``.
        Otherwise a dict with the keys ``title``, ``datetime``,
        ``visitcount`` and ``url``. ``datetime`` and ``visitcount`` are
        ``''`` when the corresponding ``<span>`` is absent.
    """
    res_data = {}

    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return res_data

    res_data['title'] = title_node.get_text()

    # These spans are looked up separately from the title; a page that
    # lacks one yields '' for that field instead of raising
    # AttributeError on None.
    optional_fields = (
        ('datetime', 'arti-update'),
        ('visitcount', 'WP_VisitCount'),
    )
    for key, css_class in optional_fields:
        node = soup.find('span', class_=css_class)
        res_data[key] = '' if node is None else node.get_text()

    res_data['url'] = page_url

    return res_data
    
def parse(page_url, html_cont):
    """Parse a downloaded page into its outgoing links and article data.

    Returns:
        ``None`` when either argument is ``None``. Otherwise the tuple
        ``(new_urls, new_data)`` produced by ``get_new_urls`` and
        ``get_new_data`` for the parsed document.
    """
    if any(arg is None for arg in (page_url, html_cont)):
        return None

    # The content is decoded as UTF-8 and parsed with the built-in
    # 'html.parser' backend.
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    return get_new_urls(page_url, soup), get_new_data(page_url, soup)

spider_main

import urls_manager, html_downloader, \
    html_parser, html_outputer

def craw(root_url, max_pages=10):
    """Crawl from *root_url* and write the collected rows to HTML.

    URLs are taken from ``urls_manager`` one at a time; each is downloaded,
    parsed, and its links are queued for later visits. The loop ends when
    no URL is pending or after *max_pages* attempts, whichever comes first,
    and ``html_outputer.output_html()`` is then called once.

    Args:
        root_url: First URL to visit.
        max_pages: Maximum number of URLs to attempt (default 10). Failed
            downloads count as attempts.
    """
    count = 1
    urls_manager.add_new_url(root_url)
    # Start the crawl loop.
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))

        # One unreachable page must not abort the crawl and lose the rows
        # gathered so far. urllib's URLError/HTTPError and socket errors
        # are all OSError subclasses.
        try:
            html_cont = html_downloader.download(new_url)
        except OSError as err:
            print('craw failed : %s' % err)
            html_cont = None

        # parse() returns None rather than a tuple when there is no content
        # to parse, so unpack only a real result.
        parsed = html_parser.parse(new_url, html_cont)
        if parsed is not None:
            new_urls, new_data = parsed
            urls_manager.add_new_urls(new_urls)
            if new_data:
                html_outputer.collect_data(new_data)

        if count >= max_pages:
            break

        count = count + 1

    html_outputer.output_html()



# Script entry point: run the crawler from root_url when this file is
# executed directly rather than imported.
if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)

test_64

from bs4 import BeautifulSoup
import re

# Sample document exercised by the BeautifulSoup lookups below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')


def _show_link(node):
    """Print a link node's tag name, href attribute and text."""
    print(node.name, node['href'], node.get_text())


# Every <a> tag in the document.
print('获取所有链接')
for anchor in soup.find_all('a'):
    _show_link(anchor)

# The first <a> tag whose href equals the given string exactly.
print('获取lacie链接')
_show_link(soup.find('a', href='http://example.com/lacie'))

# The first <a> tag whose href matches a regular expression.
print('正则匹配')
_show_link(soup.find('a', href=re.compile(r'ill')))

# The first <p> tag carrying the CSS class "title".
print('获取P段落文字')
title_paragraph = soup.find('p', class_='title')
print(title_paragraph.name, title_paragraph.get_text())

urls_manager

# URLs that have been queued but not yet handed out by get_new_url().
new_urls = set()
# URLs that get_new_url() has already handed out.
old_urls = set()
    
def add_new_url(url):
    """Queue *url* in ``new_urls`` unless it should be skipped.

    Nothing happens when *url* is ``None``, is already in ``new_urls``, or
    is already in ``old_urls``.
    """
    if url is None or url in new_urls or url in old_urls:
        return
    new_urls.add(url)

def add_new_urls(urls):
    """Queue every URL in *urls* through ``add_new_url``.

    Args:
        urls: Any iterable of URLs, or ``None``. Sized containers are not
            required, so generators work too. ``None`` and empty iterables
            queue nothing.
    """
    if urls is None:
        return
    # No length check: iterating an empty iterable is already a no-op, and
    # calling len() would reject generators.
    for url in urls:
        add_new_url(url)

def get_new_url():
    """Hand out one queued URL and record it as visited.

    Removes an arbitrary element from ``new_urls`` (``set.pop``), adds it
    to ``old_urls`` and returns it.

    Raises:
        KeyError: If ``new_urls`` is empty.
    """
    url = new_urls.pop()
    old_urls.add(url)
    return url

def has_new_url():
    """Return True when at least one URL is waiting in ``new_urls``."""
    return bool(new_urls)

Python知识库最新文章

Python中String模块

【Python】 14-CVS文件操作

python的panda库读写文件

使用Nordic的nrf52840实现蓝牙DFU过程

【Python学习记录】numpy数组用法整理

Python学习笔记

python字符串和列表

python如何从txt文件中解析出有效的数据

Python编程从入门到实践自学/3.1-3.2

python变量

加:2022-03-04 15:30:49 更:2022-03-04 15:31:50

360图书馆购物三丰科技阅读网日历万年历 2026年3日历

-2026/3/7 16:20:55-

图片自动播放器
↓图片自动播放器↓

TxT小说阅读器
↓语音阅读,小说下载,古典文学↓

一键清除垃圾
↓轻轻一点,清除系统垃圾↓

图片批量下载器
↓批量下载图片,美女图库↓

网站联系: qq:121756557 email:121756557@qq.com IT数码