[Python知识库] python 进阶版（爬虫示例）

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> Python知识库 -> python 进阶版（爬虫示例） -> 正文阅读

[Python知识库]python 进阶版（爬虫示例）

具体实现的功能：
每5分钟爬取阿里云漏洞库的页面，找到漏洞评分大于9分的漏洞并告警，再通过阿里云的API找出存在这些新增漏洞的服务器。
整体思路：（总共有5个模块）
1、漏洞类AVD：属性有漏洞详情，漏洞名，漏洞id，漏洞评分（属性用变量表示，这种变量叫实例变量）；
漏洞类里还有通过API获取漏洞类型的函数def avd_gettype(self, client)以及存在漏洞的服务器信息的函数def avd_getuuid(self, client)，需要传入API的客户端的参数；
2、数据库连接函数connect_db()，建立数据库连接并返回该连接，便于其他函数通过它操作数据库
3、游标函数conndb_cursor(conn)，便于爬取页面时对数据库进行插入操作
4、爬虫函数grab(client, cursor, url)，需要传入的参数有客户端，游标，爬取的页面地址。
5、钉钉群消息的推送dingding(mes)，要注意钉钉群机器人关键字的设置：推送的消息内容中如果不包含所设置的关键字，消息就会被屏蔽掉

import json
import sqlite3
# 时间模块time
import time
from operator import itemgetter

# 下载网页使用requests模块
import requests
# 解析HTML，得到目标数据使用BeautifulSoup模块
from bs4 import BeautifulSoup

from alibabacloud_sas20181203.client import Client as Sas20181203Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_sas20181203 import models as sas_20181203_models

# Access key (AK) pair passed to the API client configuration.
# NOTE(review): these look like placeholder values; real credentials should
# presumably be supplied from outside the source file -- confirm.
access_key_id = 'key_id'
access_key_secret = 'key_secret'

# In-memory list of the vulnerability ids already stored in the database
# table, so later existence checks consult this list instead of the database.
avd_id_list = []


class AVD:
    """A vulnerability scraped from a vulnerability detail page.

    Attributes:
        avd_url: URL of the detail page the data was scraped from.
        avd_name: Vulnerability name.
        avd_id: AVD identifier of the vulnerability.
        avd_sorce: Risk score as a float (the misspelled attribute name is
            kept because other code reads it).
    """
    # Class-level defaults; __init__ overwrites all of them per instance.
    avd_url = ''
    avd_name = ''
    avd_id = ''
    avd_sorce = 0.0

    def __init__(self, detail_url):
        """Download the detail page and extract name, id and score from it.

        Args:
            detail_url: URL of the vulnerability detail page.

        Raises:
            AttributeError, IndexError, ValueError: if the page does not
                contain the expected elements or the score is not numeric.
            requests.RequestException: if the page cannot be downloaded.
        """
        # A timeout keeps the monitor from hanging forever on a stalled server.
        d_response = requests.get(url=detail_url, timeout=10)
        # response.text is already decoded to str, so no from_encoding is
        # passed: BeautifulSoup ignores it for str input and emits a warning.
        detail_object = BeautifulSoup(d_response.text, "html.parser")

        d_name = detail_object.find('span', class_='header__title__text').text

        score_node = detail_object.find(
            'div', class_='cvss-breakdown__score cvss-breakdown__score--high')
        d_score = float(score_node.text.strip())

        # The third matching breadcrumb item is used as the vulnerability id.
        breadcrumbs = detail_object.find('ol', class_="breadcrumbs__list")
        crumb_items = breadcrumbs.find_all(
            'li', class_="breadcrumbs__list-item-last CMSBreadCrumbsLink")
        d_id = crumb_items[2].text.strip()

        self.avd_url = detail_url
        self.avd_name = d_name
        self.avd_id = d_id
        self.avd_sorce = d_score

    def avd_gettype(self, client):
        """Look up the vulnerability type via the DescribeGroupedVul API.

        Args:
            client: API client exposing ``describe_grouped_vul``.

        Returns:
            The ``type`` of the first grouped item matching this
            vulnerability's name, or ``''`` when nothing matches.
        """
        describe_grouped_vul_request = sas_20181203_models.DescribeGroupedVulRequest(
            alias_name=self.avd_name
        )
        groupedvulitems = client.describe_grouped_vul(describe_grouped_vul_request)
        avd_type = groupedvulitems.body.grouped_vul_items
        # Truthiness covers both an empty list and a missing (None) field.
        if not avd_type:
            print(self.avd_name, '没有找到该漏洞类型')
            return ''
        return avd_type[0].type

    def avd_getuuid(self, client):
        """Find a server affected by this vulnerability and push an alert.

        Resolves the vulnerability type with ``avd_gettype`` and queries the
        DescribeVulList API. When at least one record is returned, the uuid
        of the first record is sent to the DingTalk group via ``dingding``.

        Args:
            client: API client exposing ``describe_grouped_vul`` and
                ``describe_vul_list``.

        Returns:
            The uuid of the first returned record, or ``''`` when the type is
            unknown or no record is returned.
        """
        tp = self.avd_gettype(client)
        if tp == '':
            print('没有服务器有该漏洞')
            return ''

        describe_vul_list_request = sas_20181203_models.DescribeVulListRequest(
            type=tp,
            alias_name=self.avd_name
        )
        vul_records = client.describe_vul_list(describe_vul_list_request)
        records = vul_records.body.vul_records
        # No affected server: report it instead of failing on records[0].
        if not records:
            print('没有服务器有该漏洞')
            return ''

        uuid = records[0].uuid
        print('存在该漏洞的服务器为：', uuid)
        dd_uuid = {
            "at": {
                "atMobiles": [
                    "180xxxxxx"
                ],
                "atUserIds": [
                    "user123"
                ],
                "isAtAll": False
            },
            "text": {
                "content": "存在新增高危漏洞:\n%s\n出现该漏洞的服务器为：\n%s" % (self.avd_name, uuid)
            },
            "msgtype": "text"
        }
        dingding(dd_uuid)
        return uuid


# 得到云安全中心的客户端
def client_sas():
    """Build the API client from the module-level access key pair.

    Returns:
        A ``Sas20181203Client`` whose config endpoint is ``tds.aliyuncs.com``;
        pass it to the functions that call the API.
    """
    # The key pair is only read here, so no ``global`` declaration is needed.
    sas_config = open_api_models.Config(access_key_id, access_key_secret)
    sas_config.endpoint = 'tds.aliyuncs.com'
    return Sas20181203Client(sas_config)


# 建立数据库连接
def connect_db(db_path='D:/vul.db'):
    """Open the SQLite database and return the connection.

    Args:
        db_path: Database file to open; it is created if it does not exist.
            Defaults to the location the script has always used.

    Returns:
        A ``sqlite3.Connection`` in autocommit mode.
    """
    conn = sqlite3.connect(db_path)
    # isolation_level None switches to autocommit: every statement is
    # committed immediately, so callers never need an explicit commit().
    conn.isolation_level = None
    return conn


# 建表，以及查询表
# 把连接参数conn传入建表，查表的函数conndb_cursor()，进行对数据库vul.db的操作
def conndb_cursor(conn):
    """Create ``avd_table`` if needed, load the stored ids, return a cursor.

    Every ``avd_id`` found in the table is added to the module-level
    ``avd_id_list`` (ids already in the list are not added again, so calling
    this function more than once does not create duplicates).

    Args:
        conn: Open ``sqlite3`` connection.

    Returns:
        A cursor on ``conn`` for the caller to run further statements with.

    Raises:
        SystemExit: with status 1 when the table cannot be created.
    """
    cursor = conn.cursor()
    # avd_score is declared REAL because the scraped scores are floats.
    sql_create = ('create table if not exists avd_table('
                  'avd_id varchar(30) PRIMARY KEY, avd_name varchar(30), avd_score real)')
    try:
        cursor.execute(sql_create)
    except sqlite3.Error as e:
        print('抛出异常', e)
        raise SystemExit(1)

    # The id is the primary key, so it alone identifies an existing record.
    cursor.execute('select avd_id from avd_table')
    # fetchall() yields one-element tuples; keep only the id itself.
    stored_ids = [row[0] for row in cursor.fetchall()]
    if stored_ids:
        known_ids = set(avd_id_list)
        for stored_id in stored_ids:
            if stored_id not in known_ids:
                avd_id_list.append(stored_id)
                known_ids.add(stored_id)
    else:
        print('表内还没有数据')
    return cursor


# 钉钉机器人消息推送，此函数是为了让机器人推送新增高危漏洞的消息
def dingding(mes):
    """Post a message to the DingTalk group robot.

    Args:
        mes: JSON-serialisable message payload (a dict in this script).

    Returns:
        The response body as text, decoded as UTF-8.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Webhook address of the robot.
    webhook = 'webhook地址'
    headers = {'content-type': 'application/json'}
    # Without a timeout a stalled webhook would block the monitor forever.
    r = requests.post(webhook, headers=headers, data=json.dumps(mes), timeout=10)
    r.encoding = 'utf-8'
    return r.text

# 分析主页面,获取副页面的页面数page
def parse_url(url):
    """Download a page and print its ``span.text-muted`` text.

    Prints the full text of the first ``<span class="text-muted">`` element
    and then characters 8-9 of that text. Prints a notice instead when the
    page has no such element.

    Args:
        url: Address of the page to inspect.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    response = requests.get(url, timeout=10)
    # response.text is already str; from_encoding would be ignored (with a
    # warning) by BeautifulSoup, so it is not passed.
    htm_page = BeautifulSoup(response.text, "html.parser")
    spqn_page = htm_page.find('span', class_="text-muted")
    if spqn_page is None:
        print('没有找到页数信息')
        return
    print(spqn_page.text)

    print(spqn_page.text[8:10])


# 爬取页面获取数据，并把数据插入表中
def grab(client, cursor, url):
    """Scrape one list page, store new vulnerabilities and alert on severe ones.

    Each link inside the page's ``<tbody>`` is loaded as an ``AVD``. A
    vulnerability whose id is not yet in ``avd_id_list`` is inserted into
    ``avd_table`` and appended to the list; if its score is above 9.0 the
    affected server is looked up and a DingTalk link message is sent.

    Args:
        client: API client handed on to ``AVD.avd_getuuid``.
        cursor: Object with an ``execute`` method used for the inserts.
        url: Address of the list page to scrape.

    Returns:
        0 when the page contains no vulnerability links, otherwise 1.
    """
    response = requests.get(url, timeout=10)
    # response.text is already str; from_encoding would be ignored (with a
    # warning) by BeautifulSoup, so it is not passed.
    high_risk_list = BeautifulSoup(response.text, "html.parser")

    # A page without a <tbody> is treated like a page without links.
    tbody = high_risk_list.find('tbody')
    a_label = tbody.find_all('a') if tbody is not None else []
    if not a_label:
        print('该页面没有漏洞数据')
        return 0

    for a in a_label:
        detail = 'https://avd.aliyun.com' + a.get('href')
        vul_item = AVD(detail)
        if vul_item.avd_id in avd_id_list:
            continue

        # Bound parameters: the values come from a scraped page, so they must
        # never be spliced into the SQL text (quotes in a name would break or
        # alter the statement).
        cursor.execute(
            'insert into avd_table(avd_id,avd_name,avd_score) values(?,?,?)',
            (vul_item.avd_id, vul_item.avd_name, vul_item.avd_sorce),
        )
        # Keep the in-memory id list in step with the table.
        avd_id_list.append(vul_item.avd_id)

        if vul_item.avd_sorce > 9.0:
            # Look up the server affected by the new vulnerability.
            vul_item.avd_getuuid(client)
            # Push the notification to the DingTalk group.
            # NOTE(review): messageUrl is the list page, not the detail page
            # of the vulnerability -- confirm this is intended.
            dding = {
                "msgtype": "link",
                "link": {
                    "text": vul_item.avd_name,
                    "title": "新增高危漏洞",
                    "picUrl": "",
                    "messageUrl": url
                }
            }
            dingding(dding)
    return 1


if __name__ == '__main__':
    # Script entry point: crawl every list page for a fixed number of rounds,
    # waiting five minutes between rounds, then print the stored records.
    client = client_sas()
    conn = connect_db()
    # Creates the table if needed and returns a cursor for later statements.
    cursor = conndb_cursor(conn)

    rounds = 2
    try:
        for i in range(1, rounds + 1):
            j = 0
            # Walk the list pages until grab() reports a page without data.
            while True:
                j += 1
                pre_url = r'https://avd.aliyun.com/high-risk/list?page=' + str(j)
                print(pre_url)
                res = grab(client, cursor, pre_url)
                if res == 0:
                    print('退出本次%d爬取' % i)
                    break
            # Wait only between rounds; sleeping after the last one would
            # just delay the final report.
            if i < rounds:
                time.sleep(5 * 60)

        sql_select = 'select * from avd_table'
        cursor.execute(sql_select)
        print('----查询表的数据-----')
        for d in cursor.fetchall():
            print(d)
    finally:
        # Release the database resources even when a crawl step fails.
        cursor.close()
        conn.close()

Python知识库最新文章

Python中String模块

【Python】 14-CVS文件操作

python的panda库读写文件

使用Nordic的nrf52840实现蓝牙DFU过程

【Python学习记录】numpy数组用法整理

Python学习笔记

python字符串和列表

python如何从txt文件中解析出有效的数据

Python编程从入门到实践自学/3.1-3.2

python变量

加:2022-03-21 20:45:38 更:2022-03-21 20:49:22

360图书馆购物三丰科技阅读网日历万年历 2025年7日历

-2025/7/4 7:49:10-

图片自动播放器
↓图片自动播放器↓

TxT小说阅读器
↓语音阅读,小说下载,古典文学↓

一键清除垃圾
↓轻轻一点,清除系统垃圾↓

图片批量下载器
↓批量下载图片,美女图库↓

网站联系: qq:121756557 email:121756557@qq.com IT数码