1. nvidia-smi
Notes on using nvidia-smi:
1. By default nvidia-smi only reports a weighted average of SM usage, which is what the GPU-Util column shows;
its sampling granularity is fairly coarse.
2. ffmpeg also consumes a small amount of SM resources when it uses the hardware encoder/decoder. Once SM
utilization reaches 100%, encode/decode performance suffers; as long as SM utilization stays low, hardware
transcoding is unaffected. The codec engines and the SMs share the same clock source and power budget.
Per-engine utilization can be read separately, as in the pynvml sketch below.
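Because GPU-Util reflects only the SMs, it can read near 0% while NVENC/NVDEC are saturated. Below is a minimal sketch using the pynvml bindings (introduced in section 2) that reads SM, memory-controller, encoder, and decoder utilization separately; device index 0 is an assumption, adjust for your topology:

# pip install pynvml
from pynvml import (
    nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates,
    nvmlDeviceGetEncoderUtilization, nvmlDeviceGetDecoderUtilization,
)

nvmlInit()
try:
    handle = nvmlDeviceGetHandleByIndex(0)  # GPU 0; pick the card you care about
    util = nvmlDeviceGetUtilizationRates(handle)  # .gpu = SM %, .memory = memory-controller %
    enc, _period_us = nvmlDeviceGetEncoderUtilization(handle)  # NVENC %
    dec, _period_us = nvmlDeviceGetDecoderUtilization(handle)  # NVDEC %
    print(f"sm={util.gpu}% mem={util.memory}% enc={enc}% dec={dec}%")
finally:
    nvmlShutdown()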
- Specify a board ID and check that GPU's status: nvidia-smi -i 0
root@n19-045-100:~# nvidia-smi -i 0
Fri Oct 22 16:01:31 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.116.00   Driver Version: 418.116.00   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla P4            On   | 00000000:1A:00.0 Off |                    0 |
| N/A   56C    P0    30W /  75W |    140MiB /  7611MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0    588727      C   ...ffmpeg_build/gpu/static/bin/ffmpeg        130MiB |
+-----------------------------------------------------------------------------+
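The Processes table can also be read programmatically instead of scraping nvidia-smi output. A rough pynvml equivalent for device 0 (index 0 is an assumption; loop over all devices as needed):

# List compute processes and their GPU memory, like the Processes table above.
from pynvml import (
    nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetComputeRunningProcesses,
)

nvmlInit()
try:
    handle = nvmlDeviceGetHandleByIndex(0)
    for proc in nvmlDeviceGetComputeRunningProcesses(handle):
        # usedGpuMemory is in bytes; it can be None on some driver versions
        mem_mib = proc.usedGpuMemory // (1024 * 1024) if proc.usedGpuMemory else 0
        print(f"pid={proc.pid} used={mem_mib} MiB")
finally:
    nvmlShutdown()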
- View detailed GPU status information:
nvidia-smi -q (all GPUs) or nvidia-smi -i 0 -q (a single GPU)
root@n19-045-200:~# nvidia-smi -q
==============NVSMI LOG==============

Timestamp                       : Fri Oct 22 16:48:35 2021
Driver Version                  : 418.116.00
CUDA Version                    : 10.1

Attached GPUs                   : 7
GPU 00000000:1A:00.0
    Product Name                : Tesla P4
    Product Brand               : Tesla
    Display Mode                : Enabled
    Display Active              : Disabled
    Persistence Mode            : Enabled
    Accounting Mode             : Disabled
    Accounting Mode Buffer Size : 4000
    Driver Model
        Current                 : N/A
        Pending                 : N/A
    Serial Number               : 0325017003772
    GPU UUID                    : GPU-b1fc4bf1-cc9b-2add-586c-b39397c1eaad
    Minor Number                : 0
    VBIOS Version               : 86.04.55.00.01
    MultiGPU Board              : No
    Board ID                    : 0x1a00
    GPU Part Number             : 900-2G414-0000-000
    Inforom Version
        Image Version           : G414.0200.00.03
        OEM Object              : 1.1
        ECC Object              : 4.1
        Power Management Object : N/A
    GPU Operation Mode
        Current                 : N/A
        Pending                 : N/A
    GPU Virtualization Mode
        Virtualization mode     : None
    IBMNPU
        Relaxed Ordering Mode   : N/A
    PCI
        Bus                     : 0x1A
        Device                  : 0x00
        Domain                  : 0x0000
        Device Id               : 0x1BB310DE
        Bus Id                  : 00000000:1A:00.0
        Sub System Id           : 0x11D810DE
        GPU Link Info
            PCIe Generation
                Max             : 3
                Current         : 3
            Link Width
                Max             : 16x
                Current         : 16x
        Bridge Chip
            Type                : N/A
            Firmware            : N/A
        Replays Since Reset     : 0
        Replay Number Rollovers : 0
        Tx Throughput           : 0 KB/s
        Rx Throughput           : 0 KB/s
    Fan Speed                   : N/A
    Performance State           : P0
    Clocks Throttle Reasons
        Idle                        : Not Active
        Applications Clocks Setting : Not Active
        SW Power Cap                : Not Active
        HW Slowdown                 : Not Active
        HW Thermal Slowdown         : Not Active
        HW Power Brake Slowdown     : Not Active
        Sync Boost                  : Not Active
        SW Thermal Slowdown         : Not Active
        Display Clock Setting       : Not Active
    FB Memory Usage
        Total                   : 7611 MiB
        Used                    : 140 MiB
        Free                    : 7471 MiB
    BAR1 Memory Usage
        Total                   : 256 MiB
        Used                    : 2 MiB
        Free                    : 254 MiB
    Compute Mode                : Default
    Utilization
        Gpu                     : 0 %
        Memory                  : 0 %
        Encoder                 : 0 %
        Decoder                 : 0 %
    Encoder Stats
        Active Sessions         : 0
        Average FPS             : 0
        Average Latency         : 0
    FBC Stats
        Active Sessions         : 0
        Average FPS             : 0
        Average Latency         : 0
    Ecc Mode
        Current                 : Enabled
        Pending                 : Enabled
    ECC Errors
        Volatile
            Single Bit
                Device Memory   : 0
                Register File   : N/A
                L1 Cache        : N/A
                L2 Cache        : N/A
                Texture Memory  : N/A
                Texture Shared  : N/A
                CBU             : N/A
                Total           : 0
            Double Bit
                Device Memory   : 0
                Register File   : N/A
                L1 Cache        : N/A
                L2 Cache        : N/A
                Texture Memory  : N/A
                Texture Shared  : N/A
                CBU             : N/A
                Total           : 0
        Aggregate
            Single Bit
                Device Memory   : 0
                Register File   : N/A
                L1 Cache        : N/A
                L2 Cache        : N/A
                Texture Memory  : N/A
                Texture Shared  : N/A
                CBU             : N/A
                Total           : 0
            Double Bit
                Device Memory   : 0
                Register File   : N/A
                L1 Cache        : N/A
                L2 Cache        : N/A
                Texture Memory  : N/A
                Texture Shared  : N/A
                CBU             : N/A
                Total           : 0
    Retired Pages
        Single Bit ECC          : 0
        Double Bit ECC          : 0
        Pending Page Blacklist  : No
    Temperature
        GPU Current Temp        : 56 C
        GPU Shutdown Temp       : 94 C
        GPU Slowdown Temp       : 91 C
        GPU Max Operating Temp  : N/A
        Memory Current Temp     : N/A
        Memory Max Operating Temp : N/A
    Power Readings
        Power Management        : Supported
        Power Draw              : 30.26 W
        Power Limit             : 75.00 W
        Default Power Limit     : 75.00 W
        Enforced Power Limit    : 75.00 W
        Min Power Limit         : 60.00 W
        Max Power Limit         : 75.00 W
    Clocks
        Graphics                : 1531 MHz
        SM                      : 1531 MHz
        Memory                  : 2999 MHz
        Video                   : 1366 MHz
    Applications Clocks
        Graphics                : 1531 MHz
        Memory                  : 3003 MHz
    Default Applications Clocks
        Graphics                : 885 MHz
        Memory                  : 3003 MHz
    Max Clocks
        Graphics                : 1531 MHz
        SM                      : 1531 MHz
        Memory                  : 3003 MHz
        Video                   : 1379 MHz
    Max Customer Boost Clocks
        Graphics                : 1113 MHz
    Clock Policy
        Auto Boost              : N/A
        Auto Boost Default      : N/A
    Processes
        Process ID              : 588727
            Type                : C
            Name                : /opt/ffmpeg_build/gpu/static/bin/ffmpeg
            Used GPU Memory     : 130 MiB
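Most of the fields in the -q dump are exposed through NVML as well. A small sketch reading the memory, temperature, and power fields for device 0 (index 0 again an assumption):

# Read a few of the -q fields above via NVML instead of parsing text output.
from pynvml import (
    nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo, nvmlDeviceGetTemperature,
    nvmlDeviceGetPowerUsage, NVML_TEMPERATURE_GPU,
)

nvmlInit()
try:
    handle = nvmlDeviceGetHandleByIndex(0)
    mem = nvmlDeviceGetMemoryInfo(handle)  # .total/.used/.free in bytes
    temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)  # degrees C
    power_mw = nvmlDeviceGetPowerUsage(handle)  # milliwatts
    print(f"fb used {mem.used >> 20}/{mem.total >> 20} MiB, temp {temp} C, power {power_mw / 1000:.2f} W")
finally:
    nvmlShutdown()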
- View the GPU encoder status:
nvidia-smi -q | grep -i enc (all GPUs) or nvidia-smi -i 0 -q | grep -i enc (a single GPU)
root@n19-045-200:~# nvidia-smi -i 0 -q | grep -i enc
Persistence Mode : Enabled
Encoder : 0 %
Encoder Stats
Average Latency : 0
Average Latency : 0
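The Encoder Stats block (active sessions, average FPS, average latency) maps to a single NVML call. A sketch, once more assuming device 0:

# Session count / average FPS / average latency, as in "Encoder Stats" above.
from pynvml import (
    nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex, nvmlDeviceGetEncoderStats,
)

nvmlInit()
try:
    handle = nvmlDeviceGetHandleByIndex(0)
    sessions, avg_fps, avg_latency_us = nvmlDeviceGetEncoderStats(handle)
    print(f"sessions={sessions} avg_fps={avg_fps} avg_latency={avg_latency_us} us")
finally:
    nvmlShutdown()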
2. NVML monitoring module
- Query GPU utilization through NVIDIA's NVML library (the same library nvidia-smi is built on)
- Python bindings: https://pypi.org/project/pynvml/
>>> from pynvml import *
>>> nvmlInit()
>>> print("Driver Version:", nvmlSystemGetDriverVersion())
Driver Version: b'418.116.00'
>>> deviceCount = nvmlDeviceGetCount()
>>> for i in range(deviceCount):
... handle = nvmlDeviceGetHandleByIndex(i)
... print("Device", i, ":", nvmlDeviceGetName(handle))
...
Device 0 : b'Tesla P4'
Device 1 : b'Tesla P4'
Device 2 : b'Tesla P4'
Device 3 : b'Tesla P4'
Device 4 : b'Tesla P4'
Device 5 : b'Tesla P4'
Device 6 : b'Tesla P4'
>>> from pynvml.smi import nvidia_smi
>>> nvsmi = nvidia_smi.getInstance()
>>> nvsmi.DeviceQuery('memory.free, memory.total')
{'gpu': [{'fb_memory_usage': {'total': 7611.9375, 'free': 7471.8125, 'unit': 'MiB'}}, {'fb_memory_usage': {'total': 7611.9375, 'free': 7601.9375, 'unit': 'MiB'}}, {'fb_memory_usage': {'total': 7611.9375, 'free': 7601.9375, 'unit': 'MiB'}}, {'fb_memory_usage': {'total': 7611.9375, 'free': 7601.9375, 'unit': 'MiB'}}, {'fb_memory_usage': {'total': 7611.9375, 'free': 7601.9375, 'unit': 'MiB'}}, {'fb_memory_usage': {'total': 7611.9375, 'free': 7601.9375, 'unit': 'MiB'}}, {'fb_memory_usage': {'total': 7611.9375, 'free': 7601.9375, 'unit': 'MiB'}}]}
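For a monitoring module you would normally poll NVML on an interval rather than shelling out to nvidia-smi. A minimal polling loop over all devices; the 1 s interval and the print-to-stdout sink are placeholder choices:

import time
from pynvml import (
    nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates, nvmlDeviceGetEncoderUtilization,
    nvmlDeviceGetDecoderUtilization,
)

def poll(interval_s=1.0):
    """Sample SM/memory/encoder/decoder utilization for every GPU, forever."""
    nvmlInit()
    try:
        handles = [nvmlDeviceGetHandleByIndex(i) for i in range(nvmlDeviceGetCount())]
        while True:
            for i, h in enumerate(handles):
                util = nvmlDeviceGetUtilizationRates(h)
                enc, _ = nvmlDeviceGetEncoderUtilization(h)
                dec, _ = nvmlDeviceGetDecoderUtilization(h)
                print(f"gpu{i}: sm={util.gpu}% mem={util.memory}% enc={enc}% dec={dec}%")
            time.sleep(interval_s)
    finally:
        nvmlShutdown()

if __name__ == "__main__":
    poll()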
3. DCGM monitoring
If you're interested, have a look at: https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-user-guide/getting-started.html