在之前写 nvidia-smi 详解(一)的过程中,查资料时发现有对应的 python 支持方法,就计划写这一篇了。随后边写边查资料,代码很快就写好了,但是因为犯懒一直没写文章,磨叽到了现在。
也是做了一些简单的接口测试以及使用,主要还是查询的接口,没测试控制类接口。
对应的py库介绍主要是来自: nvidia-ml-py。
Provides a Python interface to GPU management and monitoring functions.
This is a wrapper around the NVML library. For information about the NVML library, see the NVML developer page http://developer.nvidia.com/nvidia-management-library-nvml
Download the latest package from: http://pypi.python.org/pypi/nvidia-ml-py/
Note this file can be run with 'python -m doctest -v README.txt' although the results are system dependent
The nvml header file contains function documentation that is relevant to this wrapper. The header file is distributed with the GPU Deployment Kit: https://developer.nvidia.com/gpu-deployment-kit
The main difference is this library handles allocating structs and passing pointers to the functions, before returning the desired value. Non-success return codes are raised as exceptions as described in the section below.
安装方式一(pip 安装):
python3 -m pip install nvidia-ml-py
安装方式二(下载源码包安装):
wget https://files.pythonhosted.org/packages/ee/c6/2348fc1fb776ff41a34635fb1f18010a6d6fd7ba6e57184dabd9d98ba9cf/nvidia-ml-py-12.555.43.tar.gz
tar -xzf nvidia-ml-py-$major.$minor.$patch.tar.gz
cd nvidia-ml-py-$major.$minor.$patch
sudo python setup.py install
注意:$major、$minor、$patch 这三个变量需要替换为实际下载的版本号(例如上面的 12.555.43)。
from pynvml import *
nvmlInit()
print(f"Driver Version: {nvmlSystemGetDriverVersion()}")
输出结果:
Driver Version: 552.22
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nvmlDeviceGetHandleByIndex(i)
    print(f"Device {i} : {nvmlDeviceGetName(handle)}")
输出结果:
Device 0 : NVIDIA GeForce RTX 4060 Ti
info = nvmlDeviceGetMemoryInfo(handle)
print(f"Total memory: {info.total // 1024 ** 2} MiB")
print(f"Free memory: {info.free // 1024 ** 2} MiB")
print(f"Used memory: {info.used // 1024 ** 2} MiB")
输出结果(nvmlDeviceGetMemoryInfo 返回值的单位是字节,这里换算成 MiB 后输出):
Total memory: 8188 MiB
Free memory: 6390 MiB
Used memory: 1797 MiB
info = nvmlDeviceGetUtilizationRates(handle)
print(f"UtilizationRates Gpu: {info.gpu}%")
print(f"UtilizationRates Memory: {info.memory}%")
输出结果:
UtilizationRates Gpu: 1%
UtilizationRates Memory: 17%
info = nvmlDeviceGetEncoderUtilization(handle)
print(f"EncoderUtilization {info}")
输出结果(返回值为 [利用率百分比, 采样周期(微秒)],下面的 Decoder、Jpg 相同):
EncoderUtilization [0, 200000]
info = nvmlDeviceGetDecoderUtilization(handle)
print(f"DecoderUtilization {info}")
输出结果:
DecoderUtilization [0, 200000]
info = nvmlDeviceGetJpgUtilization(handle)
print(f"JpgUtilization {info}")
输出结果:
JpgUtilization [0, 200000]
def get_process_name_by_pid(pid):
    try:
        return psutil.Process(pid).name()
    except:
        return None
注:该函数依赖 psutil,使用前需要先 import psutil。
info = nvmlDeviceGetComputeRunningProcesses(handle)
for index,item in enumerate(info):
    print(f"{index} : {item} ,name: {get_process_name_by_pid(item.pid)}")
输出信息(只放部分进程):
0 : {'pid': 1716, 'usedGpuMemory': None, 'gpuInstanceId': 4294967295, 'computeInstanceId': 4294967295} ,name: dwm.exe
1 : {'pid': 6384, 'usedGpuMemory': None, 'gpuInstanceId': 4294967295, 'computeInstanceId': 4294967295} ,name: explorer.exe
2 : {'pid': 6592, 'usedGpuMemory': None, 'gpuInstanceId': 4294967295, 'computeInstanceId': 4294967295} ,name: SearchHost.exe
3 : {'pid': 7064, 'usedGpuMemory': None, 'gpuInstanceId': 4294967295, 'computeInstanceId': 4294967295} ,name: StartMenuExperienceHost.exe
info = nvmlDeviceGetGraphicsRunningProcesses(handle)
for index,item in enumerate(info):
    print(f"{index} : {item} ,name: {get_process_name_by_pid(item.pid)}")
输出信息(只放部分输出结果):
0 : {'pid': 1716, 'usedGpuMemory': None, 'gpuInstanceId': 4294967295, 'computeInstanceId': 4294967295} ,name: dwm.exe
1 : {'pid': 6384, 'usedGpuMemory': None, 'gpuInstanceId': 4294967295, 'computeInstanceId': 4294967295} ,name: explorer.exe
2 : {'pid': 6592, 'usedGpuMemory': None, 'gpuInstanceId': 4294967295, 'computeInstanceId': 4294967295} ,name: SearchHost.exe
3 : {'pid': 7064, 'usedGpuMemory': None, 'gpuInstanceId': 4294967295, 'computeInstanceId': 4294967295} ,name: StartMenuExperienceHost.exe
temperature_gpu = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
print(f"temperature_gpu {temperature_gpu}")
输出结果(单位:摄氏度):
temperature_gpu 29
temperature_threshold = nvmlDeviceGetTemperatureThreshold(handle, NVML_TEMPERATURE_THRESHOLD_GPU_MAX)
print(f"temperature_threshold {temperature_threshold}")
输出结果(单位:摄氏度):
temperature_threshold 90
gpu_uuid = nvmlDeviceGetUUID(handle)
print(f"gpu_uuid {gpu_uuid}")
输出结果:
gpu_uuid GPU-3fd9292f-3024-fbdb-4596-5c5560b91654
pci_info = nvmlDeviceGetPciInfo(handle)
print(f"pci info {pci_info}")
print(f"GRAPHICS clock info {nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)}")
print(f"SM clock info {nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)}")
print(f"MEM clock info {nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)}")
print(f"VIDEO clock info {nvmlDeviceGetClockInfo(handle, NVML_CLOCK_VIDEO)}")
输出结果(单位:MHz):
GRAPHICS clock info 270
SM clock info 270
MEM clock info 405
VIDEO clock info 1185
print(f"power state {nvmlDeviceGetPowerState(handle)}")
print(f"power usage watts {nvmlDeviceGetPowerUsage(handle) / 1000}")
输出结果(nvmlDeviceGetPowerUsage 返回毫瓦,除以 1000 后单位为瓦):
power state 8
power usage watts 27.649
nvmlShutdown()
完整代码如下,主要是遍历输出所有 Nvidia 状态信息,可以根据自己的需要来进行编写设置。
from pynvml import *
import psutil
def get_process_name_by_pid(pid):
    """Return the name of the process with the given PID, or None.

    Args:
        pid: Process ID to look up; it is passed straight to
            ``psutil.Process``.

    Returns:
        The value of ``psutil.Process(pid).name()``, or ``None`` when psutil
        raises one of its own errors for that PID.
    """
    try:
        return psutil.Process(pid).name()
    except psutil.Error:
        # Best-effort lookup: psutil.Error is the base class of psutil's
        # NoSuchProcess / AccessDenied / ZombieProcess errors. Catching only
        # it (instead of a bare ``except:``) lets KeyboardInterrupt,
        # SystemExit and genuine programming errors propagate.
        return None
# nvmlDeviceGetMemoryInfo reports sizes in bytes; divide by this to print MiB.
_BYTES_PER_MIB = 1024 ** 2

# Initialise NVML first, then print the driver version and count the devices.
nvmlInit()
print(f"Driver Version: {nvmlSystemGetDriverVersion()}")
deviceCount = nvmlDeviceGetCount()
try:
    # Dump a fixed set of status queries for every device index.
    for i in range(deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        print(f"Device {i} : {nvmlDeviceGetName(handle)}")
        print(" ")

        # ECC mode query has its own handler: a failure is printed and the
        # remaining queries for this device still run.
        try:
            (current, pending) = nvmlDeviceGetEccMode(handle)
            print("current", current, "pending", pending)
        except NVMLError as error:
            print(error)
        print(" ")

        # Memory figures are converted from bytes so the "MiB" label is true.
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"Total memory: {info.total // _BYTES_PER_MIB} MiB")
        print(f"Free memory: {info.free // _BYTES_PER_MIB} MiB")
        print(f"Used memory: {info.used // _BYTES_PER_MIB} MiB")
        print(" ")

        info = nvmlDeviceGetUtilizationRates(handle)
        print(f"UtilizationRates Gpu: {info.gpu}%")
        print(f"UtilizationRates Memory: {info.memory}%")
        print(" ")

        # Encoder / decoder / JPEG / OFA engine utilization, printed as
        # returned by the library.
        info = nvmlDeviceGetEncoderUtilization(handle)
        print(f"EncoderUtilization {info}")
        print(" ")
        info = nvmlDeviceGetDecoderUtilization(handle)
        print(f"DecoderUtilization {info}")
        print(" ")
        info = nvmlDeviceGetJpgUtilization(handle)
        print(f"JpgUtilization {info}")
        print(" ")
        info = nvmlDeviceGetOfaUtilization(handle)
        print(f"OfaUtilization {info}")
        print(" ")

        # Compute and graphics process lists; each entry's PID is resolved
        # to a process name via get_process_name_by_pid (None if unknown).
        info = nvmlDeviceGetComputeRunningProcesses(handle)
        for index, item in enumerate(info):
            print(f"{index} : {item} ,name: {get_process_name_by_pid(item.pid)}")
        print(" ")
        info = nvmlDeviceGetGraphicsRunningProcesses(handle)
        for index, item in enumerate(info):
            print(f"{index} : {item} ,name: {get_process_name_by_pid(item.pid)}")
        print(" ")

        # The detail-list query has its own handler so that a failure is
        # reported without aborting the rest of the dump.
        try:
            # NVML_PROCESS_MODE_COMPUTE | NVML_PROCESS_MODE_GRAPHICS | NVML_PROCESS_MODE_MPS
            info = nvmlDeviceGetRunningProcessDetailList(handle, nvmlProcessDetailList_v1, NVML_PROCESS_MODE_COMPUTE)
            for item in info:
                print(f"ProcessDetailList {item}")
        except NVMLError as error:
            print(f"nvmlDeviceGetRunningProcessDetailList {error}")

        # Current GPU temperature and the GPU_MAX temperature threshold.
        temperature_gpu = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
        print(f"temperature_gpu {temperature_gpu}")
        temperature_threshold = nvmlDeviceGetTemperatureThreshold(handle, NVML_TEMPERATURE_THRESHOLD_GPU_MAX)
        print(f"temperature_threshold {temperature_threshold}")
        print("")

        gpu_uuid = nvmlDeviceGetUUID(handle)
        print(f"gpu_uuid {gpu_uuid}")
        print("")

        pci_info = nvmlDeviceGetPciInfo(handle)
        print(f"pci info {pci_info}")
        print("")

        # Clock readings for the graphics, SM, memory and video domains.
        print(f"GRAPHICS clock info {nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)}")
        print(f"SM clock info {nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)}")
        print(f"MEM clock info {nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)}")
        print(f"VIDEO clock info {nvmlDeviceGetClockInfo(handle, NVML_CLOCK_VIDEO)}")
        print("")

        # The power reading is divided by 1000 before being printed as watts
        # (NVML documents the raw value as milliwatts).
        print(f"power state {nvmlDeviceGetPowerState(handle)}")
        print(f"power usage watts {nvmlDeviceGetPowerUsage(handle) / 1000}")
except NVMLError as error:
    # Any NVML failure not handled above ends the dump and is printed.
    print(error)
nvmlShutdown()
其实写的比较简单,只是摘取了我觉得日常会用到的一些信息,还有很多其他函数,可以在 pynvml.py 文件里边查看获取,或者到 NVIDIA GPU 等网站查看自己需要的一些信息、样例等。这半年其实一直没怎么写,日常工作也比较无聊,开始研究 srs,以及一些 python 脚本的编写,希望把文章抓起来,多写一些。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。