昇腾NPU功率监测

参考:https://www.hiascend.com/document/detail/zh/Atlas%20200I%20A2/24.1.RC2/re/npu/npusmi_023.html

npu-smi info -t power -i id

Nvidia GPU功率监测

nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i id

昇腾功率监测脚本

import subprocess
import time
import pandas as pd
import threading

# 初始化一个列表来存储功率记录及其时间戳
power_records = []

def get_power(npu_id):
    "获取指定NPU的功率"
    try:
        # 执行npu-smi info命令并捕获输出
        output = subprocess.check_output(['npu-smi', 'info', '-t', 'power', '-i', str(npu_id)]).decode('utf-8')
        # 解析输出中的功率信息
        for line in output.split('\n'):
            if 'NPU Real-time Power(W)' in line:
                power = line.split(':')[1].strip()
                return float(power)
    except subprocess.CalledProcessError as e:
        print(f"Failed to get power info: {e}")
    return None

def record_power(npu_id, interval):
    "记录指定时长内的功率信息"
    while True:
        power = get_power(npu_id)
        if power is not None:
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            power_records.append({'Timestamp': timestamp, 'Power': power})
            print(f"Recorded Power: {power} W at {timestamp}")
        time.sleep(interval)

def save_to_excel(file_name):
    "将功率记录保存到Excel文件"
    df = pd.DataFrame(power_records)
    df.to_excel(file_name, index=False)
    print(f"Power records saved to {file_name}")

def calculate_average_power():
    "计算平均功率"
    if power_records:
        average_power = sum(record['Power'] for record in power_records) / len(power_records)
        print(f"Average Power: {average_power} W")
    else:
        print("No power records to calculate average.")

# 设置NPU ID和记录功率的时间间隔(秒)
npu_id = 1
interval = 600  # 10分钟

# 启动记录功率的线程
power_thread = threading.Thread(target=record_power, args=(npu_id, interval))
power_thread.daemon = True  # 设置为守护线程,以便主程序结束时线程也结束
power_thread.start()

# 这里放置您的训练代码
# train_your_model()

# 当训练完成后,保存功率记录并计算平均功率
save_to_excel("power_records.xlsx")
calculate_average_power()

昇腾提交训练任务监测功率

这里有一个问题,就是需要获取实机那边的npu id,昇腾提供了一个现成的命令可以查看当前设备的NPU个数和ID:npu-smi info -l
在这里插入图片描述
在上面的代码中,npu id是直接指定的,在提交训练任务场景中,需要获取当前设备的npu id。因此需要手动获取:

import subprocess

def get_npu_id():
    try:
        # 执行npu-smi命令并获取输出
        result = subprocess.run(['npu-smi', 'info', '-l'], stdout=subprocess.PIPE, text=True)
        
        # 解析输出结果
        output = result.stdout
        # 检查输出中是否包含NPU ID信息
        if 'NPU ID' in output:
            # 提取NPU ID
            npu_id_line = [line for line in output.split('\n') if 'NPU ID' in line][0]
            npu_id = npu_id_line.split(': ')[1].strip()
            return npu_id
        else:
            return "NPU ID not found"
    except Exception as e:
        return f"An error occurred: {e}"

# 调用函数并打印NPU编号
npu_id = get_npu_id()
print(f"The NPU ID is: {npu_id}")

Nvidia功率监测脚本

import subprocess
import time
import pandas as pd
import threading

# 初始化一个列表来存储功率记录及其时间戳
power_records = []

def get_power(gpu_id):
    "获取指定GPU的功率"
    try:
        # 执行nvidia-smi命令并捕获输出
        output = subprocess.check_output(['nvidia-smi', '--query-gpu=power.draw', '--format=csv,noheader,nounits', '-i', str(gpu_id)]).decode('utf-8')
        # 输出是单行的功率值,直接转换为float
        power = float(output.strip())
        return power
    except subprocess.CalledProcessError as e:
        print(f"Failed to get power info: {e}")
    return None

def record_power(gpu_id, interval):
    "记录指定时长内的功率信息"
    while True:
        power = get_power(gpu_id)
        if power is not None:
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            power_records.append({'Timestamp': timestamp, 'Power': power})
            print(f"Recorded Power: {power} W at {timestamp}")
        time.sleep(interval)

def save_to_excel(file_name):
    "将功率记录保存到Excel文件"
    df = pd.DataFrame(power_records)
    df.to_excel(file_name, index=False)
    print(f"Power records saved to {file_name}")

def calculate_average_power():
    "计算平均功率"
    if power_records:
        average_power = sum(record['Power'] for record in power_records) / len(power_records)
        print(f"Average Power: {average_power} W")
    else:
        print("No power records to calculate average.")

# 设置GPU ID和记录功率的时间间隔(秒)
gpu_id = 0  # NVIDIA GPU的ID
interval = 600  # 每10分钟记录一次

# 启动记录功率的线程
power_thread = threading.Thread(target=record_power, args=(gpu_id, interval))
power_thread.daemon = True  # 设置为守护线程,以便主程序结束时线程也结束
power_thread.start()

# 这里放置您的训练代码
# train_your_model()

# 保存功率记录并计算平均功率
save_to_excel("power_records.xlsx")
calculate_average_power()
Logo

昇腾计算产业是基于昇腾系列(HUAWEI Ascend)处理器和基础软件构建的全栈 AI计算基础设施、行业应用及服务,包括昇腾系列处理器、系列硬件、CANN、AI计算框架、应用使能、开发工具链、管理运维工具、行业应用及服务等全产业链

更多推荐