Linux中计算特定CPU使用率案例详解

一、基础概念

1. /proc/stat 文件结构

cpu  1000 2000 3000 4000 500 600 700 800 900 1000
cpu0 100 200 300 400 50 60 70 80 90 100
cpu1 100 200 300 400 50 60 70 80 90 100

各列含义（单位：USER_HZ 时钟滴答，通常为 1/100 秒，即 10ms；可用 `getconf CLK_TCK` 查看）：

user: 用户态时间
nice: 低优先级用户态时间
system: 内核态时间
idle: 空闲时间
iowait: I/O等待时间
irq: 硬中断时间
softirq: 软中断时间
steal: 虚拟化环境下，本虚拟机的CPU时间被宿主机（Hypervisor）调度给其他虚拟机而“被偷走”的时间
guest: 运行虚拟机（客户机vCPU）的时间（已计入 user，汇总总时间时不要重复相加）
guest_nice: 运行低优先级虚拟机（客户机vCPU）的时间（已计入 nice，汇总总时间时不要重复相加）

二、方法一：使用 mpstat（推荐）

1. 安装与基础使用

# Ubuntu/Debian
sudo apt install sysstat

# RHEL/CentOS
sudo yum install sysstat

# 查看所有CPU核心
mpstat -P ALL 1 5

2. 监控特定CPU核心

# 监控CPU0，每秒更新，共10次
mpstat -P 0 1 10

# 输出示例：
# 02:30:00 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
# 02:30:01 PM    0    5.00    0.00    2.00    0.00    0.00    0.00    0.00    0.00    0.00   93.00

3. 脚本示例：监控CPU2使用率

#!/bin/bash
# monitor_cpu2.sh
#
# Logs the utilisation of a single CPU core, sampled with mpstat, to both the
# terminal and LOG_FILE until interrupted.
#
# Parsing notes:
#   * mpstat is run under LC_ALL=C and the "Average:" row is parsed, because
#     the per-sample row starts with a timestamp whose word count depends on
#     the locale (12-hour clocks add an AM/PM column, shifting every field).
#   * On the "Average:" row the fields are: label, CPU id, %usr, %nice, %sys,
#     ..., %idle (last field).
#   * mpstat itself blocks for INTERVAL seconds, so no extra sleep is needed
#     between successful samples.

INTERVAL=2
CPU_ID=2
LOG_FILE="/tmp/cpu2_monitor.log"

if ! command -v mpstat >/dev/null 2>&1; then
    echo "未找到 mpstat，请先安装 sysstat" >&2
    exit 1
fi

echo "开始监控 CPU$CPU_ID，间隔 ${INTERVAL}秒" | tee -a "$LOG_FILE"
echo "时间戳     用户态% 内核态% 空闲%" | tee -a "$LOG_FILE"

while true; do
    # One awk pass extracts usr/sys/idle and derives usage = 100 - idle.
    stats=$(LC_ALL=C mpstat -P "$CPU_ID" "$INTERVAL" 1 |
        awk -v cpu="$CPU_ID" '$1 == "Average:" && $2 == cpu {
            printf "%s %s %s %.2f\n", $3, $5, $NF, 100 - $NF
        }')

    if [ -n "$stats" ]; then
        read -r usr sys idle usage <<< "$stats"
        timestamp=$(date '+%Y-%m-%d %H:%M:%S')

        echo "$timestamp  $usr%    $sys%    $idle%   使用率: $usage%" | tee -a "$LOG_FILE"
    else
        # mpstat produced nothing (e.g. the core does not exist); wait before
        # retrying so the loop does not spin.
        sleep "$INTERVAL"
    fi
done

三、方法二：直接解析 /proc/stat

1. 计算单个CPU使用率函数

#!/bin/bash
# calc_cpu_usage.sh

#######################################
# Measure the busy percentage of one CPU core over a sampling window.
#
# Takes two snapshots of the core's counters, `interval` seconds apart, and
# reports the share of elapsed ticks that were not idle or iowait.
# The total is user+nice+system+idle+iowait+irq+softirq+steal; the guest
# columns are left out of the sum.
#
# Arguments:
#   $1 - CPU index (e.g. 0); an empty value selects the aggregate "cpu" row
#   $2 - seconds to sleep between the two snapshots
#   $3 - (optional) stat file to read, defaults to /proc/stat
# Outputs:
#   integer percentage (0-100) on stdout; "0" when no ticks elapsed
# Returns:
#   0 on success, 1 when the requested CPU row is not found
#######################################
get_cpu_usage() {
    local cpu_id=$1
    local interval=$2
    local stat_file=${3:-/proc/stat}
    local label="cpu${cpu_id}"
    local name user nice system idle iowait irq softirq steal rest
    local pass found
    local -a totals=() idles=()

    for pass in 0 1; do
        if (( pass == 1 )); then
            sleep "$interval"
        fi

        # Match the row name exactly so that "cpu1" never picks up "cpu10".
        found=0
        while read -r name user nice system idle iowait irq softirq steal rest; do
            if [[ "$name" == "$label" ]]; then
                found=1
                break
            fi
        done < "$stat_file"

        if (( found == 0 )); then
            echo "CPU${cpu_id} 不存在" >&2
            return 1
        fi

        # Fields missing from the row are empty and evaluate to 0 here.
        totals[pass]=$(( user + nice + system + idle + iowait + irq + softirq + steal ))
        idles[pass]=$(( idle + iowait ))
    done

    local total_diff=$(( totals[1] - totals[0] ))
    local idle_diff=$(( idles[1] - idles[0] ))

    if (( total_diff <= 0 )); then
        echo "0"
    else
        echo $(( 100 * (total_diff - idle_diff) / total_diff ))
    fi
}

# Example driver: report the usage of CPU1 forever. Each get_cpu_usage call
# samples for one second and is followed by a further one-second pause.
while :; do
    usage=$(get_cpu_usage 1 1)
    printf '%s - CPU1使用率: %s%%\n' "$(date '+%H:%M:%S')" "${usage}"
    sleep 1
done

2. 更精确的Python版本

#!/usr/bin/env python3
# cpu_monitor.py

import time
import sys

def get_cpu_times(cpu_id):
    """Read the tick counters of one CPU core from /proc/stat.

    Args:
        cpu_id: Core index; the row looked up is named ``cpu<cpu_id>``.

    Returns:
        The row's numeric columns as a list of ints, in file order, or
        ``None`` when no row with exactly that name exists.
    """
    label = f'cpu{cpu_id}'
    with open('/proc/stat', 'r') as f:
        for line in f:
            parts = line.split()
            # Compare the whole first token: a prefix test would let "cpu1"
            # match "cpu10" whenever the cpu1 row is absent.
            if parts and parts[0] == label:
                return [int(x) for x in parts[1:]]
    return None

def calculate_usage(cpu_id, interval=1):
    """Compute the busy percentage of one CPU core over a sampling window.

    Two snapshots are taken ``interval`` seconds apart. Busy time is every
    elapsed tick that was neither idle nor iowait.

    Args:
        cpu_id: Core index passed to ``get_cpu_times``.
        interval: Seconds to sleep between the two snapshots.

    Returns:
        Usage in percent rounded to two decimals, ``0`` when no ticks
        elapsed, or ``None`` (after printing a message) when the core cannot
        be read in either snapshot.
    """
    # First sample
    times1 = get_cpu_times(cpu_id)
    if not times1:
        print(f"CPU{cpu_id} 不存在")
        return None

    time.sleep(interval)

    # Second sample; the core may have disappeared in the meantime.
    times2 = get_cpu_times(cpu_id)
    if not times2:
        print(f"CPU{cpu_id} 不存在")
        return None

    # Only the first eight columns (user..steal) make up the total: guest and
    # guest_nice are already accounted in user and nice, so adding them again
    # would inflate the denominator.
    total_diff = sum(times2[:8]) - sum(times1[:8])
    # Columns 3 and 4 are idle and iowait.
    idle_diff = sum(times2[3:5]) - sum(times1[3:5])

    if total_diff <= 0:
        return 0

    usage = 100.0 * (total_diff - idle_diff) / total_diff
    return round(usage, 2)

def detailed_monitor(cpu_id, interval=2):
    """Print a per-interval breakdown of one CPU core until interrupted.

    Every ``interval`` seconds a row is printed with the user, system and
    idle shares plus the overall usage (100 minus idle minus iowait).

    Args:
        cpu_id: Core index passed to ``get_cpu_times``.
        interval: Seconds between consecutive samples.

    Returns:
        None. Loops until an exception such as ``KeyboardInterrupt``
        propagates, or returns early (after printing a message) when the
        core cannot be read.
    """
    # Column names in /proc/stat order.
    labels = ['用户', '低优先级', '系统', '空闲', 'IO等待',
              '硬中断', '软中断', '虚拟化', '虚拟机', '低优先级虚拟机']

    print(f"监控 CPU{cpu_id}，按 Ctrl+C 退出")
    print(f"{'时间':<20} {'用户%':<8} {'系统%':<8} {'空闲%':<8} {'总使用率%':<10}")
    print("="*60)

    previous = get_cpu_times(cpu_id)
    if not previous:
        print(f"CPU{cpu_id} 不存在")
        return

    while True:
        time.sleep(interval)

        current = get_cpu_times(cpu_id)
        if not current:
            print(f"CPU{cpu_id} 不存在")
            return

        deltas = [after - before for before, after in zip(previous, current)]
        # Each sample doubles as the baseline of the next window, so no
        # interval is left unmeasured between rows.
        previous = current

        # guest/guest_nice (columns 8-9) are already included in user/nice,
        # so the total is limited to the first eight columns.
        total_diff = sum(deltas[:8])

        if total_diff > 0:
            metrics = dict.fromkeys(labels, 0.0)
            # zip() ignores any extra columns a newer kernel might append.
            for label, diff in zip(labels, deltas):
                metrics[label] = round(100.0 * diff / total_diff, 2)

            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            usage = 100 - metrics['空闲'] - metrics['IO等待']

            print(f"{timestamp}  "
                  f"{metrics['用户']:>6.2f}%  "
                  f"{metrics['系统']:>6.2f}%  "
                  f"{metrics['空闲']:>6.2f}%  "
                  f"{usage:>9.2f}%")

if __name__ == "__main__":
    # The first command-line argument, when present, selects the core;
    # otherwise core 0 is monitored.
    cli_args = sys.argv[1:]
    cpu_id = int(cli_args[0]) if cli_args else 0

    try:
        detailed_monitor(cpu_id)
    except KeyboardInterrupt:
        print("\n监控结束")

四、方法三：使用 perf 工具

1. 系统级监控

# 监控特定CPU的事件
sudo perf stat -C 0,1 sleep 5

# 监控CPU0的缓存命中率
sudo perf stat -C 0 -e cache-references,cache-misses sleep 5

2. 实时监控脚本

#!/bin/bash
# perf_cpu_monitor.sh
#
# Repeatedly runs `perf stat` on the given CPU(s) and prints the cycles,
# instructions and cache-misses counters with a timestamp.
#
# Usage: perf_cpu_monitor.sh <CPU> [duration_seconds] [interval_seconds]
#   CPU       value handed to `perf stat -C`
#   duration  total run time in seconds (default 10)
#   interval  length of each measurement window in seconds (default 1)

CPU_ID=$1
DURATION=${2:-10}
INTERVAL=${3:-1}

if [[ -z "$CPU_ID" ]]; then
    echo "用法: $0 <CPU编号> [持续秒数] [采样间隔秒数]" >&2
    exit 1
fi

# Both values feed the loop arithmetic below; an interval of 0 would never
# advance the counter and a non-number would break the expression.
if ! [[ "$DURATION" =~ ^[1-9][0-9]*$ && "$INTERVAL" =~ ^[1-9][0-9]*$ ]]; then
    echo "持续秒数和采样间隔必须是正整数" >&2
    exit 1
fi

echo "使用 perf 监控 CPU$CPU_ID，持续 $DURATION 秒"

for ((i = 0; i < DURATION; i += INTERVAL)); do
    # Collect counters for one INTERVAL-second window. perf writes its report
    # to stderr, hence the 2>&1 before filtering.
    sudo perf stat -C "$CPU_ID" -e cycles,instructions,cache-misses sleep "$INTERVAL" 2>&1 |
        grep -E "(cycles|instructions|cache-misses)" |
        while read -r line; do
            echo "[$(date '+%H:%M:%S')] CPU$CPU_ID: $line"
        done
done

五、高级应用案例

案例1：监控进程的CPU亲和性使用情况

#!/bin/bash
# monitor_process_cpu_affinity.sh
#
# Periodically prints, for one process: its CPU affinity mask, the core it is
# currently on together with its CPU share and accumulated CPU time, and a
# one-second pidstat average. Stops when the process no longer exists.
#
# Usage: monitor_process_cpu_affinity.sh <PID>

PID=$1
if [[ ! "$PID" =~ ^[0-9]+$ ]]; then
    echo "用法: $0 <进程ID>"
    exit 1
fi

echo "监控进程 $PID 的CPU使用情况"
echo "按 Ctrl+C 停止"

# /proc/<pid> disappears when the process exits, which ends the loop.
while [[ -d "/proc/$PID" ]]; do
    # taskset prints "pid N's current affinity mask: <hex>". LC_ALL=C keeps
    # that wording fixed so the pattern matches under any locale.
    LC_ALL=C taskset -p "$PID" 2>/dev/null | grep -oP 'affinity mask:\s*\K[0-9a-f]+'

    # /proc/<pid>/stat is a single line of process fields with no per-CPU
    # rows, so ps is queried directly: psr is the core the process last ran on.
    echo "进程当前所在CPU核心(PSR)、CPU占用率及累计CPU时间:"
    ps -p "$PID" -o psr,pcpu,time

    # pidstat's summary rows start with "Average:" only in the C locale.
    LC_ALL=C pidstat -p "$PID" 1 1 | grep '^Average:'

    sleep 2
    echo "---"
done

echo "进程 $PID 已退出"

案例2：实时CPU热力图

#!/usr/bin/env python3
# cpu_heatmap.py

import os
import time
import sys
from collections import deque

def get_cpu_count():
    """Count the lines of /proc/cpuinfo that begin with ``processor``."""
    count = 0
    with open('/proc/cpuinfo', 'r') as cpuinfo:
        for entry in cpuinfo:
            if entry.startswith('processor'):
                count += 1
    return count

def _read_per_cpu_counters():
    """Return ``{cpu_index: (total_ticks, idle_ticks)}`` from /proc/stat.

    Only rows named ``cpu<N>`` are used; the aggregate ``cpu`` row and all
    non-CPU rows are skipped. ``total_ticks`` sums the first eight columns
    (user..steal); guest and guest_nice are excluded because they are already
    part of user and nice. ``idle_ticks`` is idle + iowait.
    """
    counters = {}
    with open('/proc/stat', 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            name = parts[0]
            if not (name.startswith('cpu') and name[3:].isdigit()):
                continue
            times = [int(x) for x in parts[1:]]
            counters[int(name[3:])] = (sum(times[:8]), sum(times[3:5]))
    return counters


def get_all_cpu_usage(interval=1):
    """Measure the usage of every CPU core over one sampling window.

    Args:
        interval: Seconds to sleep between the two /proc/stat snapshots.

    Returns:
        A list of usage percentages rounded to one decimal, ordered by
        ascending CPU index. Cores are matched by name between the two
        snapshots, and a core missing from the second snapshot is omitted.
        A core with no elapsed ticks reports ``0.0``.
    """
    before = _read_per_cpu_counters()

    time.sleep(interval)

    after = _read_per_cpu_counters()

    results = []
    for cpu in sorted(before):
        if cpu not in after:
            continue
        total_diff = after[cpu][0] - before[cpu][0]
        idle_diff = after[cpu][1] - before[cpu][1]

        if total_diff > 0:
            usage = 100.0 * (total_diff - idle_diff) / total_diff
            results.append(round(usage, 1))
        else:
            results.append(0.0)

    return results

def print_heatmap(usages, width=50):
    """Clear the terminal and draw one coloured bar per CPU core.

    Args:
        usages: Per-core usage percentages; the list position is shown as
            the CPU number.
        width: Number of character cells in each bar.

    A summary line with the highest, lowest and mean usage follows the bars.
    """
    clear_cmd = 'clear' if os.name == 'posix' else 'cls'
    os.system(clear_cmd)

    rule = "=" * 60
    print(rule)
    print("CPU使用率热力图 (刷新间隔: 1秒)")
    print(rule)

    # (threshold, ANSI colour) pairs, checked from the highest threshold down.
    palette = (
        (80, "\033[91m"),  # red
        (50, "\033[93m"),  # yellow
        (20, "\033[92m"),  # green
    )
    fallback = "\033[96m"  # cyan
    reset = "\033[0m"

    for core, pct in enumerate(usages):
        filled = int(pct * width / 100)
        bar = '█' * filled + '░' * (width - filled)
        color = next((code for limit, code in palette if pct > limit), fallback)
        print(f"CPU{core:2d}: {color}{bar}{reset} {pct:5.1f}%")

    highest = max(usages)
    lowest = min(usages)
    mean = sum(usages) / len(usages)
    print("\n图例: ████████ 使用中 | ░░░░░░░░ 空闲")
    print(f"最高: {highest:.1f}% | 最低: {lowest:.1f}% | 平均: {mean:.1f}%")

def main():
    """Run the heat-map loop until Ctrl+C.

    Each pass samples all cores for one second, redraws the heat map and,
    once more than one sample has been collected, prints the per-core mean
    over the retained history (at most the last 10 samples).
    """
    try:
        history = deque(maxlen=10)  # rolling window of recent samples

        while True:
            usages = get_all_cpu_usage(1)
            history.append(usages)

            print_heatmap(usages)

            # Optional trend section
            if len(history) > 1:
                print("\n趋势 (最近10个点):")
                cells = []
                for i in range(len(usages)):
                    samples = [snapshot[i] for snapshot in history]
                    mean = sum(samples) / len(samples)
                    cells.append(f"CPU{i:2d} 平均: {mean:5.1f}% | ")
                print("".join(cells))

            time.sleep(0.1)  # throttle the refresh rate

    except KeyboardInterrupt:
        print("\n监控结束")


if __name__ == "__main__":
    main()

案例3：CPU绑定的应用监控

#!/bin/bash
# bind_and_monitor.sh
#
# Starts a program pinned to CPU BIND_CPU and, while it runs, reports that
# core's utilisation and the program's own CPU share about once per second.
# Exits with the program's exit status.
#
# Usage: bind_and_monitor.sh <program> [args...]

# Core the program is pinned to
BIND_CPU=2
PROGRAM="$1"

if [ -z "$PROGRAM" ]; then
    echo "用法: $0 <程序> [参数...]"
    exit 1
fi

echo "将程序绑定到 CPU$BIND_CPU 并监控"

# Launch with the complete argument list; "$@" keeps each argument intact,
# including ones containing spaces.
taskset -c "$BIND_CPU" "$@" &
PID=$!

echo "程序 PID: $PID"

# Background monitor: runs for as long as the program is alive.
(
    while kill -0 "$PID" 2>/dev/null; do
        # Parse the "Average:" row under LC_ALL=C; its layout does not depend
        # on the locale's time format. The last field is %idle.
        usage=$(LC_ALL=C mpstat -P "$BIND_CPU" 1 1 |
            awk -v cpu="$BIND_CPU" '$1 == "Average:" && $2 == cpu {print 100 - $NF}')
        echo "$(date '+%H:%M:%S') - CPU$BIND_CPU 使用率: ${usage}%"

        # CPU share of the monitored process itself
        process_usage=$(ps -p "$PID" -o %cpu --no-headers)
        echo "进程 $PID CPU使用: ${process_usage}%"
        echo "---"
    done
) &
MONITOR_PID=$!

# Wait for the program, then let the monitor finish its current pass so no
# background job outlives this script.
wait "$PID"
status=$?
wait "$MONITOR_PID" 2>/dev/null
exit "$status"

六、常用工具总结

工具	用途	示例
mpstat	多CPU统计	`mpstat -P 0 1 5`
pidstat	进程CPU统计	`pidstat -p <PID> 1 5`
top	交互式监控	`top -p <PID>`
htop	增强版top	`htop`
perf	性能分析	`perf stat -C 0 sleep 5`
taskset	CPU绑定	`taskset -c 0-2 ./program`
sar	系统活动报告	`sar -P ALL 1 3`