一、基础概念
1. /proc/stat 文件结构
cpu 1000 2000 3000 4000 500 600 700 800 900 1000
cpu0 100 200 300 400 50 60 70 80 90 100
cpu1 100 200 300 400 50 60 70 80 90 100
各列含义(单位:USER_HZ 时钟滴答,通常为 1/100 秒,即 10ms,可用 getconf CLK_TCK 查询):
- user: 用户态时间
- nice: 低优先级用户态时间
- system: 内核态时间
- idle: 空闲时间
- iowait: I/O等待时间
- irq: 硬中断时间
- softirq: 软中断时间
- steal: 虚拟化环境下被偷走的时间
- guest: 运行虚拟机(客户机 vCPU)的时间,该时间已计入 user
- guest_nice: 运行低优先级虚拟机的时间,该时间已计入 nice
二、方法一:使用 mpstat(推荐)
1. 安装与基础使用
# Ubuntu/Debian: install the sysstat package
sudo apt install sysstat
# RHEL/CentOS: install the sysstat package
sudo yum install sysstat
# Show every CPU core: one report per second, 5 reports in total
mpstat -P ALL 1 5
2. 监控特定CPU核心
# Monitor CPU0 only, refreshing every second, 10 reports in total
mpstat -P 0 1 10
# Sample output:
# 02:30:00 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
# 02:30:01 PM 0 5.00 0.00 2.00 0.00 0.00 0.00 0.00 0.00 0.00 93.00
3. 脚本示例:监控CPU2使用率
#!/bin/bash
# monitor_cpu2.sh
# Logs the user / system / idle percentages of one CPU core using mpstat.
# Every sample is printed to the terminal and appended to LOG_FILE.
# Runs until interrupted (Ctrl+C).
INTERVAL=2
CPU_ID=2
LOG_FILE="/tmp/cpu2_monitor.log"

echo "开始监控 CPU$CPU_ID,间隔 ${INTERVAL}秒" | tee -a "$LOG_FILE"
echo "时间戳 用户态% 内核态% 空闲%" | tee -a "$LOG_FILE"

while true; do
  # mpstat itself blocks for INTERVAL seconds while measuring, so no extra
  # sleep is needed between samples.
  # LC_ALL=C pins the output format (label text and "." as decimal point).
  # The "Average:" row is parsed because its layout is always
  #   Average: CPU %usr %nice %sys ... %idle
  # whereas the timestamped row gains an AM/PM column in some locales,
  # which shifts every field number.
  stats=$(LC_ALL=C mpstat -P "$CPU_ID" "$INTERVAL" 1 \
    | awk '$1 == "Average:" && $2 ~ /^[0-9]+$/ { print $3, $5, $NF }')
  if [ -n "$stats" ]; then
    read -r usr sys idle <<< "$stats"
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    # usage = 100% - idle%; awk is already required, so bc is not needed
    usage=$(awk -v idle="$idle" 'BEGIN { printf "%.2f", 100 - idle }')
    echo "$timestamp $usr% $sys% $idle% 使用率: $usage%" | tee -a "$LOG_FILE"
  else
    # mpstat produced nothing (not installed, or the CPU does not exist):
    # pause so the loop does not spin at full speed.
    sleep "$INTERVAL"
  fi
done
三、方法二:直接解析 /proc/stat
1. 计算单个CPU使用率函数
#!/bin/bash
# calc_cpu_usage.sh
#######################################
# Prints "<total> <idle>" (cumulative clock ticks) for one row of a
# /proc/stat-format file.
# Arguments: $1 - CPU index ("" selects the aggregate "cpu" row)
#            $2 - path of the stat file
# Outputs:   "<total> <idle>" on stdout
# Returns:   0 if the row was found, non-zero otherwise
#######################################
_cpu_sample() {
  local want="cpu$1" stat_file=$2
  local label user nice system idle iowait irq softirq steal _rest
  while read -r label user nice system idle iowait irq softirq steal _rest; do
    # Compare the whole first field so that "cpu1" never matches "cpu10".
    if [[ "$label" == "$want" ]]; then
      # guest/guest_nice (in _rest) are deliberately left out of the total:
      # the kernel already accounts them inside user/nice.
      echo "$((user + nice + system + idle + iowait + irq + softirq + steal)) $((idle + iowait))"
      return 0
    fi
  done < "$stat_file"
  return 1
}

#######################################
# Measures the utilisation of one CPU between two samples of /proc/stat.
# Arguments: $1 - CPU index (e.g. 0, 1, 10)
#            $2 - seconds to wait between the two samples (default: 1)
#            $3 - stat file to read (default: /proc/stat)
# Outputs:   integer usage percentage (0-100) on stdout; idle and iowait
#            both count as "not busy"
# Returns:   0 on success, 1 if the CPU row cannot be read
#######################################
get_cpu_usage() {
  local cpu_id=$1
  local interval=${2:-1}
  local stat_file=${3:-/proc/stat}
  local total1 idle1 total2 idle2 total_diff idle_diff

  # First sample
  if ! read -r total1 idle1 < <(_cpu_sample "$cpu_id" "$stat_file"); then
    echo "get_cpu_usage: cpu${cpu_id} not found in ${stat_file}" >&2
    return 1
  fi

  sleep "$interval"

  # Second sample
  if ! read -r total2 idle2 < <(_cpu_sample "$cpu_id" "$stat_file"); then
    echo "get_cpu_usage: cpu${cpu_id} not found in ${stat_file}" >&2
    return 1
  fi

  total_diff=$((total2 - total1))
  idle_diff=$((idle2 - idle1))

  # No ticks elapsed (or counters went backwards): report 0 instead of
  # dividing by zero.
  if ((total_diff <= 0)); then
    echo "0"
  else
    echo $((100 * (total_diff - idle_diff) / total_diff))
  fi
}
# Usage example: report CPU1 utilisation in an endless loop.
# Each pass measures over a 1-second window, prints one timestamped line,
# then pauses for one more second before the next measurement.
while :; do
  usage="$(get_cpu_usage 1 1)"
  now="$(date '+%H:%M:%S')"
  echo "${now} - CPU1使用率: ${usage}%"
  sleep 1
done
2. 更精确的Python版本
#!/usr/bin/env python3
# cpu_monitor.py
import time
import sys
def get_cpu_times(cpu_id, stat_path='/proc/stat'):
    """Read the cumulative time counters of one CPU row.

    Args:
        cpu_id: CPU index (e.g. 0, 1, 10). An empty string selects the
            aggregate ``cpu`` row.
        stat_path: File in /proc/stat format to read. Defaults to /proc/stat.

    Returns:
        The row's counters as a list of ints, in file order
        (user, nice, system, idle, iowait, irq, softirq, steal, guest,
        guest_nice), or None when the file has no row for that CPU.
    """
    wanted = f'cpu{cpu_id}'
    with open(stat_path, 'r') as f:
        for line in f:
            parts = line.split()
            # Compare the whole first field: a prefix test would let a
            # request for cpu1 pick up the cpu10 row when cpu1 is absent.
            if parts and parts[0] == wanted:
                return [int(x) for x in parts[1:]]
    return None
def calculate_usage(cpu_id, interval=1):
    """Measure the utilisation of one CPU over a sampling window.

    Takes two samples with get_cpu_times, ``interval`` seconds apart, and
    compares them. Idle and iowait time both count as "not busy".

    Args:
        cpu_id: CPU index passed to get_cpu_times.
        interval: Seconds to sleep between the two samples.

    Returns:
        Usage percentage rounded to 2 decimals, 0 when no time elapsed
        between the samples, or None when the CPU row is unavailable.
    """
    def _totals(times):
        # Only the first 8 fields (user..steal) are summed: guest and
        # guest_nice are already included in user and nice, so adding
        # them again would inflate the total.
        return sum(times[:8]), sum(times[3:5])  # (total, idle + iowait)

    # First sample
    times1 = get_cpu_times(cpu_id)
    if not times1:
        print(f"CPU{cpu_id} 不存在")
        return None
    total1, idle1 = _totals(times1)

    time.sleep(interval)

    # Second sample; the row can vanish if the CPU went offline meanwhile
    times2 = get_cpu_times(cpu_id)
    if not times2:
        print(f"CPU{cpu_id} 不存在")
        return None
    total2, idle2 = _totals(times2)

    total_diff = total2 - total1
    idle_diff = idle2 - idle1
    if total_diff <= 0:
        return 0

    usage = 100.0 * (total_diff - idle_diff) / total_diff
    return round(usage, 2)
def detailed_monitor(cpu_id, interval=2):
    """Print a per-category utilisation line for one CPU every ``interval`` seconds.

    Loops until interrupted (the caller handles KeyboardInterrupt) or until
    the CPU row can no longer be read, in which case a message is printed
    and the function returns.

    Args:
        cpu_id: CPU index passed to get_cpu_times.
        interval: Seconds between consecutive samples.
    """
    print(f"监控 CPU{cpu_id},按 Ctrl+C 退出")
    print(f"{'时间':<20} {'用户%':<8} {'系统%':<8} {'空闲%':<8} {'总使用率%':<10}")
    print("="*60)

    # Labels follow the column order of a /proc/stat cpu row.
    labels = ['用户', '低优先级', '系统', '空闲', 'IO等待',
              '硬中断', '软中断', '虚拟化', '虚拟机', '低优先级虚拟机']

    previous = get_cpu_times(cpu_id)
    if not previous:
        print(f"CPU{cpu_id} 不存在")
        return

    while True:
        time.sleep(interval)
        current = get_cpu_times(cpu_id)
        if not current:
            # The row vanished, e.g. the CPU was taken offline.
            print(f"CPU{cpu_id} 不存在")
            return

        # Total over user..steal only: guest/guest_nice are already
        # included in user/nice and must not be counted twice.
        total_diff = sum(current[:8]) - sum(previous[:8])
        if total_diff > 0:
            # Default every label to 0.0 so rows with fewer columns
            # still provide all keys used below.
            metrics = dict.fromkeys(labels, 0.0)
            for label, before, after in zip(labels, previous, current):
                metrics[label] = round(100.0 * (after - before) / total_diff, 2)

            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            usage = 100 - metrics['空闲'] - metrics['IO等待']
            print(f"{timestamp} "
                  f"{metrics['用户']:>6.2f}% "
                  f"{metrics['系统']:>6.2f}% "
                  f"{metrics['空闲']:>6.2f}% "
                  f"{usage:>9.2f}%")

        # Reuse this sample as the start of the next window so no time
        # between iterations goes unmeasured.
        previous = current
if __name__ == "__main__":
    # The optional first command-line argument selects the CPU index;
    # without it CPU0 is monitored.
    cpu_id = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    try:
        detailed_monitor(cpu_id)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the monitor.
        print("\n监控结束")
四、方法三:使用 perf 工具
1. 系统级监控
# Count events on specific CPUs (0 and 1) for the 5 seconds that `sleep 5` runs
sudo perf stat -C 0,1 sleep 5
# Count cache references and cache misses on CPU0 (for the cache hit rate)
sudo perf stat -C 0 -e cache-references,cache-misses sleep 5
2. 实时监控脚本
#!/bin/bash
# perf_cpu_monitor.sh
# Prints timestamped cycles / instructions / cache-misses counters for one
# CPU (or a CPU list such as 0,1 or 0-3), one perf run per interval.
# Usage: perf_cpu_monitor.sh <cpu> [duration_seconds=10] [interval_seconds=1]
CPU_ID=$1
DURATION=${2:-10}
INTERVAL=${3:-1}

# Without a CPU argument, perf would take the next option as the CPU list.
if [[ ! "$CPU_ID" =~ ^[0-9]+([,-][0-9]+)*$ ]]; then
  echo "用法: $0 <CPU编号> [持续秒数] [间隔秒数]" >&2
  exit 1
fi
# Both values feed shell integer arithmetic; an interval of 0 would never
# advance the loop counter.
if [[ ! "$DURATION" =~ ^[0-9]+$ || ! "$INTERVAL" =~ ^[1-9][0-9]*$ ]]; then
  echo "用法: $0 <CPU编号> [持续秒数] [间隔秒数]" >&2
  exit 1
fi

echo "使用 perf 监控 CPU$CPU_ID,持续 $DURATION 秒"

for ((i = 0; i < DURATION; i += INTERVAL)); do
  # Collect counters for INTERVAL seconds. perf stat writes its report to
  # stderr, hence the 2>&1 before filtering.
  sudo perf stat -C "$CPU_ID" -e cycles,instructions,cache-misses sleep "$INTERVAL" 2>&1 \
    | grep -E "(cycles|instructions|cache-misses)" \
    | while read -r line; do
        echo "[$(date '+%H:%M:%S')] CPU$CPU_ID: $line"
      done
done
五、高级应用案例
案例1:监控进程的CPU亲和性使用情况
#!/bin/bash
# monitor_process_cpu_affinity.sh
# Periodically prints a process's CPU affinity mask, the CPU it is currently
# assigned to, and its CPU usage, until the process exits or Ctrl+C is pressed.
# Usage: monitor_process_cpu_affinity.sh <pid>
PID=$1
if [ -z "$PID" ]; then
  echo "用法: $0 <进程ID>"
  exit 1
fi
# /proc/<pid> exists for any live process, including ones owned by other
# users (where `kill -0` would report a permission error).
if [[ ! "$PID" =~ ^[0-9]+$ || ! -d "/proc/$PID" ]]; then
  echo "进程 $PID 不存在" >&2
  exit 1
fi

echo "监控进程 $PID 的CPU使用情况"
echo "按 Ctrl+C 停止"

# Stop when the process is gone instead of looping forever on errors.
while [[ -d "/proc/$PID" ]]; do
  # Affinity mask. taskset prints "pid N's current affinity mask: <hex>";
  # LC_ALL=C keeps that wording stable across locales.
  LC_ALL=C taskset -p "$PID" 2>/dev/null \
    | grep -oP 'affinity mask:\s*\K[0-9a-f]+'

  # /proc/<pid>/stat holds no per-CPU rows, so ps is used directly:
  # psr = processor the process is assigned to, pcpu = %CPU, time = CPU time.
  echo "进程在各CPU核心的时间分布:"
  ps -p "$PID" -o psr,pcpu,time

  # One 1-second pidstat sample; keep only its summary lines. LC_ALL=C
  # guarantees the "Average" label is not translated.
  LC_ALL=C pidstat -p "$PID" 1 1 | grep -A1 "Average"

  sleep 2
  echo "---"
done
echo "进程 $PID 已退出"
案例2:实时CPU热力图
#!/usr/bin/env python3
# cpu_heatmap.py
import os
import time
import sys
from collections import deque
def get_cpu_count():
    """Return how many lines of /proc/cpuinfo begin with 'processor'."""
    count = 0
    with open('/proc/cpuinfo', 'r') as cpuinfo:
        for entry in cpuinfo:
            if entry.startswith('processor'):
                count += 1
    return count
def _read_per_cpu_times(stat_path):
    """Map CPU index -> (total, idle) cumulative ticks for every cpuN row of a stat file."""
    samples = {}
    with open(stat_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            label = parts[0]
            # Keep only per-core rows ("cpu0", "cpu1", ...); this skips the
            # aggregate "cpu" row and unrelated rows such as "intr".
            if not (label.startswith('cpu') and label[3:].isdigit()):
                continue
            times = [int(x) for x in parts[1:]]
            # Total is user..steal only: guest/guest_nice are already part
            # of user/nice. Idle is idle + iowait.
            samples[int(label[3:])] = (sum(times[:8]), sum(times[3:5]))
    return samples


def get_all_cpu_usage(interval=1, stat_path='/proc/stat'):
    """Measure the utilisation of every CPU core over one sampling window.

    The set of cores is taken from the stat file itself, and the two
    samples are matched by CPU index rather than by line position.

    Args:
        interval: Seconds to sleep between the two samples.
        stat_path: File in /proc/stat format to read. Defaults to /proc/stat.

    Returns:
        A list of usage percentages rounded to 1 decimal, ordered by
        ascending CPU index, one entry per core present in the second
        sample. A core with no elapsed time, or one missing from the first
        sample, is reported as 0.0.
    """
    before = _read_per_cpu_times(stat_path)
    time.sleep(interval)
    after = _read_per_cpu_times(stat_path)

    results = []
    for cpu in sorted(after):
        if cpu not in before:
            # Core appeared between the samples: no baseline to compare with.
            results.append(0.0)
            continue
        total_diff = after[cpu][0] - before[cpu][0]
        idle_diff = after[cpu][1] - before[cpu][1]
        if total_diff > 0:
            usage = 100.0 * (total_diff - idle_diff) / total_diff
            results.append(round(usage, 1))
        else:
            results.append(0.0)
    return results
def print_heatmap(usages, width=50):
    """Clear the terminal and draw one coloured usage bar per CPU.

    Args:
        usages: Sequence of usage percentages; position i is labelled CPU i.
        width: Bar length in characters.

    The max / min / average line is printed only when ``usages`` is
    non-empty, so an empty sample draws just the frame.
    """
    os.system('clear' if os.name == 'posix' else 'cls')
    print("=" * 60)
    print("CPU使用率热力图 (刷新间隔: 1秒)")
    print("=" * 60)

    reset = "\033[0m"
    for i, usage in enumerate(usages):
        # Filled part of the bar, clamped so out-of-range values cannot
        # make the bar longer than `width` or negative.
        bars = max(0, min(width, int(usage * width / 100)))
        bar = '█' * bars + '░' * (width - bars)

        # ANSI colour by load level
        if usage > 80:
            color = "\033[91m"  # red
        elif usage > 50:
            color = "\033[93m"  # yellow
        elif usage > 20:
            color = "\033[92m"  # green
        else:
            color = "\033[96m"  # cyan

        print(f"CPU{i:2d}: {color}{bar}{reset} {usage:5.1f}%")

    print("\n图例: ████████ 使用中 | ░░░░░░░░ 空闲")
    if usages:
        # max()/min() and the average are undefined for an empty sample.
        print(f"最高: {max(usages):.1f}% | 最低: {min(usages):.1f}% | 平均: {sum(usages)/len(usages):.1f}%")
def main():
    """Refresh the heatmap in a loop until Ctrl+C.

    Each pass samples all cores over 1 second, redraws the heatmap and,
    once more than one sample is stored, prints each core's average over
    the stored samples (at most the last 10).
    """
    try:
        history = deque(maxlen=10)  # rolling window of the latest 10 samples
        while True:
            usages = get_all_cpu_usage(1)
            history.append(usages)
            print_heatmap(usages)

            # Optional trend section
            if len(history) > 1:
                print("\n趋势 (最近10个点):")
                for core, _ in enumerate(usages):
                    samples = [snapshot[core] for snapshot in history]
                    mean = sum(samples) / len(samples)
                    print(f"CPU{core:2d} 平均: {mean:5.1f}%", end=" | ")
                print()

            time.sleep(0.1)  # short pause between refreshes
    except KeyboardInterrupt:
        print("\n监控结束")
# Start the heatmap loop only when this file is run as a script.
if __name__ == "__main__":
    main()
案例3:CPU绑定的应用监控
#!/bin/bash
# bind_and_monitor.sh
# Starts a program pinned to one CPU core and, while it runs, prints that
# core's utilisation and the process's own CPU usage.
# Usage: bind_and_monitor.sh <program> [args...]
# Env:   BIND_CPU - core to pin to (default: 2)
BIND_CPU=${BIND_CPU:-2}

if [ "$#" -eq 0 ] || [ -z "$1" ]; then
  echo "用法: $0 <程序> [参数...]"
  exit 1
fi

echo "将程序绑定到 CPU$BIND_CPU 并监控"

# Launch the program pinned to the core. "$@" forwards the program together
# with all of its arguments, each as a separate, intact word.
taskset -c "$BIND_CPU" "$@" &
PID=$!
echo "程序 PID: $PID"

# Monitor the core in the background for as long as the program is alive.
(
  while kill -0 "$PID" 2>/dev/null; do
    # Core utilisation = 100 - %idle, taken from mpstat's "Average:" row.
    # LC_ALL=C keeps that row's label and column layout fixed.
    usage=$(LC_ALL=C mpstat -P "$BIND_CPU" 1 1 \
      | awk '$1 == "Average:" && $2 ~ /^[0-9]+$/ { print 100 - $NF }')
    echo "$(date '+%H:%M:%S') - CPU$BIND_CPU 使用率: ${usage}%"
    # CPU usage of the process itself
    process_usage=$(ps -p "$PID" -o %cpu --no-headers)
    echo "进程 $PID CPU使用: ${process_usage}%"
    echo "---"
  done
) &
MONITOR_PID=$!

# Wait for the program, then stop the monitor and pass on the program's
# exit status.
wait "$PID"
status=$?
kill "$MONITOR_PID" 2>/dev/null
wait "$MONITOR_PID" 2>/dev/null
exit "$status"
六、常用工具总结
| 工具 | 用途 | 示例 |
|---|---|---|
| mpstat | 多CPU统计 | mpstat -P 0 1 5 |
| pidstat | 进程CPU统计 | pidstat -p <PID> 1 5 |
| top | 交互式监控 | top -p <PID> |
| htop | 增强版top | htop |
| perf | 性能分析 | perf stat -C 0 sleep 5 |
| taskset | CPU绑定 | taskset -c 0-2 ./program |
| sar | 系统活动报告 | sar -P ALL 1 3 |
七、最佳实践建议
采样间隔选择
- 故障排查:1秒间隔
- 性能分析:5-10秒间隔
- 长期监控:1-5分钟间隔
注意事项
- 区分用户态和内核态使用率
- 注意I/O等待时间的影响
- 在多核系统中,单个进程可能在不同核心间迁移
监控告警阈值建议
- 警告:单个CPU > 80% 持续5分钟
- 严重:单个CPU > 95% 持续2分钟
- 紧急:系统平均负载 > CPU核心数 × 2
这些方法可以满足大多数场景下的特定CPU使用率监控需求,从简单的命令行工具到复杂的脚本监控系统。