
在现代IT基础设施中,Ubuntu Server作为主流Linux发行版,其性能监控是确保系统稳定性和应用性能的关键。本文将深入探讨Ubuntu Server性能监控的高级实践,涵盖原生命令的深度使用和开源监控方案的实现。
动态进程树分析
# Real-time process tree with resource inheritance, sorted by CPU usage.
# (GNU ps: BSD `f` and `--forest` are the same option — one is enough.)
ps aux --forest --sort=-%cpu
# More compact tree view with memory/CPU columns
ps -eo pid,ppid,cmd,%mem,%cpu --forest | head -20
# Resource summary for a specific process family (5 lines of context around nginx)
ps -e -o pid,ppid,pgid,sess,comm --forest | grep -C5 nginx
进程状态深度解析
# Watch for processes entering uninterruptible sleep (state D) — usually
# a sign of storage/NFS trouble; print the offenders plus a timestamp.
while true; do
  ps -eo pid,state,comm | awk '$2 == "D" { print; found = 1 } END { exit !found }' && date
  sleep 1
done
# Zombie-process detection & logging.
# Count and log with the same test ($8 ~ /Z/) so states like "Z+" are
# included — the original counted /Z/ but only logged exact "Z".
zombies=$(ps aux | awk '$8 ~ /Z/' | grep -c .)
if [ "$zombies" -gt 0 ]; then
  echo "发现僵尸进程: $(date)" >> /var/log/zombie_monitor.log
  ps aux | awk '$8 ~ /Z/ {print $2, $11}' >> /var/log/zombie_monitor.log
fi
深入/proc内存分析
# Watch the key /proc/meminfo counters every 2 seconds
watch -n 2 'grep -E "(MemTotal|MemFree|MemAvailable|Buffers|Cached|Slab|SReclaimable|SUnreclaim)" /proc/meminfo'
# Page-in/out and swap-in/out counters (cumulative since boot)
grep -E "(pgpgin|pgpgout|pswpin|pswpout)" /proc/vmstat
# Detailed memory mappings of one nginx process — guard against the case
# where nginx is not running (pgrep prints nothing).
pid=$(pgrep nginx | head -1)
if [ -n "$pid" ]; then
  awk '/Size|Rss|Pss|Shared_Clean|Shared_Dirty/' "/proc/$pid/smaps" | head -20
fi
内存泄漏检测脚本
#!/bin/bash
# memory_leak_detector.sh — log the memory-usage trend every 5 minutes
# and mail an alert when usage crosses the threshold.
set -u

LOG_FILE="/var/log/memory_trend.log"
THRESHOLD_PERCENT=80

while true; do
  # Memory usage percentage from /proc/meminfo (values are in kB)
  mem_total=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
  mem_available=$(awk '/^MemAvailable:/ {print $2}' /proc/meminfo)
  mem_usage_percent=$(( 100 - (mem_available * 100) / mem_total ))
  # Unreclaimable slab — steady growth here often points at a kernel-side leak
  slab_unreclaimable=$(awk '/^SUnreclaim:/ {print $2}' /proc/meminfo)
  # Record the trend datapoint
  echo "$(date): MemUsage=${mem_usage_percent}% SlabUnreclaim=${slab_unreclaimable}kB" >> "$LOG_FILE"
  # Threshold alerting
  if [ "$mem_usage_percent" -gt "$THRESHOLD_PERCENT" ]; then
    echo "内存使用率超过阈值: ${mem_usage_percent}%" | mail -s "内存告警 $(hostname)" admin@example.com
  fi
  sleep 300
done
块设备级I/O分析
# Flag devices with high utilisation or latency in the iostat stream.
# NOTE(review): the %util/await column positions ($14/$10) depend on the
# installed sysstat version's -x output layout — verify locally before use.
iostat -dxmt 2 | awk '
BEGIN { print "Device rrqm/s wrqm/s r/s w/s await svctm %util" }
!/^$/ && !/Linux/ && !/Device/ {
  if ($14 > 80 || $10 > 50)
    print $0 " <--- 注意!"
  else
    print $0
}'
# Accumulated per-process I/O activity (processes only; needs root)
iotop -aoP | head -20
# Per-process I/O counters straight from /proc.
# pgrep matches only nginx processes — `ps aux | grep nginx` would also
# match the grep itself and any command line containing "nginx".
for pid in $(pgrep nginx); do
  echo "PID $pid I/O:"
  cat "/proc/$pid/io"
done
I/O瓶颈诊断脚本
#!/bin/bash
# io_bottleneck_detector.sh — sample /proc/diskstats for sda every
# INTERVAL seconds and print read/write IOPS plus device utilisation.
#
# Bug fixed vs. the original: the previous sample was written as three
# space-separated fields but read back as $4/$8/$13, which are always
# empty in a 3-field file — so every "rate" silently degenerated to the
# cumulative total since boot.
INTERVAL=5
STATE_FILE="/tmp/diskstats_prev"

while true; do
  awk -v interval="$INTERVAL" -v state="$STATE_FILE" '
  $3 == "sda" {
    reads = $4; writes = $8; io_time = $13   # io_time: ms spent doing I/O
    # Load the previous sample (three fields: reads writes io_time).
    # On the first run the file does not exist yet — skip reporting then.
    prev_ok = (getline line < state) > 0
    close(state)
    if (prev_ok) {
      split(line, p, " ")
      read_iops  = (reads   - p[1]) / interval
      write_iops = (writes  - p[2]) / interval
      # busy ms per interval -> percent: delta_ms / (interval * 1000) * 100
      io_util    = (io_time - p[3]) / (interval * 10)
      print strftime("%Y-%m-%d %H:%M:%S") " ReadIOPS:" read_iops " WriteIOPS:" write_iops " IOUtil:" io_util "%"
    }
    # Persist the current sample for the next iteration
    print reads, writes, io_time > state
  }' /proc/diskstats
  sleep "$INTERVAL"
done
TCP连接状态深度监控
# TCP connection-state distribution — a key production health indicator.
# `ss` replaces netstat, which is deprecated and not installed by default
# on modern Ubuntu (net-tools). State names differ slightly (ESTAB vs
# ESTABLISHED).
ss -ant | awk 'NR > 1 {++S[$1]} END {for (a in S) print a, S[a]}'
# Per-connection congestion window, RTT and retransmission details
ss -it | grep -E "cwnd|rtt|retrans"
# Kernel TCP receive/send buffer tuning (min default max, bytes)
cat /proc/sys/net/ipv4/tcp_rmem
cat /proc/sys/net/ipv4/tcp_wmem
网络性能诊断脚本
#!/bin/bash
# network_perf_monitor.sh — log NIC error/drop counters and TCP
# retransmissions once a minute; alert when errors grow too fast.
interface="eth0"
log_file="/var/log/network_stats.log"

# The /sys counters are cumulative since boot, so alerting on the raw
# value (as the original did) fires forever once it ever exceeds 100.
# Alert on the per-interval delta instead.
prev_rx_errors=0
prev_tx_errors=0

while true; do
  # NIC statistics
  rx_errors=$(cat "/sys/class/net/$interface/statistics/rx_errors")
  tx_errors=$(cat "/sys/class/net/$interface/statistics/tx_errors")
  rx_drop=$(cat "/sys/class/net/$interface/statistics/rx_dropped")
  tx_drop=$(cat "/sys/class/net/$interface/statistics/tx_dropped")
  # RetransSegs is field 13 of the second "Tcp:" row (the data row) in /proc/net/snmp
  tcp_retrans=$(awk '/^Tcp:/ {row++} row == 2 {print $13; exit}' /proc/net/snmp)
  # Record to the log
  echo "$(date): RX_Errors=$rx_errors TX_Errors=$tx_errors RX_Drop=$rx_drop TX_Drop=$tx_drop TCP_Retrans=$tcp_retrans" >> "$log_file"
  # Alert if errors increased by more than 100 during this interval
  if [ $((rx_errors - prev_rx_errors)) -gt 100 ] || [ $((tx_errors - prev_tx_errors)) -gt 100 ]; then
    echo "网络错误率异常升高" | mail -s "网络告警 $(hostname)" admin@example.com
  fi
  prev_rx_errors=$rx_errors
  prev_tx_errors=$tx_errors
  sleep 60
done
自定义应用指标暴露
# app_metrics.py — expose Flask application performance metrics to Prometheus.
from prometheus_client import Counter, Histogram, Gauge, generate_latest
# Bug fixed: `request` is used below but was missing from this import.
from flask import Flask, Response, request
import time

app = Flask(__name__)

# Custom application metrics
REQUEST_COUNT = Counter('app_requests_total', 'Total HTTP Requests', ['method', 'endpoint', 'status'])
REQUEST_LATENCY = Histogram('app_request_latency_seconds', 'Request latency', ['endpoint'])
ACTIVE_USERS = Gauge('app_active_users', 'Active users')


@app.route('/metrics')
def metrics():
    """Prometheus scrape endpoint."""
    return Response(generate_latest(), mimetype='text/plain')


@app.before_request
def before_request():
    # Stamp the request so after_request can compute the latency.
    request.start_time = time.time()


@app.after_request
def after_request(response):
    """Record request count and latency for every response."""
    latency = time.time() - request.start_time
    REQUEST_COUNT.labels(request.method, request.path, response.status_code).inc()
    REQUEST_LATENCY.labels(request.path).observe(latency)
    return response
高级PromQL查询示例
# 95th-percentile response time per endpoint over the last 5 minutes
histogram_quantile(0.95,
  sum(rate(app_request_latency_seconds_bucket[5m])) by (le, endpoint)
)
# Filesystems predicted to run out of free space within 7 days
predict_linear(node_filesystem_free_bytes[6h], 3600 * 24 * 7) < 0
# Service availability (SLA, percent) over a 30-day window
avg_over_time(up{job="api-server"}[30d]) * 100
多数据中心监控配置
# prometheus.yml — federation across data centres.
# (Indentation restored: the flattened original was not valid YAML.)
global:
  scrape_interval: 15s

scrape_configs:
  # Pull aggregated series from each per-DC Prometheus via /federate
  - job_name: 'federation'
    honor_labels: true
    metrics_path: '/federate'
    params:
      'match[]':
        - '{job=~".+"}'
    static_configs:
      - targets:
          - 'prometheus-dc1:9090'
          - 'prometheus-dc2:9090'
  - job_name: 'cross-dc-alert'
    static_configs:
      - targets: ['alertmanager:9093']
分级监控策略
# Monitoring tiers: resources covered, scrape interval, retention and
# alert channels per tier. (Indentation restored — flattened original
# was not valid YAML.)
monitoring_levels:
  critical:
    resources: [cpu, memory, disk, network]
    interval: "15s"
    retention: "30d"
    alerts: [pager_duty, email, slack]
  important:
    resources: [application_metrics, business_metrics]
    interval: "1m"
    retention: "90d"
    alerts: [email, slack]
  informational:
    resources: [log_analysis, user_behavior]
    interval: "5m"
    retention: "1y"
    alerts: [slack]
基于机器学习的异常检测
# anomaly_detector.py — Isolation-Forest based metric anomaly detection.
from sklearn.ensemble import IsolationForest
import numpy as np
import pandas as pd


class PerformanceAnomalyDetector:
    """Flags anomalous performance samples with an Isolation Forest.

    Call ``train`` with historical metric rows before ``detect_anomalies``;
    until trained, every sample is reported as normal.
    """

    def __init__(self):
        # contamination=0.1: assume roughly 10% of training samples are outliers
        self.model = IsolationForest(contamination=0.1)
        self.is_fitted = False

    def train(self, historical_data):
        """Fit the anomaly-detection model on historical metric data."""
        self.model.fit(historical_data)
        self.is_fitted = True

    def detect_anomalies(self, current_metrics):
        """Return a boolean array — True where a sample looks anomalous."""
        if not self.is_fitted:
            # Untrained model: report everything as normal rather than guessing.
            return np.array([False] * len(current_metrics))
        # IsolationForest.predict returns -1 for outliers, 1 for inliers.
        predictions = self.model.predict(current_metrics)
        return predictions == -1
Kubernetes集群深度监控
# k8s-monitoring-config.yml — Prometheus config shipped as a ConfigMap.
apiVersion: v1
kind: ConfigMap
metadata:
  name: custom-kube-monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
    rule_files:
      - /etc/prometheus/rules/*.yml
    scrape_configs:
      - job_name: 'kubernetes-pods'
        # Bug fixed: the original used `kubernetes_sources`, which is not
        # a valid Prometheus key — service discovery is configured with
        # `kubernetes_sd_configs`.
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          # Only scrape pods annotated prometheus.io/scrape: "true"
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          # Honour a custom metrics path from the prometheus.io/path annotation
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            target_label: __metrics_path__
            regex: (.+)
业务级监控视图
{
  "dashboard": {
    "title": "业务性能全景图",
    "panels": [
      {
        "title": "用户转化漏斗",
        "type": "bargauge",
        "targets": [
          {
            "expr": "sum(rate(user_registration_completed[5m])) / sum(rate(user_registration_started[5m])) * 100",
            "legendFormat": "注册转化率"
          }
        ]
      }
    ]
  }
}
# report_generator.py
import pandas as pd
from datetime import datetime, timedelta


def generate_performance_report(start_date, end_date):
    """Build a combined performance report for the given date range.

    NOTE(review): relies on ``query_prometheus``, ``query_business_db``,
    ``generate_executive_summary``, ``calculate_performance_trends`` and
    ``generate_optimization_recommendations`` defined elsewhere in the
    project — confirm their signatures before use.
    """
    # Aggregate raw data from the individual sources
    system_metrics = query_prometheus('system_metrics', start_date, end_date)
    app_metrics = query_prometheus('app_metrics', start_date, end_date)
    business_metrics = query_business_db(start_date, end_date)

    # Assemble the final report structure
    report = {
        'summary': generate_executive_summary(system_metrics, app_metrics, business_metrics),
        'trends': calculate_performance_trends(system_metrics),
        'recommendations': generate_optimization_recommendations(),
    }
    return report
# 监控监控系统(Meta-monitoring)
#!/bin/bash
# monitor_the_monitor.sh — meta-monitoring: make sure the monitoring
# stack itself (Prometheus) is healthy and has storage headroom.

check_prometheus_health() {
  # /-/healthy answers HTTP 200 when Prometheus is up; rely on curl -f's
  # exit status instead of grepping for an exact body string ("Prometheus
  # is Healthy"), which varies between Prometheus versions.
  if ! curl -sf http://localhost:9090/-/healthy >/dev/null; then
    echo "Prometheus健康检查失败" | mail -s "监控系统告警" admin@example.com
  fi
}

check_storage_usage() {
  local usage
  usage=$(df /prometheus | awk 'NR==2 {print $5}' | sed 's/%//')
  if [ "$usage" -gt 85 ]; then
    echo "Prometheus存储使用率超过85%" | mail -s "存储告警" admin@example.com
  fi
}

# The original defined these checks but never ran them.
check_prometheus_health
check_storage_usage
Ubuntu Server性能监控是一个需要深度实践和持续优化的领域。通过结合原生命令的精细监控和开源监控系统的规模化能力,可以构建出适应不同场景的完整监控体系。关键在于理解业务需求,制定合理的监控策略,并建立完善的告警和响应机制。
在生产环境中,监控不仅是技术问题,更是组织流程问题。建立完善的监控文化,确保监控数据的有效利用,才能真正发挥监控系统的价值。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。