[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"$f5mW_TQgz-CfuCiRuluKevpCEg-bLp_5wZyjiMuwL4co":3,"$fJU-4tot_gC5fDkujNeoE-cGsdMy5V_KcdUXLuAnTFgw":16,"$fvU5DcS4qHN5sBGg7erm7SP7ZJAV8AxV6yi7gXetJaUg":423},{"slug":4,"title":5,"description":6,"content":7,"content_html":8,"pub_date":9,"tags":10,"draft":15},"linux-performance-tuning","Linux 性能调优实战：从 top 到 perf 的完整工具链","遇到性能问题不知道从哪下手？这篇建立系统化的排查思路，从 CPU\u002F内存\u002FIO\u002F网络逐层分析。","# Linux 性能调优实战：从 top 到 perf 的完整工具链\n\n> 性能问题的排查从来不是\"运行一个命令\"，而是系统性地缩小问题范围。\n\n## 方法论：USE 方法\n\n每个资源（CPU、内存、IO、网络）都检查三个维度：\n\n- **U**tilization：使用率（该资源有多忙？）\n- **S**aturation：饱和度（是否有请求在排队等待？）\n- **E**rrors：错误（是否有操作失败？）\n\n从宏观到微观：先确定瓶颈在哪个资源层，再深入该层排查。\n\n## CPU 分析\n\n### 基础工具\n\n```bash\n# top：实时概览\n# 关键指标：us（用户态）、sy（内核态）、wa（IO等待）、si（软中断）\ntop -H  # 显示线程级别\n\n# 负载均值的含义\n# Load Average: 1.23, 0.87, 0.65 （1分钟, 5分钟, 15分钟）\n# 单核 CPU：load=1 表示满载，>1 表示有任务在排队\n# 4核 CPU：load=4 才是满载\n# 经验法则：load \u002F 核数 > 0.7 需要关注\nnproc   # 查看核数\nuptime  # 查看负载均值\n```\n\n```bash\n# mpstat：多核 CPU 详情\napt install sysstat\nmpstat -P ALL 1  # 每秒刷新，显示所有核\n\n# 输出示例\n# CPU  %usr  %sys  %iowait  %idle\n# all  45.2   8.1     12.3   34.4\n# 0    89.1  10.9      0.0    0.0  \u003C- 某个核跑满了\n```\n\n```bash\n# 上下文切换分析\nvmstat 1\n# r: 运行队列（等待 CPU 的进程数），持续 > 核数说明 CPU 紧张\n# b: 阻塞进程数（等待 IO）\n# cs: 每秒上下文切换次数，过高（>10万）说明线程太多\n\npidstat -w -p \u003CPID> 1  # 进程级上下文切换\n```\n\n### perf：性能分析利器\n\n```bash\n# 安装\napt install linux-perf\n\n# 采样 CPU 热点（30秒）\nperf record -g -F 99 -p \u003CPID> sleep 30\nperf report  # 交互式查看\n\n# 全系统热点\nperf top -g  # 实时显示热函数\n\n# 统计特定事件\nperf stat -e cache-misses,cache-references,instructions,cycles .\u002Fyour_program\n```\n\n### 火焰图\n\n```bash\n# 1. 用 perf 采集\nperf record -g -F 99 -p \u003CPID> sleep 60\nperf script > out.perf\n\n# 2. 
用 FlameGraph 生成 SVG\ngit clone https:\u002F\u002Fgithub.com\u002Fbrendangregg\u002FFlameGraph\n.\u002FFlameGraph\u002Fstackcollapse-perf.pl out.perf > out.folded\n.\u002FFlameGraph\u002Fflamegraph.pl out.folded > flamegraph.svg\n# 宽度代表 CPU 时间占比，越宽越值得优化\n```\n\n## 内存分析\n\n```bash\n# free：内存使用概览\nfree -h\n# Mem: total 16G, used 8G, free 2G, buff\u002Fcache 6G, available 8G\n# \"available\" 才是实际可用（包含可回收的 cache），不是 free\n\n# vmstat：虚拟内存统计\nvmstat 1\n# si\u002Fso: swap in\u002Fout（非零说明内存紧张，性能急剧下降）\n\n# 进程内存详情\ncat \u002Fproc\u002F\u003CPID>\u002Fsmaps | grep -E \"^(Pss|Rss|Size)\" | \\\n    awk '{sum[$1]+=$2} END{for(k in sum)print k, sum[k]\u002F1024, \"MB\"}'\n# RSS: 实际占用物理内存\n# PSS: 按比例分配共享内存后的大小（更准确）\n```\n\n### OOM Killer\n\n```bash\n# OOM 被触发时的日志\ndmesg | grep -i \"oom\\|killed process\"\njournalctl -k | grep -i oom\n\n# 输出示例：\n# Out of memory: Kill process 12345 (java) score 892 or sacrifice child\n# Killed process 12345 (java) total-vm:4096MB, anon-rss:3800MB\n\n# 调整 OOM 优先级（-1000 到 1000，越高越先被杀）\necho 500 > \u002Fproc\u002F\u003CPID>\u002Foom_score_adj   # 提高被杀概率\necho -500 > \u002Fproc\u002F\u003CPID>\u002Foom_score_adj  # 降低被杀概率\n```\n\n### 内存泄漏定位\n\n```bash\n# valgrind（C\u002FC++）\nvalgrind --leak-check=full --track-origins=yes .\u002Fprogram\n\n# Python：tracemalloc\nimport tracemalloc\ntracemalloc.start()\n# ... 
运行一段时间\nsnapshot = tracemalloc.take_snapshot()\ntop_stats = snapshot.statistics('lineno')\nfor stat in top_stats[:10]:\n    print(stat)\n```\n\n## IO 分析\n\n```bash\n# iostat：磁盘 IO 统计\niostat -xz 1\n# %util: 磁盘利用率（接近100%说明磁盘是瓶颈）\n# await: 平均等待时间（ms），SSD 应 \u003C1ms，HDD 应 \u003C10ms\n# r\u002Fs, w\u002Fs: 每秒读写次数（IOPS）\n\n# iotop：进程级 IO\niotop -o  # 只显示有 IO 的进程\niotop -a  # 显示累计 IO 而非速率\n\n# 打开文件\nlsof -p \u003CPID>\n```\n\n### Page Cache 机制\n\n```bash\n# Linux 大量使用 Page Cache 加速 IO\ncat \u002Fproc\u002Fmeminfo | grep -E \"Cached|Buffers|Dirty|Writeback\"\n# Dirty: 待写入磁盘的脏页\n# Writeback: 正在写入磁盘的页\n\n# 手动清理 cache（测试场景）\necho 3 > \u002Fproc\u002Fsys\u002Fvm\u002Fdrop_caches\n\n# 调整脏页刷新策略\nsysctl -w vm.dirty_ratio=10\nsysctl -w vm.dirty_background_ratio=5\n```\n\n## 网络分析\n\n```bash\n# ss：替代 netstat，更快\nss -tunap  # 显示 TCP\u002FUDP 连接\nss -s      # 连接统计\n\n# TIME_WAIT 大量堆积\nss -tan state time-wait | wc -l\n# 解决方案\nsysctl -w net.ipv4.tcp_tw_reuse=1\nsysctl -w net.ipv4.tcp_fin_timeout=30\n\n# 网络吞吐监控\niftop -i eth0  # 实时带宽\nnethogs        # 进程级流量\n```\n\n### TCP 状态机\n\n```\nCLOSED -> LISTEN -> SYN_RCVD -> ESTABLISHED -> FIN_WAIT_1 -> FIN_WAIT_2 -> TIME_WAIT -> CLOSED\n                                             |\n                                        CLOSE_WAIT -> LAST_ACK -> CLOSED\n```\n\n常见异常状态：\n- 大量 `CLOSE_WAIT`：服务端没有正常关闭连接（代码 bug）\n- 大量 `TIME_WAIT`：短连接太多，考虑连接池\n- 大量 `SYN_RECV`：可能遭受 SYN Flood 攻击\n\n## 系统调用追踪\n\n```bash\n# strace：跟踪进程的系统调用\nstrace -p \u003CPID>\nstrace -c .\u002Fprogram                      # 统计频次\nstrace -e trace=open,read,write .\u002Fprogram\nstrace -T .\u002Fprogram                      # 显示每个调用耗时\n\n# ltrace：跟踪库函数调用\nltrace -p \u003CPID>\nltrace -e malloc+free .\u002Fprogram\n```\n\n## 调优手段\n\n### sysctl 参数\n\n```bash\n# 网络优化\nsysctl -w net.core.somaxconn=65535\nsysctl -w net.ipv4.tcp_max_syn_backlog=65535\nsysctl -w net.core.rmem_max=16777216\nsysctl -w net.core.wmem_max=16777216\n\n# 持久化\necho \"net.core.somaxconn=65535\" >> 
\u002Fetc\u002Fsysctl.conf\nsysctl -p\n```\n\n### ulimit\n\n```bash\nulimit -n 65535  # 文件描述符数量\n\n# 持久化 \u002Fetc\u002Fsecurity\u002Flimits.conf\necho \"* soft nofile 65535\" >> \u002Fetc\u002Fsecurity\u002Flimits.conf\necho \"* hard nofile 65535\" >> \u002Fetc\u002Fsecurity\u002Flimits.conf\n```\n\n### CPU 亲和性\n\n```bash\ntaskset -c 0,1 .\u002Fprogram      # 绑定到 core 0 和 1\ntaskset -p -c 2,3 \u003CPID>       # 修改运行中进程\n\n# NUMA 架构优化\nnumactl --cpunodebind=0 --membind=0 .\u002Fprogram\n```\n\n## 实际案例：iowait 高导致响应慢\n\n**现象：** Web 服务响应时间从正常 50ms 飙升到 2s+，偶发性。\n\n**排查过程：**\n\n```bash\n# Step 1: top 查看全局\n# wa: 67%  \u003C- iowait 极高，IO 是瓶颈\n\n# Step 2: iostat 定位磁盘\niostat -xz 1\n# sda: %util 99.1%, await 234ms  \u003C- sda 磁盘几乎满载\n\n# Step 3: iotop 定位进程\niotop -o\n# 12345  java   123 MB\u002Fs  56 MB\u002Fs  \u003C- Java 进程疯狂写 IO\n\n# Step 4: lsof 查看写的是什么文件\nlsof -p 12345 | grep REG\n# \u002Fvar\u002Flog\u002Fapp\u002Fapp.log (size: 45GB)  \u003C- 日志文件巨大\n\n# 根因：日志级别错误设置为 DEBUG，产生大量日志\n# 解决：改回 INFO + 配置 logrotate + 使用异步日志\n```\n\n## 常用命令速查表\n\n| 目的 | 命令 |\n|------|------|\n| CPU 总览 | `top`, `htop` |\n| 多核详情 | `mpstat -P ALL 1` |\n| CPU 热点 | `perf top -g` |\n| 内存概览 | `free -h` |\n| IO 监控 | `iostat -xz 1` |\n| 进程 IO | `iotop -o` |\n| 打开文件 | `lsof -p \u003CPID>` |\n| 网络连接 | `ss -tunap` |\n| 带宽监控 | `iftop -i eth0` |\n| 系统调用 | `strace -p \u003CPID>` |\n| 负载历史 | `sar -u 1 10` |\n| 全局概览 | `dstat -cdngy` |\n\n记住：性能调优是**测量驱动**的，不要凭直觉优化，先找到真正的瓶颈。\n","\u003Ch1>Linux 性能调优实战：从 top 到 perf 的完整工具链\u003C\u002Fh1>\n\u003Cblockquote>\n\u003Cp>性能问题的排查从来不是&quot;运行一个命令&quot;，而是系统性地缩小问题范围。\u003C\u002Fp>\n\u003C\u002Fblockquote>\n\u003Ch2 id=\"方法论-use-方法\">方法论：USE 
方法\u003C\u002Fh2>\n\u003Cp>每个资源（CPU、内存、IO、网络）都检查三个维度：\u003C\u002Fp>\n\u003Cul>\n\u003Cli>\u003Cstrong>U\u003C\u002Fstrong>tilization：使用率（该资源有多忙？）\u003C\u002Fli>\n\u003Cli>\u003Cstrong>S\u003C\u002Fstrong>aturation：饱和度（是否有请求在排队等待？）\u003C\u002Fli>\n\u003Cli>\u003Cstrong>E\u003C\u002Fstrong>rrors：错误（是否有操作失败？）\u003C\u002Fli>\n\u003C\u002Ful>\n\u003Cp>从宏观到微观：先确定瓶颈在哪个资源层，再深入该层排查。\u003C\u002Fp>\n\u003Ch2 id=\"cpu-分析\">CPU 分析\u003C\u002Fh2>\n\u003Ch3 id=\"基础工具\">基础工具\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\"># top：实时概览\n# 关键指标：us（用户态）、sy（内核态）、wa（IO等待）、si（软中断）\ntop -H  # 显示线程级别\n\n# 负载均值的含义\n# Load Average: 1.23, 0.87, 0.65 （1分钟, 5分钟, 15分钟）\n# 单核 CPU：load=1 表示满载，&gt;1 表示有任务在排队\n# 4核 CPU：load=4 才是满载\n# 经验法则：load \u002F 核数 &gt; 0.7 需要关注\nnproc   # 查看核数\nuptime  # 查看负载均值\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Cpre>\u003Ccode class=\"language-bash\"># mpstat：多核 CPU 详情\napt install sysstat\nmpstat -P ALL 1  # 每秒刷新，显示所有核\n\n# 输出示例\n# CPU  %usr  %sys  %iowait  %idle\n# all  45.2   8.1     12.3   34.4\n# 0    89.1  10.9      0.0    0.0  &lt;- 某个核跑满了\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Cpre>\u003Ccode class=\"language-bash\"># 上下文切换分析\nvmstat 1\n# r: 运行队列（等待 CPU 的进程数），持续 &gt; 核数说明 CPU 紧张\n# b: 阻塞进程数（等待 IO）\n# cs: 每秒上下文切换次数，过高（&gt;10万）说明线程太多\n\npidstat -w -p &lt;PID&gt; 1  # 进程级上下文切换\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch3 id=\"perf-性能分析利器\">perf：性能分析利器\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\"># 安装\napt install linux-perf\n\n# 采样 CPU 热点（30秒）\nperf record -g -F 99 -p &lt;PID&gt; sleep 30\nperf report  # 交互式查看\n\n# 全系统热点\nperf top -g  # 实时显示热函数\n\n# 统计特定事件\nperf stat -e cache-misses,cache-references,instructions,cycles .\u002Fyour_program\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch3 id=\"火焰图\">火焰图\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\"># 1. 用 perf 采集\nperf record -g -F 99 -p &lt;PID&gt; sleep 60\nperf script &gt; out.perf\n\n# 2. 
用 FlameGraph 生成 SVG\ngit clone https:\u002F\u002Fgithub.com\u002Fbrendangregg\u002FFlameGraph\n.\u002FFlameGraph\u002Fstackcollapse-perf.pl out.perf &gt; out.folded\n.\u002FFlameGraph\u002Fflamegraph.pl out.folded &gt; flamegraph.svg\n# 宽度代表 CPU 时间占比，越宽越值得优化\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch2 id=\"内存分析\">内存分析\u003C\u002Fh2>\n\u003Cpre>\u003Ccode class=\"language-bash\"># free：内存使用概览\nfree -h\n# Mem: total 16G, used 8G, free 2G, buff\u002Fcache 6G, available 8G\n# &quot;available&quot; 才是实际可用（包含可回收的 cache），不是 free\n\n# vmstat：虚拟内存统计\nvmstat 1\n# si\u002Fso: swap in\u002Fout（非零说明内存紧张，性能急剧下降）\n\n# 进程内存详情\ncat \u002Fproc\u002F&lt;PID&gt;\u002Fsmaps | grep -E &quot;^(Pss|Rss|Size)&quot; | \\\n    awk '{sum[$1]+=$2} END{for(k in sum)print k, sum[k]\u002F1024, &quot;MB&quot;}'\n# RSS: 实际占用物理内存\n# PSS: 按比例分配共享内存后的大小（更准确）\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch3 id=\"oom-killer\">OOM Killer\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\"># OOM 被触发时的日志\ndmesg | grep -i &quot;oom\\|killed process&quot;\njournalctl -k | grep -i oom\n\n# 输出示例：\n# Out of memory: Kill process 12345 (java) score 892 or sacrifice child\n# Killed process 12345 (java) total-vm:4096MB, anon-rss:3800MB\n\n# 调整 OOM 优先级（-1000 到 1000，越高越先被杀）\necho 500 &gt; \u002Fproc\u002F&lt;PID&gt;\u002Foom_score_adj   # 提高被杀概率\necho -500 &gt; \u002Fproc\u002F&lt;PID&gt;\u002Foom_score_adj  # 降低被杀概率\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch3 id=\"内存泄漏定位\">内存泄漏定位\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\"># valgrind（C\u002FC++）\nvalgrind --leak-check=full --track-origins=yes .\u002Fprogram\n\n# Python：tracemalloc\nimport tracemalloc\ntracemalloc.start()\n# ... 
运行一段时间\nsnapshot = tracemalloc.take_snapshot()\ntop_stats = snapshot.statistics('lineno')\nfor stat in top_stats[:10]:\n    print(stat)\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch2 id=\"io-分析\">IO 分析\u003C\u002Fh2>\n\u003Cpre>\u003Ccode class=\"language-bash\"># iostat：磁盘 IO 统计\niostat -xz 1\n# %util: 磁盘利用率（接近100%说明磁盘是瓶颈）\n# await: 平均等待时间（ms），SSD 应 &lt;1ms，HDD 应 &lt;10ms\n# r\u002Fs, w\u002Fs: 每秒读写次数（IOPS）\n\n# iotop：进程级 IO\niotop -o  # 只显示有 IO 的进程\niotop -a  # 显示累计 IO 而非速率\n\n# 打开文件\nlsof -p &lt;PID&gt;\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch3 id=\"page-cache-机制\">Page Cache 机制\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\"># Linux 大量使用 Page Cache 加速 IO\ncat \u002Fproc\u002Fmeminfo | grep -E &quot;Cached|Buffers|Dirty|Writeback&quot;\n# Dirty: 待写入磁盘的脏页\n# Writeback: 正在写入磁盘的页\n\n# 手动清理 cache（测试场景）\necho 3 &gt; \u002Fproc\u002Fsys\u002Fvm\u002Fdrop_caches\n\n# 调整脏页刷新策略\nsysctl -w vm.dirty_ratio=10\nsysctl -w vm.dirty_background_ratio=5\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch2 id=\"网络分析\">网络分析\u003C\u002Fh2>\n\u003Cpre>\u003Ccode class=\"language-bash\"># ss：替代 netstat，更快\nss -tunap  # 显示 TCP\u002FUDP 连接\nss -s      # 连接统计\n\n# TIME_WAIT 大量堆积\nss -tan state time-wait | wc -l\n# 解决方案\nsysctl -w net.ipv4.tcp_tw_reuse=1\nsysctl -w net.ipv4.tcp_fin_timeout=30\n\n# 网络吞吐监控\niftop -i eth0  # 实时带宽\nnethogs        # 进程级流量\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch3 id=\"tcp-状态机\">TCP 状态机\u003C\u002Fh3>\n\u003Cpre>\u003Ccode>CLOSED -&gt; LISTEN -&gt; SYN_RCVD -&gt; ESTABLISHED -&gt; FIN_WAIT_1 -&gt; FIN_WAIT_2 -&gt; TIME_WAIT -&gt; CLOSED\n                                             |\n                                        CLOSE_WAIT -&gt; LAST_ACK -&gt; CLOSED\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Cp>常见异常状态：\u003C\u002Fp>\n\u003Cul>\n\u003Cli>大量 \u003Ccode>CLOSE_WAIT\u003C\u002Fcode>：服务端没有正常关闭连接（代码 bug）\u003C\u002Fli>\n\u003Cli>大量 \u003Ccode>TIME_WAIT\u003C\u002Fcode>：短连接太多，考虑连接池\u003C\u002Fli>\n\u003Cli>大量 
\u003Ccode>SYN_RECV\u003C\u002Fcode>：可能遭受 SYN Flood 攻击\u003C\u002Fli>\n\u003C\u002Ful>\n\u003Ch2 id=\"系统调用追踪\">系统调用追踪\u003C\u002Fh2>\n\u003Cpre>\u003Ccode class=\"language-bash\"># strace：跟踪进程的系统调用\nstrace -p &lt;PID&gt;\nstrace -c .\u002Fprogram                      # 统计频次\nstrace -e trace=open,read,write .\u002Fprogram\nstrace -T .\u002Fprogram                      # 显示每个调用耗时\n\n# ltrace：跟踪库函数调用\nltrace -p &lt;PID&gt;\nltrace -e malloc+free .\u002Fprogram\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch2 id=\"调优手段\">调优手段\u003C\u002Fh2>\n\u003Ch3 id=\"sysctl-参数\">sysctl 参数\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\"># 网络优化\nsysctl -w net.core.somaxconn=65535\nsysctl -w net.ipv4.tcp_max_syn_backlog=65535\nsysctl -w net.core.rmem_max=16777216\nsysctl -w net.core.wmem_max=16777216\n\n# 持久化\necho &quot;net.core.somaxconn=65535&quot; &gt;&gt; \u002Fetc\u002Fsysctl.conf\nsysctl -p\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch3 id=\"ulimit\">ulimit\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\">ulimit -n 65535  # 文件描述符数量\n\n# 持久化 \u002Fetc\u002Fsecurity\u002Flimits.conf\necho &quot;* soft nofile 65535&quot; &gt;&gt; \u002Fetc\u002Fsecurity\u002Flimits.conf\necho &quot;* hard nofile 65535&quot; &gt;&gt; \u002Fetc\u002Fsecurity\u002Flimits.conf\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch3 id=\"cpu-亲和性\">CPU 亲和性\u003C\u002Fh3>\n\u003Cpre>\u003Ccode class=\"language-bash\">taskset -c 0,1 .\u002Fprogram      # 绑定到 core 0 和 1\ntaskset -p -c 2,3 &lt;PID&gt;       # 修改运行中进程\n\n# NUMA 架构优化\nnumactl --cpunodebind=0 --membind=0 .\u002Fprogram\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch2 id=\"实际案例-iowait-高导致响应慢\">实际案例：iowait 高导致响应慢\u003C\u002Fh2>\n\u003Cp>\u003Cstrong>现象：\u003C\u002Fstrong> Web 服务响应时间从正常 50ms 飙升到 2s+，偶发性。\u003C\u002Fp>\n\u003Cp>\u003Cstrong>排查过程：\u003C\u002Fstrong>\u003C\u002Fp>\n\u003Cpre>\u003Ccode class=\"language-bash\"># Step 1: top 查看全局\n# wa: 67%  &lt;- iowait 极高，IO 是瓶颈\n\n# Step 2: iostat 定位磁盘\niostat -xz 1\n# sda: %util 99.1%, await 
234ms  &lt;- sda 磁盘几乎满载\n\n# Step 3: iotop 定位进程\niotop -o\n# 12345  java   123 MB\u002Fs  56 MB\u002Fs  &lt;- Java 进程疯狂写 IO\n\n# Step 4: lsof 查看写的是什么文件\nlsof -p 12345 | grep REG\n# \u002Fvar\u002Flog\u002Fapp\u002Fapp.log (size: 45GB)  &lt;- 日志文件巨大\n\n# 根因：日志级别错误设置为 DEBUG，产生大量日志\n# 解决：改回 INFO + 配置 logrotate + 使用异步日志\n\u003C\u002Fcode>\u003C\u002Fpre>\n\u003Ch2 id=\"常用命令速查表\">常用命令速查表\u003C\u002Fh2>\n\u003Ctable>\n\u003Cthead>\n\u003Ctr>\n\u003Cth>目的\u003C\u002Fth>\n\u003Cth>命令\u003C\u002Fth>\n\u003C\u002Ftr>\n\u003C\u002Fthead>\n\u003Ctbody>\n\u003Ctr>\n\u003Ctd>CPU 总览\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>top\u003C\u002Fcode>, \u003Ccode>htop\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>多核详情\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>mpstat -P ALL 1\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>CPU 热点\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>perf top -g\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>内存概览\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>free -h\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>IO 监控\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>iostat -xz 1\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>进程 IO\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>iotop -o\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>打开文件\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>lsof -p &lt;PID&gt;\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>网络连接\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>ss -tunap\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>带宽监控\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>iftop -i eth0\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>系统调用\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>strace -p &lt;PID&gt;\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>负载历史\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>sar -u 1 
10\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>\n\u003Ctd>全局概览\u003C\u002Ftd>\n\u003Ctd>\u003Ccode>dstat -cdngy\u003C\u002Fcode>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003C\u002Ftbody>\n\u003C\u002Ftable>\n\u003Cp>记住：性能调优是\u003Cstrong>测量驱动\u003C\u002Fstrong>的，不要凭直觉优化，先找到真正的瓶颈。\u003C\u002Fp>\n","2026-05-03",[11,12,13,14],"linux","性能","运维","系统编程",false,[17,30,40,52,61,68,75,82,89,96,106,109,119,128,136,144,153,162,171,181,188,198,204,211,217,226,233,240,248,258,267,276,286,296,306,314,324,335,345,354,362,368,376,384,392,400,408,415],{"slug":18,"title":19,"description":20,"pub_date":21,"tags":22,"draft":15,"word_count":29},"ide-skills-guide","Agent Skills 完全指南：21 款第三方 Skill 深度评测与使用心得","全面评测 21 款第三方 Agent Skills，涵盖 Vue 生态、前端设计、构建工具、实用工具四大分类。从安装配置到实际使用场景，带你了解每个 Skill 的功能特点、最佳实践与使用心得。","2026-06-15",[23,24,25,26,27,28],"agent","skills","AI","效率工具","前端","Vue",4169,{"slug":31,"title":32,"description":33,"pub_date":34,"tags":35,"draft":15,"word_count":39},"linux-kernel-skeleton-struct-funcptr-container_of","Linux 内核骨架：struct、函数指针与 container_of","读懂 Linux 内核源码的三件套：巨大的 struct 组合代替继承、函数指针表实现虚派发、container_of 宏从嵌入成员找回完整对象。","2026-05-09",[11,36,37,38],"kernel","C","container_of",1369,{"slug":41,"title":42,"description":43,"pub_date":44,"tags":45,"draft":15,"word_count":51},"astro-complete-guide-2025","Astro 5 深度剖析：Islands 架构原理、构建优化与 Cloudflare Workers 边缘部署","从编译器视角解析 Astro 5 的 Islands 架构实现原理，Content Layer API 的 Vite 插件机制，Server Islands 的流式渲染，以及如何在 Cloudflare Workers + D1 边缘环境下榨干性能。","2026-05-08",[46,47,48,49,50],"astro","frontend","cloudflare","performance","architecture",3663,{"slug":53,"title":54,"description":55,"pub_date":9,"tags":56,"draft":15,"word_count":60},"llm-prompt-engineering","Prompt Engineering 实战：让 LLM 真正听话的技巧","System prompt 怎么写、Few-shot 怎么设计、Chain-of-Thought 原理，以及常见失败模式和调试方法。",[57,58,59],"ai","llm","工程实践",1723,{"slug":62,"title":63,"description":64,"pub_date":9,"tags":65,"draft":15,"word_count":67},"rag-system-design","RAG 系统设计：从 naive 到 
production-ready","Retrieval-Augmented Generation 不只是「向量数据库 + LLM」，分块策略、召回质量、重排序、缓存才是工程核心。",[57,66,58,59],"rag",1613,{"slug":69,"title":70,"description":71,"pub_date":9,"tags":72,"draft":15,"word_count":74},"git-advanced-workflow","Git 进阶工作流：rebase、cherry-pick、bisect 的正确使用","merge 会了，但 rebase 总搞错？bisect 找 bug 提交？interactive rebase 整理历史？这篇一次说清楚。",[73,59],"git",1396,{"slug":76,"title":77,"description":78,"pub_date":9,"tags":79,"draft":15,"word_count":81},"docker-practical-guide","Docker 实战：从会用到用好","会 docker run 不够，Dockerfile 最佳实践、多阶段构建、Compose 编排、镜像瘦身才是日常真正需要的。",[80,11,59],"docker",1268,{"slug":83,"title":84,"description":85,"pub_date":9,"tags":86,"draft":15,"word_count":88},"anthropics-skills-guide","anthropics\u002Fskills：Anthropic 官方 Agent Skills 仓库解析","Anthropic 官方开源的 Agent Skills 标准仓库，127k stars，解析 SKILL.md 规范、17 个示例 skill 的设计模式，以及如何在 Claude Code \u002F Claude.ai \u002F API 中使用",[57,87,23,24],"Claude",2090,{"slug":90,"title":91,"description":92,"pub_date":9,"tags":93,"draft":15,"word_count":95},"karpathy-claude-code-guidelines","Karpathy 的 LLM 编码批评与 CLAUDE.md 最佳实践","基于 Andrej Karpathy 对 LLM 编程助手的观察，forrestchang 提炼出一个 CLAUDE.md 文件，4 条原则解决 AI 编码的典型失控问题：乱猜假设、过度设计、乱改代码、目标不清",[57,87,94,59],"Claude Code",2699,{"slug":97,"title":98,"description":99,"pub_date":9,"tags":100,"draft":15,"word_count":105},"typescript-advanced-patterns","TypeScript 高级模式：让类型系统为你工作","基础 TS 会了但类型总是 any？条件类型、映射类型、模板字面量类型、infer 关键字才是 TS 的真正威力。",[101,102,103,104],"typescript","类型系统","前端工程","高级模式",1419,{"slug":4,"title":5,"description":6,"pub_date":9,"tags":107,"draft":15,"word_count":108},[11,12,13,14],1524,{"slug":110,"title":111,"description":112,"pub_date":9,"tags":113,"draft":15,"word_count":118},"python-functional-programming","Python 函数式编程：map\u002Ffilter\u002Freduce 之外","Python 不是纯函数式语言，但 
functools、itertools、偏函数、闭包这些工具用好了能让代码简洁一个量级。",[114,115,116,117],"python","函数式","闭包","装饰器",1867,{"slug":120,"title":121,"description":122,"pub_date":9,"tags":123,"draft":15,"word_count":127},"python-oop-guide","Python 面向对象：__init__ 之外你需要知道的","Python OOP 不只是 class + __init__，魔术方法、描述符、元类才是真正的武器。",[114,124,125,126],"OOP","面向对象","魔术方法",1792,{"slug":129,"title":130,"description":131,"pub_date":9,"tags":132,"draft":15,"word_count":135},"python-data-structures","Python 内置数据结构深度解析","list、dict、set、tuple 不只是数据容器，搞懂它们的底层实现和时间复杂度，才能写出高性能 Python。",[114,133,12,134],"数据结构","算法",1517,{"slug":137,"title":138,"description":139,"pub_date":9,"tags":140,"draft":15,"word_count":143},"python-basics-quick-start","Python 快速上手：写给有编程基础的人","已经会其他语言，想快速掌握 Python 的语法特性和思维方式，这篇是捷径。",[114,141,142],"入门","基础",1607,{"slug":145,"title":146,"description":147,"pub_date":9,"tags":148,"draft":15,"word_count":152},"python-dataclass-pydantic","Python dataclass vs Pydantic：数据类选型指南","dataclass 是标准库的轻量选择，Pydantic v2 是带验证的重武器，什么时候用哪个，这篇说清楚。",[114,149,150,151],"dataclass","pydantic","数据验证",1323,{"slug":154,"title":155,"description":156,"pub_date":9,"tags":157,"draft":15,"word_count":161},"python-asyncio-practical","Python asyncio 实战：从回调地狱到协程优雅","asyncio 是 Python 异步编程的核心，搞懂 event loop、Task、gather 这些概念才能写出真正高效的异步代码。",[114,158,159,160],"asyncio","并发","网络编程",1258,{"slug":163,"title":164,"description":165,"pub_date":9,"tags":166,"draft":15,"word_count":170},"python-type-hints-guide","Python 类型注解完全指南：从入门到实践","Python 3.5+ 引入类型注解，配合 mypy\u002Fpyright 让 Python 也能享受静态类型检查的好处。",[114,167,168,169],"typescript-style","type-hints","工具链",1102,{"slug":172,"title":173,"description":174,"pub_date":175,"tags":176,"draft":15,"word_count":180},"pwa-install-update-button","PWA 踩坑：为什么安装按钮从来不出现","从 beforeinstallprompt 到 Service Worker waiting，把 PWA 
的安装与更新提示真正做对","2026-05-02",[177,178,179],"pwa","javascript","web",1683,{"slug":182,"title":183,"description":184,"pub_date":185,"tags":186,"draft":15,"word_count":187},"openclaw-vs-hermes-agent","OpenClaw vs Hermes Agent：两个本地优先 Agent 的设计差异","OpenClaw（Novita AI）和 Hermes Agent（Nous Research）都是本地运行的个人 AI Agent，但在记忆系统、技能学习、运行环境和模型生态上走了不同的路。深入对比两种架构的核心差异。","2026-05-01",[57,23,58],1679,{"slug":189,"title":190,"description":191,"pub_date":185,"tags":192,"draft":15,"word_count":197},"cpp-random-design-patterns","C++ 设计模式实战：RAII、观察者、工厂","用现代 C++（C++17\u002F20）实现三种高频设计模式：RAII 资源管理、观察者模式事件系统、工厂模式插件架构。每种模式给出问题场景、实现代码和真实工程案例。",[193,194,195,196],"cpp","设计模式","c++17","工程",2613,{"slug":199,"title":200,"description":201,"pub_date":185,"tags":202,"draft":15,"word_count":203},"data-structures-fundamentals","数据结构基础：从数组到红黑树","系统梳理常用数据结构的核心原理、时间复杂度和适用场景。数组、链表、栈、队列、哈希表、二叉树、堆、图，每种结构附实现要点和 C++ 代码片段。",[133,134,193,142],3004,{"slug":205,"title":206,"description":207,"pub_date":208,"tags":209,"draft":15,"word_count":210},"ai-agent-what-is","什么是 AI Agent？从 LLM 到自主执行","LLM 本身是无状态问答机，Agent 是什么让它‘动’起来的？本文深入解析 Agent 的四个核心能力、ReAct 框架、工具调用原理，以及主流框架横向对比。","2026-04-30",[57,23,58],2116,{"slug":212,"title":213,"description":214,"pub_date":208,"tags":215,"draft":15,"word_count":216},"ai-agent-memory","AI Agent 的记忆系统：从上下文窗口到长期记忆","深入拆解 AI Agent 的四种记忆类型、上下文窗口压缩策略、RAG 向量检索原理，以及三种典型失败模式和工程选型建议。",[57,23,66],2052,{"slug":218,"title":219,"description":220,"pub_date":208,"tags":221,"draft":15,"word_count":225},"network-proxy-vpn-guide","代理与翻墙技术原理：从 HTTP 代理到现代协议","深入解析代理与 VPN 的本质区别，梳理从 SOCKS5 到 Shadowsocks、V2Ray\u002FXray、Hysteria2 的协议演进，以及机场订阅的技术本质。",[222,223,224],"网络","代理","协议",2148,{"slug":227,"title":228,"description":229,"pub_date":208,"tags":230,"draft":15,"word_count":143},"algorithm-binary-search","二分查找：永远写不对？记住这个模板","彻底搞清楚二分查找的边界问题：闭区间和左闭右开两套模板、三道经典 LeetCode 题目完整 C++ 
实现，以及二分答案的进阶思路。",[134,231,232,193],"二分查找","leetcode",{"slug":234,"title":235,"description":236,"pub_date":208,"tags":237,"draft":15,"word_count":239},"algorithm-sliding-window","滑动窗口算法：从暴力到 O(n) 的思维跃迁","系统讲解滑动窗口算法的核心模板、适用题型，配合三道经典 LeetCode 题目的完整 C++ 实现，彻底理解双指针收缩思路。",[134,238,232,193],"滑动窗口",1943,{"slug":241,"title":242,"description":243,"pub_date":208,"tags":244,"draft":15,"word_count":247},"network-clash-config","Clash \u002F Mihomo 配置详解：规则、策略组与分流","深入解析 Clash\u002FMihomo 的核心配置结构，包括代理节点、策略组类型、规则优先级、DNS fake-ip 模式，以及一份实用的完整配置模板。",[222,245,223,246],"clash","配置",1292,{"slug":249,"title":250,"description":251,"pub_date":252,"tags":253,"draft":15,"word_count":257},"hid-hotplug","HID 设备热插拔检测：从 udev 到 node-hid","在 Linux 上用 node-hid + usb 库实现可靠的 USB HID 设备热插拔检测，踩坑记录","2026-04-28",[193,254,11,255,256],"hid","nodejs","electron",2039,{"slug":259,"title":260,"description":261,"pub_date":262,"tags":263,"draft":15,"word_count":266},"electron-ipc-types","Electron IPC 类型安全：从 any 到完全类型化","用 TypeScript 泛型封装 Electron IPC，彻底消灭 any，preload 契约集中管理","2026-04-25",[256,101,264,265],"ipc","vue",1446,{"slug":268,"title":269,"description":270,"pub_date":271,"tags":272,"draft":15,"word_count":275},"element-plus-popover-hide","手动关闭多个 el-popover（不用 v-model:visible）","通过 ref + Reflect.get 调用 hide() 方法手动关闭 Element Plus Popover，解释 Vue3 Proxy 导致无法直接调用实例方法的原因。","2024-10-25",[265,273,274],"element-plus","vue3",1321,{"slug":277,"title":278,"description":279,"pub_date":280,"tags":281,"draft":15,"word_count":285},"vite-vue3-ts-elementplus-pinia","用 Vite+（vp）从零搭建 Vue3 + TypeScript + Element Plus + Pinia + Vue Router","使用 Vite+ 统一工具链（vp）一条命令搭建 Vue3 全家桶，涵盖按需导入、Pinia store、路由配置，以及常见坑的解决方案。","2024-08-27",[265,282,101,273,283,284],"vite","pinia","vite-plus",1960,{"slug":287,"title":288,"description":289,"pub_date":290,"tags":291,"draft":15,"word_count":295},"cef-lnk2038-iterator-debug-level","CEF LNK2038：解决 _ITERATOR_DEBUG_LEVEL 不匹配错误","分析 CEF（Chromium Embedded Framework）集成时出现的 LNK2038 _ITERATOR_DEBUG_LEVEL 
链接错误，从根本原因到解决方案的完整指南。","2024-05-07",[193,292,293,294],"CEF","Visual Studio","链接错误",1509,{"slug":297,"title":298,"description":299,"pub_date":300,"tags":301,"draft":15,"word_count":305},"npm-electron-install-fix","彻底解决 npm 安装 Electron 失败的问题","分析 npm install electron 失败的根本原因（下载二进制超时\u002F被墙），通过国内镜像（npmmirror）彻底解决，并介绍多种备选方案和常见错误排查。","2024-03-01",[256,302,303,304],"npm","前端工具链","国内镜像",1494,{"slug":307,"title":308,"description":309,"pub_date":310,"tags":311,"draft":15,"word_count":313},"git-out-of-memory","解决 git 报错：Fatal: Out of memory, malloc failed","分析 git 大仓库操作时出现 Out of memory malloc failed 的根本原因，通过调整 pack.windowMemory、http.postBuffer 和 git repack 彻底解决。","2024-01-31",[73,11,312],"工具",2244,{"slug":315,"title":316,"description":317,"pub_date":318,"tags":319,"draft":15,"word_count":323},"vmware-tools-install","在 VMware 虚拟机中安装 open-vm-tools 完整指南","详解 VMware Tools 的作用、open-vm-tools 与官方 VMware Tools 的区别，以及在 Ubuntu 虚拟机中安装并生效的完整步骤和常见问题排查。","2023-11-21",[320,11,321,322],"VMware","Ubuntu","虚拟机",2523,{"slug":325,"title":326,"description":327,"pub_date":328,"tags":329,"draft":15,"word_count":334},"load-balancing-algorithms","负载均衡算法完全指南：从轮询到一致性哈希","系统梳理静态与动态负载均衡算法，涵盖轮询、随机、权重、IP Hash、一致性 Hash、最少连接、最快响应等，并对比 Nginx、Dubbo、Spring Cloud LoadBalancer 的实现差异。","2023-11-15",[330,331,332,333],"分布式","负载均衡","Nginx","微服务",1764,{"slug":336,"title":337,"description":338,"pub_date":339,"tags":340,"draft":15,"word_count":344},"win-cw2a-ca2w","ATL 字符串转换：CW2A 与 CA2W 完全指南","详解 ATL 宏 CW2A\u002FCA2W 在 Unicode 与 ANSI 之间的字符串转换用法、头文件依赖、USES_CONVERSION 宏的作用与常见陷阱。","2023-06-09",[193,341,342,343],"windows","ATL","字符串",1665,{"slug":346,"title":347,"description":348,"pub_date":339,"tags":349,"draft":15,"word_count":353},"csharp-sendmessage-cpp","C# 通过 SendMessage 向 C++ 窗口发送消息与字符串","使用 P\u002FInvoke 调用 user32.dll 的 SendMessage，从 C# 发送自定义 WM_USER 消息及字符串指针给 C++ 原生窗口，并在 C++ 
侧正确接收和转换。",[350,193,341,351,352],"C#","互操作","PInvoke",1554,{"slug":355,"title":356,"description":357,"pub_date":358,"tags":359,"draft":15,"word_count":361},"win-postmessage-vector","Windows PostMessage 跨线程传递 std::vector 指针","通过 PostMessage 在 Windows 消息队列中传递 std::vector 指针，使用 reinterpret_cast 将指针装入 LPARAM，并在接收方正确释放内存。","2023-05-26",[193,341,360],"WinAPI",1823,{"slug":363,"title":364,"description":365,"pub_date":358,"tags":366,"draft":15,"word_count":367},"exe-dll-single-package","将 EXE 和 DLL 打包成单一可执行文件","介绍两种将 exe 和依赖 dll 打包成单文件的方案：Enigma Virtual Box 和 WinRAR 自解压，适合发布 Windows 桌面程序时简化分发流程。",[341,193,312],1619,{"slug":369,"title":370,"description":371,"pub_date":358,"tags":372,"draft":15,"word_count":375},"cpp-random-mt19937","C++ 现代随机数生成：用 mt19937 彻底告别 rand()","深入讲解为什么 rand() 不够用，以及如何用 C++11 的 \u003Crandom> 库正确生成高质量随机数，涵盖 mt19937、各种分布和线程安全。",[193,373,374],"c++11","random",1549,{"slug":377,"title":378,"description":379,"pub_date":380,"tags":381,"draft":15,"word_count":383},"win-startup-registry","C++ 实现程序开机自启动：注册表方式详解","通过操作 Windows 注册表 Run 键实现程序开机自启动，包括 HKCU 与 HKLM 区别、完整封装代码、工作目录问题和 UAC 权限处理。","2022-12-26",[341,193,382],"registry",1201,{"slug":385,"title":386,"description":387,"pub_date":388,"tags":389,"draft":15,"word_count":391},"mfc-cstring-wparam","MFC 中 CString 与 WPARAM 之间的转换","详解 MFC 消息传递中 CString 无法直接强转为 WPARAM 的原因，以及两种正确的转换方案，并介绍结构体指针传递的正确姿势。","2022-11-25",[390,193,341],"mfc",1546,{"slug":393,"title":394,"description":395,"pub_date":396,"tags":397,"draft":15,"word_count":399},"duilib-static-build","正确编译 Duilib 静态库：避免 ATL 依赖和链接错误","详解如何用 DuiLib_Static.vcxproj 编译 Duilib 静态库，解决 VARIANT 未定义、Unicode 配置不匹配和 ATL 依赖等常见问题。","2022-08-24",[193,398,341,390],"duilib",2639,{"slug":401,"title":402,"description":403,"pub_date":404,"tags":405,"draft":15,"word_count":407},"mfc-dpi-adaptive","MFC 界面自适应不同分辨率","MFC 对话框程序实现控件和字体随分辨率自动缩放的完整方案，附 DPI Awareness 
配置说明","2022-08-17",[390,193,341,406],"dpi",1414,{"slug":409,"title":410,"description":411,"pub_date":412,"tags":413,"draft":15,"word_count":414},"mfc-drag-window","MFC 无标题栏窗口客户区拖动：三种方法对比","MFC 对话框去掉标题栏后如何实现拖动移动窗口，三种方案完整实现与适用场景分析","2022-08-16",[390,193,341],1633,{"slug":416,"title":417,"description":418,"pub_date":419,"tags":420,"draft":15,"word_count":422},"algorithm-number-complement","整数的补数：位运算掩码解法","LeetCode 476 题，用掩码 XOR 实现整数补数，附 C++\u002FPython\u002FJava 三种实现及补数与补码的区别","2021-03-08",[134,421,232],"位运算",1374,[]]