接上篇~
7.14 修改进程中的变量
root@j9 ~# cat stap_set_var.c -n
1 #include <stdio.h>
2
3 typedef struct policy {
4 int id;
5 } policy_t;
6
7 int main(int argc, char *argv[])
8 {
9 policy_t policy;
10 policy_t *p = &policy;
11 policy_t **pp;
12
13 p->id = 111;
14
15 printf("before stap set, p->id: %d\n", p->id);
16
17 pp = &p;
18
19 printf("after stap set, p->id: %d, (*pp)->id: %d\n", p->id, (*pp)->id);
20
21 return 0;
22 }
root@j9 ~# gcc -Wall -g -o ./stap_set_var ./stap_set_var.c
root@j9 ~# cat stap_set_var.stp
probe process("./stap_set_var").statement("main@./stap_set_var.c:17")
{
$p->id = 222;
printf("$p$: %s\n", $p$)
}
root@j9 ~# stap -g stap_set_var.stp -c ./stap_set_var
before stap set, p->id: 111
after stap set, p->id: 222, (*pp)->id: 222
$p$: {.id=222}
root@j9 ~#
可以看出在第 17 行用 SystemTap 修改后的值在第 19 行就生效了。 需要注意的是 stap 要加-g 参数在 guru 模式下才能修改变量的值。
7.15 跟踪进程执行流程
thread_indent(n): 补充空格 ppfunc(): 当前探测点所在的函数 在 call 探测点调用 thread_indent(4)补充 4 个空格,在 return 探测点调用 thread_indent(-4)回退 4 个空格,效果如下:
#cat trace_nginx.stp
probe process("/home/admin/tengine/bin/nginx").function("*@src/http/ngx_http_*").call
{
printf("%s -> %s\n", thread_indent(4), ppfunc());
}
probe process("/home/admin/tengine/bin/nginx").function("*@src/http/ngx_http_*").return
{
printf("%s <- %s\n", thread_indent(-4), ppfunc());
}
#stap trace_nginx.stp
0 nginx(11368): -> ngx_http_init_connection
21 nginx(11368): <- ngx_http_init_connection
0 nginx(11368): -> ngx_http_wait_request_handler
30 nginx(11368): -> ngx_http_create_request
41 nginx(11368): <- ngx_http_create_request
55 nginx(11368): -> ngx_http_process_request_line
72 nginx(11368): -> ngx_http_read_request_header
78 nginx(11368): <- ngx_http_read_request_header
91 nginx(11368): -> ngx_http_parse_request_line
99 nginx(11368): <- ngx_http_parse_request_line
109 nginx(11368): -> ngx_http_process_request_uri
115 nginx(11368): <- ngx_http_process_request_uri
127 nginx(11368): -> ngx_http_process_request_headers
138 nginx(11368): -> ngx_http_read_request_header
143 nginx(11368): <- ngx_http_read_request_header
155 nginx(11368): -> ngx_http_parse_header_line
163 nginx(11368): <- ngx_http_parse_header_line
178 nginx(11368): -> ngx_http_process_user_agent
185 nginx(11368): <- ngx_http_process_user_agent
192 nginx(11368): -> ngx_http_parse_header_line
198 nginx(11368): <- ngx_http_parse_header_line
208 nginx(11368): -> ngx_http_process_host
222 nginx(11368): -> ngx_http_validate_host
229 nginx(11368): <- ngx_http_validate_host
239 nginx(11368): -> ngx_http_set_virtual_server
252 nginx(11368): -> ngx_http_find_virtual_server
259 nginx(11368): <- ngx_http_find_virtual_server
263 nginx(11368): <- ngx_http_set_virtual_server
266 nginx(11368): <- ngx_http_process_host
274 nginx(11368): -> ngx_http_parse_header_line
279 nginx(11368): <- ngx_http_parse_header_line
287 nginx(11368): -> ngx_http_parse_header_line
292 nginx(11368): <- ngx_http_parse_header_line
.....
2072 nginx(11368): <- ngx_http_finalize_request
2076 nginx(11368): <- ngx_http_core_content_phase
2079 nginx(11368): <- ngx_http_core_run_phases
2083 nginx(11368): <- ngx_http_handler
2093 nginx(11368): -> ngx_http_run_posted_requests
2100 nginx(11368): <- ngx_http_run_posted_requests
2103 nginx(11368): <- ngx_http_process_request
2107 nginx(11368): <- ngx_http_process_request_headers
2111 nginx(11368): <- ngx_http_process_request_line
2114 nginx(11368): <- ngx_http_wait_request_handler
0 nginx(11368): -> ngx_http_keepalive_handler
26 nginx(11368): -> ngx_http_close_connection
79 nginx(11368): <- ngx_http_close_connection
83 nginx(11368): <- ngx_http_keepalive_handler
7.16 查看代码执行路径
pp(): 输出当前被激活的探测点
#cat ngx_http_process_request.stp
probe process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:*") {
printf("%s\n", pp())
}
#stap ngx_http_process_request.stp
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2762")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2768")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2771")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2773")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2774")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2783")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2835")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2840")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2841")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2842")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2843")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2846")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2847")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2848")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2850")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2852")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2853")
^C
可以看出该函数哪些行被执行了。
7.17 巧用正则匹配过滤
在排查问题时,可以利用一些正则匹配来获取自己想要的信息,比如下面是只收集*.j9.com 的堆栈:
#
cat debug_tengine_5xx.stp
probe process("/home/admin/tengine/bin/t-coresystem-tengine-cdn").function("ngx_http_finalize_request").call {
rc = $rc
if (rc < 0) {
host = "(null)"
if ($r->headers_in->server->len != 0) {
host = user_string_n($r->headers_in->server->data, $r->headers_in->server->len)
} else {
cscf = &@cast($r->srv_conf, "ngx_http_core_srv_conf_t")[@var("ngx_http_core_module@src/http/ngx_http_core_module.c")->ctx_index]
if (cscf->server_name->len != 0) {
host = user_string_n(cscf->server_name->data, cscf->server_name->len)
}
}
if (host =~ ".*\.j9\.com") {
printf("rc: %d, host: %s\n", rc, host)
print_ubacktrace()
}
}
}
#stap debug_tengine_5xx.stp
WARNING: Missing unwind data for module, rerun with 'stap -d /lib64/libc-2.12.so'
rc: -4, host: www.j9.com
0x49af2e : ngx_http_finalize_request+0xe/0x480 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x492eab : ngx_http_core_content_phase+0x2b/0x130 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x48e74d : ngx_http_core_run_phases+0x3d/0x50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x514c3c : ngx_http_lua_socket_tcp_read+0x44c/0x590 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x513150 : ngx_http_lua_socket_tcp_handler+0x30/0x50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x475b96 : ngx_event_process_posted+0x36/0x40 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x47d4d8 : ngx_worker_process_cycle+0x138/0x260 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x47a38a : ngx_spawn_process+0x1ca/0x5e0 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x47c73c : ngx_start_worker_processes+0x7c/0x100 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x47db5f : ngx_master_process_cycle+0x3af/0x9b0 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x45a740 : main+0xa90/0xb50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x3623e1ecdd [/lib64/libc-2.12.so+0x1ecdd/0x38d000]
rc: -4, host: cdn.j9.com
0x49af2e : ngx_http_finalize_request+0xe/0x480 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x492eab : ngx_http_core_content_phase+0x2b/0x130 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x48e74d : ngx_http_core_run_phases+0x3d/0x50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x514c3c : ngx_http_lua_socket_tcp_read+0x44c/0x590 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x513150 : ngx_http_lua_socket_tcp_handler+0x30/0x50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x475b96 : ngx_event_process_posted+0x36/0x40 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x47d4d8 : ngx_worker_process_cycle+0x138/0x260 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x47a38a : ngx_spawn_process+0x1ca/0x5e0 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x47c73c : ngx_start_worker_processes+0x7c/0x100 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x47db5f : ngx_master_process_cycle+0x3af/0x9b0 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x45a740 : main+0xa90/0xb50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
0x3623e1ecdd [/lib64/libc-2.12.so+0x1ecdd/0x38d000]
7.18 关联数组用法
SystemTap 的关联数组必须是全局变量,需要用 global 进行声明,其索引可以支持多达 9 项索引域,各域间以逗号隔开。支持 =, ++ 与 +=操作,其默认的初始值为 0。 例如:
root@j9 ~# cat stap_array.stp
global reads
probe vfs.read {
reads[execname(), pid()] ++
}
probe timer.s(3) {
foreach ([execname, pid] in reads) {
printf("%s(%d) : %d \n", execname, pid, reads[execname, pid])
}
print("============================\n")
delete reads
}
root@j9 ~# stap stap_array.stp
stapio(18716) : 16
rsyslogd(770) : 1
docker(743) : 3
IFSWatch(5594) : 30
QThread(5594) : 6
AliYunDunUpdate(1057) : 4
sshd(15118) : 1
sshd(15191) : 1
============================
stapio(18716) : 16
sshd(15191) : 3
docker(743) : 3
IFSWatch(5594) : 30
sshd(15118) : 2
QThread(5594) : 12
AliYunDunUpdate(1057) : 8
============================
^C
root@j9 ~/systemtap#
也可以用+、-进行排序:
root@j9 ~# cat stap_array.stp
global reads
probe vfs.read {
reads[execname(), pid()] ++
}
probe timer.s(3) {
foreach ([execname, pid+] in reads) {
printf("%s(%d) : %d \n", execname, pid, reads[execname, pid])
}
print("============================\n")
delete reads
}
root@j9 ~# stap stap_array.stp
docker(743) : 3
rsyslogd(770) : 1
AliYunDunUpdate(1057) : 12
IFSWatch(5594) : 30
QThread(5594) : 12
sshd(15118) : 2
sshd(15191) : 2
stapio(19021) : 16
============================
docker(743) : 3
AliYunDunUpdate(1057) : 12
IFSWatch(5594) : 30
QThread(5594) : 6
sshd(15118) : 1
sshd(15191) : 19
stapio(19021) : 16
============================
^C
root@j9 ~#
7.19 调试内存泄漏以及内存重复释放
probe begin {
printf("=============begin============\n")
}
//记录内存分配和释放的计数关联数组
global g_mem_ref_tbl
//记录内存分配和释放的调用堆栈关联数组
global g_mem_bt_tbl
probe process("/lib/x86_64-linux-gnu/libc.so.6").function("__libc_malloc").return, process("/lib/x86_64-linux-gnu/libc.so.6").function("__libc_calloc").return {
if (target() == pid()) {
if (g_mem_ref_tbl[$return] == 0) {
g_mem_ref_tbl[$return]++
g_mem_bt_tbl[$return] = sprint_ubacktrace()
}
}
}
probe process("/lib/x86_64-linux-gnu/libc.so.6").function("__libc_free").call {
if (target() == pid()) {
g_mem_ref_tbl[$mem]--
if (g_mem_ref_tbl[$mem] == 0) {
if ($mem != 0) {
//记录上次释放的调用堆栈
g_mem_bt_tbl[$mem] = sprint_ubacktrace()
}
} else if (g_mem_ref_tbl[$mem] < 0 && $mem != 0) {
//如果调用 free 已经失衡,那就出现了重复释放内存的问题,这里输出当前调用堆栈,以及这个地址上次释放的调用堆栈
printf("MMMMMMMMMMMMMMMMMMMMMMMMMMMM\n")
printf("g_mem_ref_tbl[%p]: %d\n", $mem, g_mem_ref_tbl[$mem])
print_ubacktrace()
printf("last free backtrace:\n%s\n", g_mem_bt_tbl[$mem])
printf("WWWWWWWWWWWWWWWWWWWWWWWWWWWW\n")
}
}
}
probe end {
//最后输出产生泄漏的内存是在哪里分配的
printf("=============end============\n")
foreach(mem in g_mem_ref_tbl) {
if (g_mem_ref_tbl[mem] > 0) {
printf("%s\n", g_mem_bt_tbl[mem])
}
}
}
详细请看: http://blog.csdn.net/wangzuxi/article/details/44901285
7.20 嵌入 C 代码
在进程 fork 出子进程时打印出进程 id 和进程名:
root@jusse ~/systemtap# cat copy_process.stp
function getprocname:string(task:long)
%{
struct task_struct *task = (struct task_struct *)STAP_ARG_task;
snprintf(STAP_RETVALUE, MAXSTRINGLEN, "pid: %d, comm: %s", task->pid, task->comm);
%}
function getprocid:long(task:long)
%{
struct task_struct *task = (struct task_struct *)STAP_ARG_task;
STAP_RETURN(task->pid);
%}
probe kernel.function("copy_process").return
{
printf("copy_process return: %p, pid: %d, getprocname: %s, getprocid: %d\n", $return, $return->pid, getprocname($return), getprocid($return));
}
root@jusse ~/systemtap# stap -g copy_process.stp
copy_process return: 0xffff880039f61800, pid: 12212, getprocname: pid: 12212, comm: bash, getprocid: 12212
copy_process return: 0xffff880039f61800, pid: 12212, getprocname: pid: 12212, comm: bash, getprocid: 12212
copy_process return: 0xffff880039f63000, pid: 12213, getprocname: pid: 12213, comm: cc_epoll, getprocid: 12213
copy_process return: 0xffff880039f63000, pid: 12213, getprocname: pid: 12213, comm: cc_epoll, getprocid: 12213
copy_process return: 0xffff8800081a9800, pid: 12214, getprocname: pid: 12214, comm: cc_epoll, getprocid: 12214
copy_process return: 0xffff8800081a9800, pid: 12214, getprocname: pid: 12214, comm: cc_epoll, getprocid: 12214
copy_process return: 0xffff8800004d8000, pid: 12215, getprocname: pid: 12215, comm: cc_epoll, getprocid: 12215
copy_process return: 0xffff8800004d8000, pid: 12215, getprocname: pid: 12215, comm: cc_epoll, getprocid: 12215
copy_process return: 0xffff880000564800, pid: 12216, getprocname: pid: 12216, comm: cc_epoll, getprocid: 12216
copy_process return: 0xffff880000564800, pid: 12216, getprocname: pid: 12216, comm: cc_epoll, getprocid: 12216
copy_process return: 0xffff880000566000, pid: 12217, getprocname: pid: 12217, comm: cc_epoll, getprocid: 12217
copy_process return: 0xffff880000566000, pid: 12217, getprocname: pid: 12217, comm: cc_epoll, getprocid: 12217
有三个需要注意的地方: 1 )、SystemTap 脚本里面嵌入 C 语言代码要在每个大括号前加%前缀,是%{…… %} 而不是%{ …… }%; 2 )、获取脚本函数参数要用 STAP_ARG_前缀; 3 )、一般 long 等返回值用 STAP_RETURN,而 string 类型返回值要用 snprintf、strncat 等方式把字符串复制到 STAP_RETVALUE 里面。
7.21 调试内核模块
这小节就不细讲了,这篇博客 (http://blog.chinaunix.net/uid-14528823-id-4726046.html) 写得很详细,这里只 copy 两个关键点过来记录一下: 要调试自己的内核模块,需要注意的有两个关键点: 1)、使用 SystemTap 调试内核模块,探测点的编写格式示例为: module("ext3").function("ext3_*") 2)、需要将自己的模块 cp 到 /lib/modules/uname -r/extra 目录中,否则找不到符号,如果 /lib/modules/uname -r/目录下没有 extra 这个目录,自己 mkdir 一下就可以。
7.22 一些错误提示及解决办法
错误提示 1:
ERROR: MAXACTION exceeded near keyword at debug_connection.stp:86:9
ERROR: MAXACTION exceeded near operator '->' at debug_connection.stp:84:30
解决办法: 加上 stap 参数:-DMAXACTION=102400,如果还报这种类型的错误,只需把 102400 调成更大的值即可。
错误提示 2:
WARNING: Number of errors: 0, skipped probes: 82
解决办法: 加上-DMAXSKIPPED=102400 和-DSTP_NO_OVERLOAD 参数
还有一些可以去掉限制的宏:
MAXSTRINGLEN:这个宏会影响 sprintf 的 buffer 大小,默认为 512 字节。 MAXTRYLOCK:对全局变量进行 try lock 操作的次数,超过则次数还拿不到锁则放弃和跳过该探测点,默认值为 1000.全局变量多的时候可以把这个宏开大一点。
(完)