strace工具用来追踪进程的系统调用。
这篇博客描述过strace的使用,简单介绍过其原理:Linux 调试之strace
接下来给出一个小的mini strace工具demo,以及详细介绍strace工具的原理。
strace工具通过 ptrace 系统调用实现,ptrace系统调用本质上是一个用于读取和修改进程地址空间中的值的工具,不能用于直接跟踪系统调用。只有从正确的位置提取出所需的信息,才能跟踪进程并就进行的系统调用得出结论。
/* Request POSIX.1-2001 declarations; must precede every #include. */
#define _POSIX_C_SOURCE 200112L

/* C standard library */
#include <errno.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

/* POSIX */
#include <unistd.h>
#include <sys/user.h>
#include <sys/wait.h>

/* Linux */
#include <syscall.h>
#include <sys/ptrace.h>

/* Fallback for C library headers that do not provide PTRACE_O_EXITKILL. */
#ifndef PTRACE_O_EXITKILL
/* eventless options */
#define PTRACE_O_EXITKILL (1 << 20)
#endif
/*
 * Report a fatal error and terminate.
 *
 * Writes "strace: ", then the printf-style message, then a newline to
 * stderr and exits with EXIT_FAILURE. The first argument must be a string
 * literal: it is concatenated onto the "strace: " prefix at compile time.
 */
#define FATAL(...)                                  \
    do {                                            \
        fprintf(stderr, "strace: " __VA_ARGS__);    \
        putc('\n', stderr);                         \
        exit(EXIT_FAILURE);                         \
    } while (0)
/*
 * Block until the tracee `pid` changes state.
 *
 * Returns normally only while the tracee is still alive (i.e. it stopped).
 * If the tracee has terminated, `tail` is written to stderr to finish the
 * output line in progress and this process exits: with the tracee's exit
 * code, or with 128 + signal number when a signal killed it (the usual
 * shell convention).
 */
static void
wait_for_tracee(pid_t pid, const char *tail)
{
    int status;

    if (waitpid(pid, &status, 0) == -1)
        FATAL("%s", strerror(errno));
    if (WIFEXITED(status)) {
        fputs(tail, stderr);
        exit(WEXITSTATUS(status));
    }
    if (WIFSIGNALED(status)) {
        fputs(tail, stderr);
        exit(128 + WTERMSIG(status));
    }
}

/*
 * Minimal strace for x86-64 Linux.
 *
 * argv[1..] is the command to run under trace. For every system call the
 * tracee makes, one line is written to stderr:
 *
 *     <nr>(<rdi>, <rsi>, <rdx>, <r10>, <r8>, <r9>) = <rax>
 *
 * The process exits when the tracee does.
 *
 * NOTE(review): stops are not classified (PTRACE_O_TRACESYSGOOD is not
 * used) and PTRACE_SYSCALL is always resumed with signal 0, so a signal
 * delivered to the tracee is reported as if it were a syscall stop and is
 * not forwarded. This matches the original demo's behaviour.
 */
int
main(int argc, char **argv)
{
    if (argc <= 1)
        FATAL("too few arguments: %d", argc);

    pid_t pid = fork();
    switch (pid) {
    case -1: /* error */
        FATAL("%s", strerror(errno));
    case 0:  /* child */
        if (ptrace(PTRACE_TRACEME, 0, 0, 0) == -1)
            FATAL("%s", strerror(errno));
        /* Because we're now a tracee, execvp will block until the parent
         * attaches and allows us to continue. */
        execvp(argv[1], argv + 1);
        FATAL("%s", strerror(errno));
    default: /* parent */
        break;
    }

    /* Sync with execvp. If the exec failed the child has already exited,
     * and we exit with its status instead of tracing a dead process. */
    wait_for_tracee(pid, "");

    /* Best effort, result deliberately ignored: kernels without
     * PTRACE_O_EXITKILL reject the option, and tracing still works. */
    ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_EXITKILL);

    for (;;) {
        /* Resume the tracee until it enters its next system call. */
        if (ptrace(PTRACE_SYSCALL, pid, 0, 0) == -1)
            FATAL("%s", strerror(errno));
        /* Blocks until the tracee goes from running to stopped. */
        wait_for_tracee(pid, "");

        /* Read the registers as they were at syscall entry. */
        struct user_regs_struct regs;
        if (ptrace(PTRACE_GETREGS, pid, 0, &regs) == -1)
            FATAL("%s", strerror(errno));

        /* orig_rax carries the system call number. */
        long nr = regs.orig_rax;

        /* Print the call with its six argument registers. */
        fprintf(stderr, "%ld(%ld, %ld, %ld, %ld, %ld, %ld)",
                nr,
                (long)regs.rdi, (long)regs.rsi, (long)regs.rdx,
                (long)regs.r10, (long)regs.r8, (long)regs.r9);

        /* Let the system call run and stop again when it returns. */
        if (ptrace(PTRACE_SYSCALL, pid, 0, 0) == -1)
            FATAL("%s", strerror(errno));
        /* _exit(2) and friends never return: the tracee terminates here
         * and the pending line is closed with " = ?". */
        wait_for_tracee(pid, " = ?\n");

        /* Read the registers again to obtain the result. */
        if (ptrace(PTRACE_GETREGS, pid, 0, &regs) == -1) {
            int err = errno;    /* fputs below may clobber errno */
            fputs(" = ?\n", stderr);
            if (err == ESRCH)
                exit(regs.rdi); /* tracee vanished; rdi was the exit code argument */
            FATAL("%s", strerror(err));
        }

        /* rax carries the system call's return value. */
        fprintf(stderr, " = %ld\n", (long)regs.rax);
    }
}
PTRACE_SYSCALL是ptrace系统调用的一个请求选项,用于重新启动被停止的被跟踪进程,并在特定条件下再次停止。
该请求会让被跟踪进程继续运行,并在下一次进入或退出系统调用时停止。也就是说,使用PTRACE_SYSCALL后,被跟踪进程将在下一次系统调用的入口或出口处停止执行,以供跟踪进程进行检查或操作。
这个选项会使被跟踪进程看起来好像是接收到了一个SIGTRAP信号而停止执行。跟踪进程可以在被跟踪进程停止时进行进一步的检查或操作。
系统调用追踪是基于PTRACE_SYSCALL的。如果用该选项激活ptrace,那么内核将开始执行进程,直至调用一个系统调用。在被追踪进程停止之前,跟踪者进程调用wait系列的函数一直是处于阻塞状态的;在被追踪进程停止后,wait通知跟踪者进程,跟踪者接下来可以使用一系列的ptrace选项,来分析被跟踪进程的地址空间,以收集有关系统调用的信息。在完成系统调用之后,被跟踪的进程第二次暂停,使得跟踪者进程可以检查调用是否成功,即检查系统调用的返回值。
wait系统调用:
NAME
wait, waitpid, waitid - wait for process to change state
#include <sys/types.h>
#include <sys/wait.h>
pid_t wait(int *status);
pid_t waitpid(pid_t pid, int *status, int options);
int waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options);
所有这些系统调用都用于等待调用进程的子进程状态变化,并获取有关状态变化的子进程的信息。状态变化可以是:子进程终止、子进程被信号停止或子进程被信号恢复。
我们这里只考虑子进程被信号停止的情况:如果子进程已经改变了状态,由运行态改变为停止态,这些调用将立即返回。否则,它们将阻塞。
PTRACE_GETREGS是ptrace系统调用的一个请求选项,用于将被跟踪进程的通用寄存器(general-purpose registers)的值复制到跟踪进程中的指定地址。
PTRACE_GETREGS:通过使用ptrace系统调用的PTRACE_GETREGS请求选项,可以将被跟踪进程的通用寄存器的值复制到跟踪进程中的指定地址。调用时在data参数中传入struct user_regs_struct变量的地址,例如 ptrace(PTRACE_GETREGS, pid, 0, &regs)。
这里使用PTRACE_GETREGS选项来获取调用系统调用时的系统调用号和其相应的参数,系统调用退出时获取其返回值。
打印调用系统调用时的系统调用号和其相应的参数:
/* Excerpt from the demo's main loop: regs was filled by PTRACE_GETREGS
 * at the syscall-entry stop; orig_rax holds the system call number. */
long syscall = regs.orig_rax;
/* Print the number followed by the six argument registers. */
fprintf(stderr, "%ld(%ld, %ld, %ld, %ld, %ld, %ld)",
syscall,
(long)regs.rdi, (long)regs.rsi, (long)regs.rdx,
(long)regs.r10, (long)regs.r8, (long)regs.r9);
打印系统调用退出时其返回值:
/* Excerpt from the demo's main loop: regs was re-read at the
 * syscall-exit stop; rax holds the system call's return value. */
fprintf(stderr, " = %ld\n", (long)regs.rax);
/*
 * Abridged excerpt of the ptrace(2) system call entry point.
 * It obtains the task_struct for `pid` via ptrace_get_task_struct() and
 * passes the request on to arch_ptrace(). The excerpt shows no error
 * handling and no return statement.
 */
SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
unsigned long, data)
{
struct task_struct *child;
child = ptrace_get_task_struct(pid);
arch_ptrace(child, request, addr, data);
}
/*
 * Abridged excerpt of the architecture-specific ptrace handler.
 * The "......" lines mark code omitted by the article; the one call shown
 * forwards the request to ptrace_request() with the same arguments.
 */
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
......
ptrace_request(child, request, addr, data);
......
}
/*
 * Abridged excerpt of the request dispatcher. Only the PTRACE_SYSCALL
 * case is shown: it is handled by ptrace_resume(); `addr` is not used on
 * this path.
 */
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
switch (request) {
case PTRACE_SYSCALL:
return ptrace_resume(child, request, data);
}
}
// arch/x86/include/asm/thread_info.h
/*
* thread information flags
* - these are process state flags that various assembly files
* may need to access
* - pending work-to-be-done flags are in LSW
* - other flags in MSW
* Warning: layout of LSW is hardcoded in entry.S
*/
/* A bit number, not a mask: it is handed to set_bit() as the bit index
 * through set_tsk_thread_flag() / set_ti_thread_flag(). */
#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
/*
 * Abridged excerpt: resume a stopped tracee.
 * For PTRACE_SYSCALL the TIF_SYSCALL_TRACE thread flag is set on the
 * child; for any other request it is cleared. The child is then woken.
 * The excerpt does not use `data` and shows no return statement.
 */
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
if (request == PTRACE_SYSCALL)
set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
else
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
// wake up the child process
wake_up_state(child, __TASK_TRACED);
}
set_tsk_thread_flag() 函数用于设置子进程的标志位TIF_SYSCALL_TRACE,以控制任务的行为。
TIF_SYSCALL_TRACE标志用于跟踪子进程的系统调用状态和行为。
/*
 * Abridged excerpt: only the flags word of struct thread_info is shown.
 * The TIF_xxxx constants are bit numbers within this word.
 */
struct thread_info {
	__u32 flags;	/* thread information flags (TIF_xxxx bits) */
};
/*
* flag set/clear/test wrappers
* - pass TIF_xxxx constants to these functions
*/
/* Set bit number `flag` in ti->flags using set_bit(). */
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
set_bit(flag, (unsigned long *)&ti->flags);
}
/* Yield the thread_info of `task`: its ->stack pointer cast to
 * struct thread_info *. */
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
/* set thread flags in other task's structures
* - see asm/thread_info.h for TIF_xxxx flags available
*
* Looks up tsk's thread_info with task_thread_info() and sets bit `flag`
* there via set_ti_thread_flag().
*/
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
set_ti_thread_flag(task_thread_info(tsk), flag);
}
将子进程的struct thread_info成员flags中的TIF_SYSCALL_TRACE标志位置位,这样每当子进程在下一次进入或退出系统调用时都会停止,然后给父进程发送信号。
在使用PTRACE_SYSCALL时,内核会通过被跟踪进程的task_struct找到其struct thread_info,并在成员flags中设置TIF_SYSCALL_TRACE标志。
这只是设置被跟踪进程的thread_info实例的flags字段对应的标志位。
在该标志被设置后,内核在恢复被跟踪进程的正常工作前,只需要用wake_up_state唤醒被跟踪进程即可。
TIF_SYSCALL_TRACE标志的效果如何?因为系统调用是高度硬件相关的,该标志的效果需要到汇编语言源代码entry.S中才能看到。如果设置了该标志,在系统调用执行之前和完成之后都会调用相应的C函数(syscall_trace_enter和syscall_trace_leave)。比如x86_64:
// arch/x86/kernel/entry_64.S
/*
* Register setup:
* rax system call number
* rdi arg0
* rcx return address for syscall/sysret, C arg3
* rsi arg1
* rdx arg2
* r10 arg3 (--> moved to rcx for C)
* r8 arg4
* r9 arg5
* r11 eflags for syscall/sysret, temporary for C
* r12-r15,rbp,rbx saved by C code, not touched.
*
* Interrupts are off on entry.
* Only called from user space.
*
* XXX if we had a free scratch register we could save the RSP into the stack frame
* and report it properly in ps. Unfortunately we haven't.
*
* When user can change the frames always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
/* Abridged excerpt: "......" marks code omitted by the article. Only the
 * two calls into the C tracing hooks are shown. */
ENTRY(system_call)
......
/* Do syscall tracing */
tracesys:
/* C hook syscall_trace_enter() (arch/x86/kernel/ptrace.c) */
call syscall_trace_enter
/*
* Syscall return path ending with IRET.
* Has correct top of stack, but partial stack frame.
*/
int_check_syscall_exit_work:
/* C hook syscall_trace_leave() (arch/x86/kernel/ptrace.c) */
call syscall_trace_leave
END(system_call)
// arch/x86/kernel/ptrace.c
/*
* We must return the syscall number to actually look up in the table.
* This can be -1L to skip running any syscall at all.
*
* Syscall-entry hook. In order, it restores TF for single-stepping, runs
* the seccomp check, reports to the tracer when TIF_SYSCALL_TRACE (or
* syscall emulation) is active, fires the sys_enter tracepoint and logs
* the call to the audit subsystem.
*/
long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
user_exit();
/*
* If we stepped into a sysenter/syscall insn, it trapped in
* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
* If user-mode had set TF itself, then it's still clear from
* do_debug() and we need to set it again to restore the user
* state. If we entered on the slow path, TF was already set.
*/
if (test_thread_flag(TIF_SINGLESTEP))
regs->flags |= X86_EFLAGS_TF;
/* do the secure computing check first */
if (secure_computing(regs->orig_ax)) {
/* seccomp failures shouldn't expose any additional code. */
ret = -1L;
goto out;
}
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
/*
 * The PTRACE_SYSCALL entry report: with TIF_SYSCALL_TRACE set (or ret
 * already non-zero), tracehook_report_syscall_entry() is called. A
 * non-zero result from it makes the syscall be skipped.
 * NOTE(review): presumably this call is where the tracee stops and the
 * tracer is notified - confirm against tracehook.h.
 */
if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
tracehook_report_syscall_entry(regs))
ret = -1L;
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);
/* Audit record: 32-bit tasks pass bx/cx/dx/si, 64-bit di/si/dx/r10. */
if (IS_IA32)
audit_syscall_entry(AUDIT_ARCH_I386,
regs->orig_ax,
regs->bx, regs->cx,
regs->dx, regs->si);
#ifdef CONFIG_X86_64
else
audit_syscall_entry(AUDIT_ARCH_X86_64,
regs->orig_ax,
regs->di, regs->si,
regs->dx, regs->r10);
#endif
out:
/* GNU "?:" shorthand: return ret if it is non-zero, else orig_ax. */
return ret ?: regs->orig_ax;
}
/*
 * Syscall-exit hook: logs the exit to the audit subsystem, fires the
 * sys_exit tracepoint with the return value in regs->ax, and reports to
 * the tracer when single-stepping or TIF_SYSCALL_TRACE is active.
 */
void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
/*
* We may come here right after calling schedule_user()
* or do_notify_resume(), in which case we can be in RCU
* user mode.
*/
user_exit();
audit_syscall_exit(regs);
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_exit(regs, regs->ax);
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
* TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
* We already reported this syscall instruction in
* syscall_trace_enter().
*/
step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
!test_thread_flag(TIF_SYSCALL_EMU);
/* The PTRACE_SYSCALL exit report, taken when TIF_SYSCALL_TRACE is set. */
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
user_enter();
}
该标志的效果在所有支持的平台上都是相同的。在被监控进程执行一个系统调用前后,进程状态设置为TASK_TRACED(即因ptrace而停止,与上文wake_up_state使用的__TASK_TRACED对应),而且会通过SIGCHLD信号通知跟踪者。接下来,跟踪者从wait函数返回,然后下发ptrace请求开始调试被跟踪进程,所需的信息可以从寄存器或特定内存区的内容提取。
PTRACE_GETREGS 是 ptrace 系统调用的一个选项,用于从子进程中获取所有通用寄存器(general-purpose registers)的值,并将其传递到用户空间进行进一步处理。
/*
 * Abridged excerpt of the ptrace(2) system call entry point.
 * It obtains the task_struct for `pid` via ptrace_get_task_struct() and
 * passes the request on to arch_ptrace(). The excerpt shows no error
 * handling and no return statement.
 */
SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
unsigned long, data)
{
struct task_struct *child;
child = ptrace_get_task_struct(pid);
arch_ptrace(child, request, addr, data);
}
/*
 * Abridged excerpt showing only the PTRACE_GETREGS case.
 * `data` is treated as a user-space pointer; sizeof(struct
 * user_regs_struct) bytes of the child's REGSET_GENERAL register set are
 * copied out to it. `addr` is not used on this path.
 */
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
unsigned long __user *datap = (unsigned long __user *)data;
switch (request) {
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
task_user_regset_view(current),
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
}
}
/*
* Segment register layout in coredumps.
*
* Also the block transferred by PTRACE_GETREGS / PTRACE_SETREGS, which
* copy sizeof(struct user_regs_struct) bytes.
* NOTE(review): the userspace demo accesses the same registers as
* orig_rax, rdi, rsi, ... - presumably the C library's <sys/user.h> uses
* r-prefixed member names for this layout; confirm.
*/
struct user_regs_struct {
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
unsigned long fs_base;
unsigned long gs_base;
unsigned long ds;
unsigned long es;
unsigned long fs;
unsigned long gs;
};
对应的 ptrace 选项PTRACE_SETREGS:
/*
 * Abridged excerpt showing only the PTRACE_SETREGS case.
 * `data` is treated as a user-space pointer; sizeof(struct
 * user_regs_struct) bytes are copied from it into the child's
 * REGSET_GENERAL register set. `addr` is not used on this path.
 */
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
unsigned long __user *datap = (unsigned long __user *)data;
switch (request) {
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
task_user_regset_view(current),
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
}
}
PTRACE_SETREGS 是 ptrace 系统调用的一个选项,用于设置子进程的所有通用寄存器(general-purpose registers)的值。调试器可以使用 PTRACE_SETREGS 选项将用户空间中的通用寄存器的值设置到子进程中,以修改子进程的寄存器状态。