\input{configpres} \title{Kernel Tracing} \section{Overview} \maketitle \begin{frame} \frametitle{Overview} \tableofcontents \end{frame} \begin{frame}[fragile] \frametitle{Kernel Tracing: Overview} \begin{itemize} \item DebugFS interface \item Different tracers: function, function\_graph, wakeup, wakeup\_rt, \dots \item Custom trace events \item Graphical frontend (kernelshark) \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Kernel Tracing: Overview} \begin{figure}[h] \centering \includegraphics[width=10cm]{images/trace_overview.png} \end{figure} \end{frame} \begin{frame}[fragile] \frametitle{Kernel Tracing: DebugFS} \begin{verbatim} $ mount -t debugfs debugfs /sys/kernel/debug $ cd /sys/kernel/debug/tracing $ cat available_tracers blk function_graph mmiotrace wakeup_rt wakeup \end{verbatim} \end{frame} \section{Event tracing} \begin{frame}[fragile] \frametitle{Event tracing} \begin{itemize} \item Pre-defined Events in the kernel \item Event groups \item Each event comes with several options \item Filtering based on event options \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Event tracing} current\_tracer can be set to NOP \begin{verbatim} $ cd /sys/kernel/debug/tracing $ ls events/ [...] irq sched scsi signal skb [...] \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Event tracing} \begin{verbatim} $ cd /sys/kernel/debug/tracing $ ls -1 events/sched/ enable filter sched_kthread_stop sched_kthread_stop_ret sched_migrate_task sched_pi_setprio [...] \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Event tracing: Enable events} \begin{verbatim} $ cd /sys/kernel/debug/tracing # Enable ALL events of the group ''sched'' $ echo 1 > events/sched/enable \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Record a trace} After enabling the events you want to see, do: \begin{verbatim} $ cd /sys/kernel/debug/tracing # Start recording to the ringbuffer $ echo 1 > tracing_on # Stop recording to the ringbuffer $ echo 0 > tracing_on \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Analyze a trace} You can even do this while recording! \begin{verbatim} $ cd /sys/kernel/debug/tracing # Just print the current content of the ring buffer $ cat trace # or: do a consuming read on the ring buffer $ cat trace_pipe \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Trace event format and filters} Each trace event has a specific format and parameters. You can put a filter on those parameters for recording a trace: \begin{verbatim} $ cat events/sched/sched_switch/format [...] field:__u8 prev_comm[15]; field:pid_t prev_pid; field:int prev_prio; field:long prev_state; [...] $ echo 'next_comm == bash' > events/sched/sched_switch/filter $ echo 1 > events/sched/sched_switch/enable $ echo 1 > tracing_on $ cat trace \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Tracing on multicore} \begin{itemize} \item One ringbuffer per cpu \item trace contains ALL events \item the per\_cpu directory contains a trace for each cpu \item tracing\_cpumask can limit tracing to specific cores \end{itemize} \end{frame} \section{Tracers} \begin{frame} \frametitle{Tracers} \begin{itemize} \item Already have some special logic \item Latency hunting \item Callgraphs \item Kernel profiling \item ... \end{itemize} \end{frame} \begin{frame} \frametitle{Tracers} available\_tracers contains the tracers which are enabled in the kernel configuration. The tracer ist set by the current\_tracer file: \begin{itemize} \item function: Can turn all functions into trace events \item function\_graph: Similiar to function, but contains a call graph \item wakeup / wakeup\_rt: Measure the wakeup time for tasks / rt tasks \item irqsoff: useful for latency hunting. Identifies long sections with IRQs turned off \item ... \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Tracer: function} \begin{verbatim} # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | wnck-2022 [003] 5766.659915: skb_release wnck-2022 [003] 5766.659916: sock_wfree wnck-2022 [003] 5766.659917: unix_write_free wnck-2022 [003] 5766.659917: skb_releasee_skb wnck-2022 [003] 5766.659918: kfree <-skb \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Tracer: function\_graph} \begin{verbatim} $ echo function_graph > current_tracer $ echo 1 > tracing_on $ sleep 1 $ echo 0 > tracing_on $ less trace # tracer: function_graph # CPU DURATION FUNCTION CALLS # | | | | | | | 1) | enqueue_entity() { 1) | update_curr() { 1) 0.336 us | task_of(); 1) 1.524 us | } 1) 0.420 us | place_entity(); \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{function\_graph: Set a trigger function} You can set a trigger function for the function\_graph tracer if you just want to record specific functions and their childs: \begin{verbatim} echo do_IRQ > set_graph_function # Additionals triggers can be set with echo another_function >> set_graph_function \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Tracer: function / latency\_format} \tiny \begin{verbatim} $ echo 1 > options/latency_format # tracer: function # # function latency trace v1.1.5 on 3.9.4-x1-00124-g0bfd8ff # -------------------------------------------------------------------- # latency: 0 us, #204955/25306195, CPU#0 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / terminol-11964 1.... 11639243us : ep_read_events_proc <-ep_scan_ready_list.isra.8 \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Tracer: wakeup\_rt} \tiny \begin{verbatim} # tracer: wakeup_rt # # wakeup_rt latency trace v1.1.5 on 2.6.35-22-generic # -------------------------------------------------------------------- # latency: 11 us, #20/20, CPU#3 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) ... # cmd pid time | caller # \ / \ | / ls-4579 3d.... 1us+: 4579:120:R + [003] 12: 0:R migration/3 ls-4579 3d.... 4us : wake_up_process <-cpu_stop_queue_work ls-4579 3d.... 4us : check_preempt_wakeup <-try_to_wake_up ls-4579 3d.... 4us : resched_task <-check_preempt_wakeup ls-4579 3dN... 5us : task_woken_rt <-try_to_wake_up ls-4579 3dN... 5us : _raw_spin_unlock_irqrestore <-try_to_wake_up \end{verbatim} \end{frame} \section{Kernel function profiler} \begin{frame}[fragile] \frametitle{Kernel function profiler} \begin{verbatim} $ echo 1 > function_profile_enabled $ echo 1 > tracing_on # then do something... $ echo 0 > tracing_on $ less trace_stat/function0 Function Hit Time Avg -------- --- ---- --- __schedule 7064 1958976725 us 277318.3 us schedule 6961 1958965845 us 281420.1 us [...] \end{verbatim} \end{frame} \section{trace\_marker} \begin{frame}[fragile] \frametitle{Custom application tracepoints: ''simple method''} \begin{verbatim} $ echo 1 > tracing_on $ echo "MARK" > trace_marker $ echo 0 > tracing_on $ less trace ... bash-4328 [003] 5603.687935: get_slab bash-4328 [003] 5603.687935: _cond_re bash-4328 [003] 5603.687936: _cond_re bash-4328 [003] 5603.687939: 0: MARK bash-4328 [003] 5603.687939: kfree <- ... \end{verbatim} \end{frame} \section{trace\_printk} \begin{frame}[fragile] \frametitle{trace\_printk()} \begin{itemize} \item trace\_printk() can be used to write messages to the tracing ring buffer \item Usage is similar to printk() \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Control a trace from your kernel code} \begin{verbatim} void my_kernel_function(void) { tracing_on(); do_some_stuff_i_wanna_trace(); tracing_off(); } \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Tracing related kernel parameters} \begin{verbatim} ftrace= \end{verbatim} Set and start specified tracer as early as possible. \begin{verbatim} ftrace_dump_on_oops[=orig_cpu] \end{verbatim} Dump the tracing ring buffer if an Oops occurs. Using orig\_cpu it will only dump the buffer of the CPU which triggered the Oops. \begin{verbatim} ftrace_filter= \end{verbatim} Only trace specific functions. \begin{verbatim} ftrace_notrace= \end{verbatim} Don't trace specific functions. \begin{verbatim} trace_event= \end{verbatim} Just enable trace events (comma separated list) \end{frame} \begin{frame}[fragile] \frametitle{Dump trace buffer} The trace buffer can also be dumped by:\\ \textbf{SysRQ-z}\\ or \begin{verbatim} echo z > /proc/sysrq-trigger \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Trace instances} You can have separate trace instances with their own buffers and events: \begin{verbatim} $ cd /sys/kernel/debug/tracing $ mkdir instances/my_inst1 $ cd instances/my_inst1 $ echo 1 > events/sched/enable $ cat trace [...] \end{verbatim} \end{frame} \section{trace-cmd} \begin{frame} \frametitle{trace-cmd} \begin{alertblock}{What is trace-cmd?} trace-cmd is a commandline utility for controlling and analysing kernel traces. \end{alertblock} \end{frame} \begin{frame}[fragile] \frametitle{trace-cmd: Installation} \begin{verbatim} $ git clone git://git.kernel.org/pub/scm/linux/\ kernel/git/rostedt/trace-cmd.git $ cd trace-cmd # On Debian system $ apt-get install libgtk2.0-dev libxml2-dev $ make && make gui $ make install && make install_gui \end{verbatim} \end{frame} \begin{frame}[fragile] \small \frametitle{trace-cmd: Usage} \begin{verbatim} $ trace-cmd [...] record - record a trace into a trace.dat file start - start tracing without recording into a file extract - extract a trace from the kernel stop - stop the kernel from recording trace data reset - disable all kernel tracing / clear trace buffers report - read out the trace stored in a trace.dat file split - parse a trace.dat file into smaller file(s) listen - listen on a network socket for trace clients list - list the available events, plugins or options restore - restore a crashed record stack - output, enable or disable kernel stack tracing \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{trace-cmd: Usage} \begin{verbatim} # Recording a sched_switch trace event $ trace-cmd record -e sched_switch Hit Ctrl^C to stop recording offset=ae000 [...] CPU: 0 entries: 0 overrun: 0 commit overrun: 0 \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{trace-cmd: Usage} \begin{verbatim} # Analysing a trace $ trace-cmd report version = 6 cpus=1 trace-cmd-29057 [000] 6901.652365: wakeup: 29057:120:1 ==+ 29057:120:1 [000] trace-cmd-29057 [000] 6901.652388: wakeup: 29057:120:0 ==+ 323:120:0 [000] trace-cmd-29057 [000] 6901.652393: context_switch: 29057:120:0 ==> 323:120:0 [000] kworker/0:1-323 [000] 6901.652397: wakeup: 323:120:0 ==+ 28355:120:0 [000] \end{verbatim} \end{frame} \begin{frame} \frametitle{trace-cmd: Usage} \begin{itemize} \item trace-cmd record generates a file called trace.dat. This can be overridden by the -o option \item trace-cmd report uses the -i option for specifying an input file \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{trace-cmd: Record specific events} \begin{verbatim} trace-cmd record -e sched # or a specific scheduler event trace-cmd record -e sched_wait_task # List availabe events and options trace-cmd report --events \end{verbatim} \begin{verbatim} \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{trace-cmd: Filters} Based on the options from ''trace-cmd report --events'': \begin{verbatim} trace-cmd record -e context_switch \ -f 'next_pid == 323' \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{trace-cmd: Tracing a specific command} Enable tracing while a specific command is being executed: \begin{verbatim} $ trace-cmd record -p function ls \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{trace-cmd: Recording traces via network} On the host: \begin{verbatim} trace-cmd listen -p 1234 -o trace_remote \end{verbatim} On the target: \begin{verbatim} trace-cmd record -p sched_switch \ -N 192.168.0.182:1234 /bin/ls \end{verbatim} \end{frame} \section{Kernelshark} \begin{frame}[fragile] \frametitle{Kernelshark: A graphical front-end} \begin{verbatim} $ kernelshark # or $ kernelshark -i mytrace.dat \end{verbatim} \end{frame} \begin{frame} \frametitle{Kernelshark} \begin{figure}[h] \centering \includegraphics[width=10cm]{images/kernelshark.png} \end{figure} \end{frame} \begin{frame} \frametitle{Kernelshark} \begin{figure}[h] \centering \includegraphics[width=10cm]{images/kernelshark_zoom.png} \end{figure} \end{frame} \section{Tracecompass} \begin{frame}[fragile] \frametitle{Tracecompass} \begin{itemize} \item Uses the C ommon T race F ormat \item perf can convert traces to CTF \item perf uses libbabeltrace for the convertion \item A recent version of libbabeltrace is needed \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Build perf for your Target} \begin{verbatim} cd kernel_source/tools/perf make ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- # Now copy the perf executable to the target \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Setup the tools on your host: libbabeltrace} \begin{verbatim} git clone https://github.com/efficios/babeltrace.git cd babeltrace # This is a known working commit. # Recent commits seem to be broken for perf-ctf git checkout 9aac8f729c091ddddb688038f5d417a7b1ce4259 ./bootstrap ./configure make sudo make install \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Setup the tools on your host: perf} \begin{verbatim} cd kernel_source/tools/perf make LIBBABELTRACE=1 LIBBABELTRACE_DIR=/usr/local \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Record a trace on the target} \begin{verbatim} ./perf record -e 'sched:*' -a # (stop with Ctrl-C) # Copy perf.data to the host \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{On the host: Convert perf.data to the proper format} \begin{verbatim} LD_LIBRARY_PATH=/usr/local/lib ./perf data convert --to-ctf ./ctf-data # Now the trace data should be available in ctf-data/ # You can import this directory with Eclipse / Tracecompass \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Tracecompass} \begin{figure}[h] \centering \includegraphics[width=10cm]{images/tracecompass.png} \end{figure} \end{frame} \section{Task naming} \begin{frame}[fragile] \frametitle{Matching PIDs and process names of your trace(s)} \begin{verbatim} $ cat saved_cmdlines [...] 5112 bash 5223 ARTHUR_DENT 5546 kworker/0:2 8465 kworker/0:0 [...] \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Where's my thread in the trace?!} To ''find'' a specific thread in your trace, you can use the following function to set the thread's name: \begin{verbatim} #define _GNU_SOURCE #include pthread_setname_np(pthread_t thread, const char *name); \end{verbatim} Available since glibc version 2.12 \end{frame} \begin{frame}[fragile] \frametitle{Thread names: Example} \begin{verbatim} /* pthread_example */ pthread_t test_thread; [...] pthread_create(&test_thread, NULL, my_test_thread, NULL); [...] pthread_setname_np(test_thread, "ARTHUR_DENT"); \end{verbatim} Check with the ps command: \begin{verbatim} $ ps H -C pthread_example -o 'pid tid cmd comm' PID TID CMD COMMAND 4515 4515 ./pthread_example pthread_example 4515 4516 ./pthread_example ARTHUR_DENT \end{verbatim} \end{frame} \begin{frame} \frametitle{Thread name in kernelshark} \begin{figure}[h] \centering \includegraphics[width=10cm]{images/pthread_kernelshark.png} \end{figure} \end{frame} \section{kprobes} \begin{frame}[fragile] \frametitle{Dynamic kernel tracepoints: KPROBES} \begin{itemize} \item Similar to Tracepoints \item Can be added / removed dynamically \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Dynamic kernel tracepoints: KPROBES} \begin{verbatim} $ echo 'p:my_k_event do_IRQ' > kprobe_events $ echo 1 > events/kprobes/my_k_event/enabled $ echo 1 > tracing_on $ cat trace -0 [000] d... 545.173709: my_k_event: (do_IRQ+0x0/0xc0) -0 [000] d... 545.331051: my_k_event: (do_IRQ+0x0/0xc0) -0 [000] d... 545.331490: my_k_event: (do_IRQ+0x0/0xc0) -0 [000] d... 545.490730: my_k_event: (do_IRQ+0x0/0xc0) \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Dynamic kernel tracepoints: KPROBES for custom modules} Let's assume we want to have a tracepoint for the function hello\_init in the module hello.ko \begin{verbatim} # Note: >> will append a new event $ echo 'p:my_mod_event hello:hello_init' \ >> kprobe_events $ echo 1 > events/kprobes/my_mod_event/enable $ insmod hello.ko $ cat trace insmod-9586 [000] d... 13278.003468: my_mod_event: (0xf878d080) \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Dynamic kernel tracepoints: Question / Exercise} What happens, if we add the following event. What's different? \begin{verbatim} # Note >> will append a new event $ echo 'r:my_exercise_event hello:hello_init' \ >> kprobe_events \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{KPROBES statistics} \begin{verbatim} $ cat kprobe_profile my_mod_event_ret 2 0 my_mod_event 2 0 \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{perf: kprobes} The ''perf'' tool can also be used to create kprobes. \begin{verbatim} $ sudo perf probe --add do_IRQ Added new event: probe:do_IRQ (on do_IRQ) \end{verbatim} \end{frame} \section{uprobes} \begin{frame}[fragile] \frametitle{Dynamic Userspace Tracepoints: uprobes} \begin{itemize} \item Similar to kprobes \item For userspace applications \item A uprobe event is set on a specific offset in a userland process \item Powerful method to correlate your kernel and userland events! \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Dynamic Userspace Tracepoints: uprobes} \begin{verbatim} $ gcc -Wall -g -o pthread_example \ pthread_example.c -lpthread $ objdump -F -D -S pthread_example | less \end{verbatim} \begin{verbatim} 08048594 (File Offset: 0x594): [...] void *my_test_thread(void *x_void_ptr) [...] for (i = 0; i < 10; i++) { 80485a1: c7 45 f4 00 00 00 00 movl $0x0,-0xc(%ebp) 80485a8: eb 1c jmp 80485c6 (File Offset: 0x5c6) printf("The answer is 42!\n"); 80485aa: c7 04 24 50 87 04 08 movl $0x8048750,(%esp) \end{verbatim} So, the file offset for the printf call is 0x5aa ! \end{frame} \begin{frame}[fragile] \frametitle{Dynamic Userspace Tracepoints: uprobes II} \begin{verbatim} echo \ 'p:my_ev /home/devel/pthread/pthread_example:0x5aa' \ > /sys/kernel/debug/tracing/uprobe_events \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{Dynamic Userspace Tracepoints: uprobes III} \begin{verbatim} $ cd /sys/kernel/debug/tracing/ $ echo 1 > events/uprobes/my_ev/enable $ echo 1 > tracing on $ /home/devel/pthread_example/pthread_example $ echo 0 > tracing_on $ less trace # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | ARTHUR_DENT-5223 [000] d... 5653.154822: my_ev: (0x80485aa) ARTHUR_DENT-5223 [000] d... 5654.155723: my_ev: (0x80485aa) \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{uprobes: statistics} \begin{verbatim} $ cat uprobe_profile /home/devel/pthread/pthread_example my_ev 10 \end{verbatim} \end{frame} \begin{frame}[fragile] \frametitle{perf: uprobes} The ''perf'' tool can also be used to create uprobes. \begin{verbatim} $ sudo perf probe -x ./pthread_example my_ev=pthread_example.c:35 Added new event: probe_pthread_example:my_ev (on @pthread_example.c.c:35 ...) \end{verbatim} \end{frame} \section{sources} \begin{frame} \frametitle{sources} \begin{thebibliography}{1} \bibitem{trace1} http://lwn.net/Articles/365835/ \bibitem{trace2} http://lwn.net/Articles/366796/ \end{thebibliography} \end{frame} \input{tailpres}