% on the following slides, include icon in the left sidebar
\def\lximg{/usr/share/lx/icons/fueller.png}
\input{configpres}
\title{Linux process management / Scheduling / Daemons}
\maketitle
% stop displaying 'fueller.png' on the following slides
\def\lximg{none}

\subsection{Process Management}

\begin{frame}
\frametitle{Binary formats}
A program file includes meta information which describes the format of the
executable. Linux uses the ELF format (Executable and Linkable Format).
\end{frame}

\begin{frame}
\frametitle{Process Memory Layout}
\begin{itemize}
\item text segment
\item initialized data segment
\item uninitialized data segment
\item stack
\item heap
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Process creation}
From the operating system's point of view, starting a process involves two
basic steps:
\begin{itemize}
\item A new process is created using the fork() system call
\item The execve() system call loads a new program into the process memory
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Parent / Child}
\begin{itemize}
\item init is the first process started (PID == 1), so
\item init is the ancestor of all processes on the system
\item A parent waits for its child's termination using the wait() system call
\item If the parent terminates before the child, the child is ``adopted''
by PID 1
\item A child which terminates before its parent has called wait() becomes
a ``zombie''
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Task states}
Each task is in one of the following states:
\begin{itemize}
\item Interruptible sleep (waiting for an event) (S)
\item Uninterruptible sleep (waiting for I/O) (D)
\item Running (R)
\item Stopped (T)
\item Defunct / ``zombie'' (Z)
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Task states}
``ps aux'' also shows the task state:
\begin{verbatim}
USER     PID   STAT  COMMAND
postfix  5034  [...] S    [...] pickup -l
jan      5303  [...] SN+  [...] man 8 init
jan      5313  [...] SN+  [...] pager -s
jan      5390  [...] SNl  [...] evince
jan      5416  [...] SNs  [...] bash
\end{verbatim}
The first character in the STAT column shows the process state; the
remaining characters are modifiers.
\end{frame}

\subsection{Scheduling}

\begin{frame}
\frametitle{The Linux Scheduler}
Design goals:
\begin{itemize}
\item Responsiveness
\item Fairness
\item Throughput
\item O(log n) scheduling complexity
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Scheduling classes}
Normal processes:
\begin{itemize}
\item SCHED\_OTHER
\item SCHED\_BATCH (Linux specific; since 2.6.16)
\item SCHED\_IDLE (Linux specific; since 2.6.23)
\end{itemize}
Realtime:
\begin{itemize}
\item SCHED\_FIFO
\item SCHED\_RR
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The nice value}
\begin{itemize}
\item Non-realtime processes don't have a static priority!
\item Their priority is calculated dynamically
\item The calculation can be influenced by the ``nice value''
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The nice value}
\begin{itemize}
\item The range of possible nice values is -20 .. +19
\item It tells the system how ``nice'' the process should behave towards
other processes: -20 means ``high priority'', +19 means ``low priority''
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{The nice value}
The nice value can be changed using the ``nice'' and ``renice'' commands:
\begin{verbatim}
Usage: nice [OPTION] [COMMAND [ARG]...]
  -n, --adjustment=N

renice [-n] prio [-p|--pid] pid [.. pid]
renice [-n] prio -g|--pgrp pgrp [.. pgrp]
renice [-n] prio -u|--user user [.. user]
\end{verbatim}
\end{frame}

\begin{frame}
\frametitle{SCHED\_IDLE and SCHED\_BATCH}
\begin{itemize}
\item SCHED\_BATCH: The scheduler always assumes the process to be CPU
intensive and therefore applies a penalty when calculating the dynamic
priority.
\item SCHED\_IDLE: For very low priority processes. Even the nice value is
ignored. The resulting priority will be \textbf{below} that of
SCHED\_OTHER and SCHED\_BATCH processes with nice +19!
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Realtime scheduling classes}
\begin{itemize}
\item SCHED\_FIFO: Static priority
\item SCHED\_RR: Priority based, Round-Robin scheduling per priority level
\end{itemize}
Both realtime scheduling classes accept priorities from 1 to 99, where 99
is the highest priority.
\end{frame}

\begin{frame}[fragile]
\frametitle{Setting the Scheduling class}
The scheduling class can be set using the chrt command:
\begin{verbatim}
Set policy:
  chrt [opts] <prio> <pid>
  chrt [opts] <prio> <command> [<arg> ...]

Get policy:
  chrt [opts] {<pid> | <command> [<arg> ...]}

Scheduling policies:
  -b | --batch    set policy to SCHED_BATCH
  -f | --fifo     set policy to SCHED_FIFO
  -i | --idle     set policy to SCHED_IDLE
  -o | --other    set policy to SCHED_OTHER
  -r | --rr       set policy to SCHED_RR (default)
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Setting scheduling class and priority}
\begin{lstlisting}
#include <sched.h>

struct sched_param param;
int ret;

param.sched_priority = 80;
ret = sched_setscheduler(0, SCHED_FIFO, &param);
[...]
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]
\frametitle{Resource limits}
\begin{lstlisting}
#include <sys/resource.h>

int setrlimit(int resource, const struct rlimit *rlim);
\end{lstlisting}
\end{frame}
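\begin{frame}[fragile]
\frametitle{Resource limits: example}
A minimal sketch (not from the original slides): lower the limit on open
file descriptors for the calling process and its children. The values 64
and 256 are arbitrary example values:
\begin{lstlisting}
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    /* example values: soft limit 64, hard limit 256 open files */
    struct rlimit rlim = { .rlim_cur = 64, .rlim_max = 256 };

    if (setrlimit(RLIMIT_NOFILE, &rlim) == -1) {
        perror("setrlimit");
        return 1;
    }
    /* open() now fails with EMFILE once 64 descriptors are in use */
    return 0;
}
\end{lstlisting}
\end{frame}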
\subsection{Daemons}

\begin{frame}
\frametitle{Daemons}
\begin{alertblock}{What is a Daemon?}
A daemon runs in the background and is not attached to a terminal.
Daemons are used for specific tasks, such as a web server, a print
server, ...
\end{alertblock}
\end{frame}

\begin{frame}
\frametitle{How a Daemon gets created}
\begin{itemize}
\item Like any other process, a daemon is created using fork()
\item After forking, the parent exits, which causes the child to be
``adopted'' by PID 1
\item Then the child calls setsid() to create a new session for that
process
\item Afterwards several administrative tasks are done, like changing the
working directory and so on (see the sketch on the next slide)
\end{itemize}
\end{frame}
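\begin{frame}[fragile]
\frametitle{How a Daemon gets created: sketch}
A minimal sketch of the steps above; real daemons typically do more, e.g.
fork() a second time and close inherited file descriptors:
\begin{lstlisting}
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>

static void daemonize(void)
{
    pid_t pid = fork();

    if (pid < 0)
        exit(EXIT_FAILURE);   /* fork failed */
    if (pid > 0)
        exit(EXIT_SUCCESS);   /* parent exits; child is adopted by PID 1 */

    if (setsid() < 0)         /* new session, detach from the terminal */
        exit(EXIT_FAILURE);

    /* administrative tasks */
    if (chdir("/") < 0)       /* don't keep any mount point busy */
        exit(EXIT_FAILURE);
    umask(0);                 /* reset the file creation mask */
}

int main(void)
{
    daemonize();
    /* the daemon's actual work would start here */
    return 0;
}
\end{lstlisting}
\end{frame}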
\subsection{Multicore specific scheduling}

\begin{frame}[fragile]
\frametitle{Scheduling on Multicore Systems}
\begin{itemize}
\item CPU affinity
\item Kernel parameters:
\begin{itemize}
\item maxcpus=
\item isolcpus=
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Setting the CPU Affinity}
The CPU affinity can be set using the taskset command:
\begin{verbatim}
taskset [options] mask command [arg]...
taskset [options] -p [mask] pid
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{CPU affinity}
\begin{lstlisting}
#define _GNU_SOURCE
#include <sched.h>

cpu_set_t set;

CPU_ZERO(&set);
CPU_SET(0, &set);
CPU_SET(1, &set);
[...]
sched_setaffinity(pid, sizeof(set), &set);
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]
\frametitle{SMP and interrupt routing}
\begin{verbatim}
$ ls /proc/irq/
0  1  10  11  12  13  14  15  17  18  19 ...
default_smp_affinity
$ cat /proc/irq/default_smp_affinity
3
\end{verbatim}
Set the default IRQ affinity to CPU0:
\begin{verbatim}
echo 1 > /proc/irq/default_smp_affinity
\end{verbatim}
Set the affinity for IRQ19 to CPU1:
\begin{verbatim}
echo 2 > /proc/irq/19/smp_affinity
\end{verbatim}
\end{frame}

\subsection{Control Groups: cgroups}

\begin{frame}
\frametitle{What are cgroups}
Control groups (cgroups) are a mechanism for partitioning and aggregating
tasks into hierarchical groups. Each group has several tunable options,
such as its share of CPU time, a specific set of CPUs, ...
\end{frame}

\begin{frame}[fragile]
\frametitle{Setting up cgroups}
Mount the cpu/cpuacct and cpuset controller hierarchies:
\begin{verbatim}
mount -t cgroup -o cpu,cpuacct none /sys/fs/cgroup/cpu,cpuacct
mount -t cgroup -o cpuset none /sys/fs/cgroup/cpuset
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Setting up cgroups}
Create the groups group\_nice and group\_cpu0only:
\begin{verbatim}
mkdir /sys/fs/cgroup/cpu,cpuacct/group_nice
mkdir /sys/fs/cgroup/cpuset/group_cpu0only
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Setting up cgroups}
Limit the CPU time (relative to other tasks) for the group\_nice group:
\begin{verbatim}
echo 100 > /sys/fs/cgroup/cpu,cpuacct/group_nice/cpu.shares
\end{verbatim}
Set the allowed CPUs and memory nodes for the group\_cpu0only group:
\begin{verbatim}
echo 0 > /sys/fs/cgroup/cpuset/group_cpu0only/cpuset.cpus
echo 0 > /sys/fs/cgroup/cpuset/group_cpu0only/cpuset.mems
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Testing cgroups}
Open two shells. In shell 1, add the current task to the group\_cpu0only
and group\_nice groups and burn the CPU:
\begin{verbatim}
echo $$ > /sys/fs/cgroup/cpuset/group_cpu0only/tasks
echo $$ > /sys/fs/cgroup/cpu,cpuacct/group_nice/tasks
while true; do echo -n; done
\end{verbatim}
In shell 2, add the current task to the group\_cpu0only group only and
burn the CPU:
\begin{verbatim}
echo $$ > /sys/fs/cgroup/cpuset/group_cpu0only/tasks
while true; do echo -n; done
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Testing cgroups}
Now check the CPU usage with top: both tasks compete for CPU0, but the
task in group\_nice gets only a fraction of the CPU time:
\begin{verbatim}
$ top
[...]
 PID USER  PR NI  VIRT  RES  SHR S %CPU %MEM  TIME+   COMMAND
 871 root  20  0 22588 3480 3036 R 90.7  0.2 0:56.10  bash
 872 root  20  0 22588 3484 3044 R  9.0  0.2 0:05.45  bash
[...]
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{CPU shares vs. quota}
What is the difference between cpu.shares and cpu.cfs\_quota\_us?
cpu.shares is a relative weight: it only takes effect while groups compete
for the CPU. cpu.cfs\_quota\_us is an absolute limit: the group is
throttled once it has used up its quota within each period, even if the
CPU would otherwise be idle.
\begin{verbatim}
echo 10000 > /sys/fs/cgroup/cpu,cpuacct/group_nice/cpu.cfs_quota_us
\end{verbatim}
\end{frame}
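\begin{frame}[fragile]
\frametitle{CPU shares vs. quota: example}
A possible illustration (not from the original slides), assuming the
default period of 100000 us: this quota caps group\_nice at 10\% of one
CPU, even when no other task is runnable:
\begin{verbatim}
echo 100000 > /sys/fs/cgroup/cpu,cpuacct/group_nice/cpu.cfs_period_us
echo 10000  > /sys/fs/cgroup/cpu,cpuacct/group_nice/cpu.cfs_quota_us
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Who is using cgroups?}
systemd uses cgroups! Boot with systemd and check sysfs:
\begin{verbatim}
$ ls /sys/fs/cgroup/systemd
... system.slice ... user.slice
$ ls /sys/fs/cgroup/systemd/system.slice
... networking.service
... systemd-user-sessions.service ...
\end{verbatim}
\end{frame}

\subsection{sources}

\begin{frame}
\begin{thebibliography}{1}
\bibitem{kerisk10}
The Linux Programming Interface (Michael Kerrisk), No Starch Press,
ISBN 978-1-59327-220-3
\end{thebibliography}
\end{frame}

\input{tailpres}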