Add privsep article.
This commit is contained in:
parent
74d005b9f1
commit
6c0407d613
713
privsep.org
Normal file
713
privsep.org
Normal file
@ -0,0 +1,713 @@
|
||||
#+TITLE: Privilege drop, privilege separation, and restricted-service operating mode in OpenBSD
|
||||
#+DATE: 2023-01-30
|
||||
* Prologue
|
||||
My main focus in OpenBSD is privilege separated network daemons
|
||||
running in restricted-service operation mode. I gave talks at [[https://www.bsdcan.org][BSDCan]]
|
||||
and [[https://fosdem.org][FOSDEM]] in the [[file:index.org::*External Writings & Presentations][past]] about how I used these techniques to write
|
||||
[[https://man.openbsd.org/slaacd.8][slaacd(8)]] and [[https://man.openbsd.org/unwind.8][unwind(8)]]. While I do not think of myself as a one-trick
|
||||
pony, I have written some more: [[https://man.openbsd.org/slowcgi.8][slowcgi(8)]], [[https://man.openbsd.org/rad.8][rad(8)]], [[https://man.openbsd.org/dhcpleased.8][dhcpleased(8)]], and
|
||||
[[https://github.com/fobser/gelatod][gelatod(8)]]. I also wrote the first version of what later turned into
|
||||
[[https://man.openbsd.org/resolvd.8][resolvd(8)]].
|
||||
|
||||
At one point I claimed that it would take me about a week to
|
||||
transmogrify one daemon into a new one.
|
||||
|
||||
* Why
|
||||
Privilege drop, privilege separation, and restricted-service operating
|
||||
mode are exploit mitigations. When[fn:: not if!] an attacker finds a
|
||||
bug we try to stop them from causing damage. The mitigations we are
|
||||
talking about here are aimed at attackers that achieved arbitrary
|
||||
code execution. Due to other [[https://www.openbsd.org/innovations.html][mitigations]] that is quite difficult to
|
||||
pull off. These are the last line of defence. We try to take away as many
|
||||
resources as possible from the attacker and try to crash the program
|
||||
as quickly as possible if an attacker touches something they are not
|
||||
supposed to.
|
||||
|
||||
* Privilege drop
|
||||
Privilege drop is probably the weakest mitigation discussed in this
|
||||
article. It is a very old technique, but still important for
|
||||
set-user-ID root binaries.
|
||||
|
||||
Theo de Raadt [[http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sbin/ping/ping.c.diff?r1=1.6&r2=1.7][refactored]] [[https://man.openbsd.org/ping.8][ping(8)]] over 26 years ago to open a raw
|
||||
socket early and then drop root privileges. This prevents a local user
|
||||
from elevating their privileges when finding a bug in ping(8):
|
||||
#+begin_src diff
|
||||
@@ -191,6 +191,14 @@
|
||||
char rspace[3 + 4 * NROUTES + 1]; /* record route space */
|
||||
#endif
|
||||
|
||||
+ if (!(proto = getprotobyname("icmp")))
|
||||
+ errx(1, "unknown protocol icmp");
|
||||
+ if ((s = socket(AF_INET, SOCK_RAW, proto->p_proto)) < 0)
|
||||
+ err(1, "socket");
|
||||
+
|
||||
+ /* revoke privs */
|
||||
+ setuid(getuid());
|
||||
+
|
||||
preload = 0;
|
||||
datap = &outpack[8 + sizeof(struct timeval)];
|
||||
while ((ch = getopt(argc, argv, "DI:LRS:c:dfh:i:l:np:qrs:T:t:vw:")) != EOF)
|
||||
@@ -235,6 +243,8 @@
|
||||
loop = 0;
|
||||
break;
|
||||
case 'l':
|
||||
+ if (getuid() != 0)
|
||||
+ errx(1, "must be root to specify preload");
|
||||
preload = strtol(optarg, NULL, 0);
|
||||
if (preload < 0)
|
||||
errx(1, "bad preload value: %s", optarg);
|
||||
@@ -323,12 +333,6 @@
|
||||
*datap++ = i;
|
||||
|
||||
ident = getpid() & 0xFFFF;
|
||||
-
|
||||
- if (!(proto = getprotobyname("icmp")))
|
||||
- errx(1, "unknown protocol icmp");
|
||||
- if ((s = socket(AF_INET, SOCK_RAW, proto->p_proto)) < 0)
|
||||
- err(1, "socket");
|
||||
- hold = 1;
|
||||
|
||||
if (options & F_SADDR) {
|
||||
if (IN_MULTICAST(ntohl(to->sin_addr.s_addr)))
|
||||
#+end_src
|
||||
|
||||
20 years later we realized that we would not drop privileges when
|
||||
ping(8) is invoked as root. We would just "drop" from root to root. We
|
||||
can protect ourselves from a malicious ping target by [[http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sbin/ping/ping.c.diff?r1=1.214&r2=1.215][dropping]] to a
|
||||
dedicated user:
|
||||
#+begin_src diff
|
||||
@@ -272,8 +275,12 @@
|
||||
|
||||
/* revoke privs */
|
||||
uid = getuid();
|
||||
- if (setresuid(uid, uid, uid) == -1)
|
||||
- err(1, "setresuid");
|
||||
+ if ((pw = getpwnam(PING_USER)) == NULL)
|
||||
+ errx(1, "no %s user", PING_USER);
|
||||
+ if (setgroups(1, &pw->pw_gid) ||
|
||||
+ setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) ||
|
||||
+ setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid))
|
||||
+ err(1, "unable to revoke privs");
|
||||
|
||||
preload = 0;
|
||||
datap = &outpack[ECHOLEN + ECHOTMLEN];
|
||||
#+end_src
|
||||
|
||||
ping(8) needs a raw socket to be able to send ICMP echo request
|
||||
packets. This is an operation that only root is allowed to do[fn::This
|
||||
prevents normal users from sending arbitrary IP packets, for example
|
||||
with spoofed IP addresses or from privileged (<1024) source
|
||||
ports.]. Once that socket is open though, ping(8) no longer needs to do
|
||||
any other privileged operation. It can hold on to the socket for later
|
||||
use and drop root privileges.
|
||||
|
||||
Another use for privilege drop is in daemons to restrict file-system
|
||||
access by [[https://man.openbsd.org/chroot.2][chroot(2)]]'ing to =/var/empty=. The daemon needs root
|
||||
privileges to call chroot(2), but afterwards it can run without
|
||||
elevated permissions. To the process it looks like there is only the
|
||||
=/= directory where it does not have any permissions.
|
||||
|
||||
The standard pattern can be seen in [[https://github.com/openbsd/src/blob/master/usr.sbin/rad/frontend.c#L200][frontend.c]] of [[https://man.openbsd.org/rad.8][rad(8)]]:
|
||||
#+begin_src C
|
||||
if ((pw = getpwnam(RAD_USER)) == NULL)
|
||||
fatal("getpwnam");
|
||||
|
||||
if (chroot(pw->pw_dir) == -1)
|
||||
fatal("chroot");
|
||||
if (chdir("/") == -1)
|
||||
fatal("chdir(\"/\")");
|
||||
|
||||
if (setgroups(1, &pw->pw_gid) ||
|
||||
setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) ||
|
||||
setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid))
|
||||
fatal("can't drop privileges");
|
||||
#+end_src
|
||||
|
||||
We first get a user with [[https://man.openbsd.org/getpwnam.3][getpwnam(3)]] to drop to. The user has
|
||||
=/var/empty= configured as its home directory so we can use that in
|
||||
chroot(2). Next we [[https://man.openbsd.org/chdir.2][chdir(2)]] to the new file-system root to have a
|
||||
valid current working directory. This prevents us accidentally marking
|
||||
a file-system as busy depending on from where the daemon was started,
|
||||
preventing unmounting file-systems while the daemon is running.
|
||||
|
||||
We then drop privileges by putting the user into a single group using
|
||||
[[https://man.openbsd.org/setgroups.2][setgroups(2)]]. The calls to [[https://man.openbsd.org/setresuid.2][setresgid(2) and setresuid(2)]] set the
|
||||
real, effective and saved group and user IDs. This safely drops from
|
||||
=root:wheel= to (in this case) =_rad:_rad= with no way to escalate
|
||||
back to =root=.
|
||||
|
||||
This technique is probably used in most, if not all, of OpenBSD's
|
||||
privilege separated daemons.
|
||||
|
||||
* Restricted-service operating mode
|
||||
With privilege drop in ping(8) we prevent a local unprivileged user from
|
||||
gaining superuser or root privileges. If there were a [[https://www.freebsd.org/security/advisories/FreeBSD-SA-22:15.ping.asc][bug in message
|
||||
parsing]][fn::I do not want to heckle FreeBSD, it is just that it is a
|
||||
good illustration for what we are currently discussing. FreeBSD's
|
||||
ping(8) is using capsicum, so it is well locked away, too. And it is
|
||||
not like I am not making any [[https://ftp.openbsd.org/pub/OpenBSD/patches/7.0/common/017_slaacd.patch.sig][mistakes]]...], a malicious ping target, or
|
||||
even host in the middle, could still read and exfiltrate ssh
|
||||
private keys. ping(8) runs as my user-id. It can read all files my
|
||||
user can read, it can open network connections to any host on the
|
||||
internet, it can execute arbitrary programs, heck it can talk to my
|
||||
GPU. That is a lot of power that it does not need. It only needs to
|
||||
write to the terminal and send and receive ICMP packets.
|
||||
|
||||
We could lock ping(8) away using chroot(2), that at least takes away
|
||||
file-system access. But what can we do about programs that need
|
||||
file-system access but should not execute other programs or talk to
|
||||
the Internet? Like [[https://man.openbsd.org/file.1][file(1)]] for example.
|
||||
|
||||
Decades ago Niels Provos developed [[http://man.openbsd.org/OpenBSD-5.9/man4/systrace.4][systrace(4)]] but it turned out that
|
||||
it was difficult to use. The only user in OpenBSD base was [[http://cvsweb.openbsd.org/src/usr.bin/ssh/Attic/sandbox-systrace.c?rev=1.18&content-type=text/x-cvsweb-markup][sshd(8)]].
|
||||
|
||||
In 2015 Theo de Raadt tricked Nicholas Marriott into privilege
|
||||
separating and sand-boxing file(1) using systrace. [[http://cvsweb.openbsd.org/src/usr.bin/file/Attic/sandbox.c][It lasted for half
|
||||
a year]], it was that painful.
|
||||
|
||||
One problem with systrace(4) was that it worked on the level of
|
||||
syscalls and their arguments. This is not something user-land
|
||||
developers are intimately familiar with. We are interacting with libc
|
||||
and do not know what kind of syscalls libc does on our behalf. Another
|
||||
issue is that a program might need some [[https://man.openbsd.org/ioctl.2][ioctl(2)]]s or [[https://man.openbsd.org/sysctl.2][sysctl(2)]]s but it
|
||||
should not be able to do all of them. So we need to encode
|
||||
restrictions on arguments of syscalls. This gets unwieldy, fast.
|
||||
|
||||
There was also [[http://man.openbsd.org/OpenBSD-5.9/systrace.1][systrace(1)]] to define a policy outside of the
|
||||
program. It turns out that most programs need to do some sort of
|
||||
initialization where they need wide access to the system. This is
|
||||
before they touch untrusted data. Once the initialization is done we
|
||||
can restrict access. How much we can restrict access can depend on
|
||||
command line flags. systrace(1) could not help with this, the program
|
||||
would retain all the privileges it needed for initialization. They
|
||||
would be fewer than all privileges, but still way too many.
|
||||
|
||||
As far as I know, the experience with file(1) was the last straw. Theo
|
||||
set out to improve on this situation by developing tame(2), which was
|
||||
later renamed to [[https://man.openbsd.org/pledge.2][pledge(2)]][fn::It turned out that it was difficult to
|
||||
use tame(2) in a sentence when presenting the concept, hence the
|
||||
rename to pledge(2).].
|
||||
|
||||
pledge(2) was developed by studying all programs in OpenBSD base and
|
||||
putting their needed services into categories using broad strokes like
|
||||
/memory management/, /read-write on open file descriptors/, /opening
|
||||
of files/, or /networking/. If a program violates what it pledged to
|
||||
do, for example trying to open a file when it did not pledge =rpath=,
|
||||
it will be terminated with an uncatchable =SIGABRT=.
|
||||
|
||||
It is worth repeating that: If a program violates what it pledged to
|
||||
do it will be *terminated* by the kernel. An attacker does not get to
|
||||
play again and try something else[fn::Needless to say that I despise
|
||||
init systems that restart services when they crash.].
|
||||
|
||||
This was an iterative process with patches floating around. A few
|
||||
co-conspirators, including myself, joined the effort a bit later to
|
||||
add pledge(2) to more programs. Once we hit 50 or so pledged programs,
|
||||
Theo considered it mature enough for commit and work continued in
|
||||
tree. Soon after, the list of programs not pledged at all was shorter
|
||||
than the list of pledged programs. This is a huge success and speaks
|
||||
to the usability of pledge(2). In decades we only had one program
|
||||
using systrace(4) in OpenBSD base, then pledge(2) shows up and in less
|
||||
than a year nearly all of OpenBSD base uses it.
|
||||
|
||||
To add pledge(2) to a program we need to know what it does and
|
||||
potentially re-factor it to pull (hoist) one-time initialization up
|
||||
before pledge(2) is called for the first time. Some programs are
|
||||
sloppy in the sense that they open a certain resource the moment they
|
||||
need it, this means that they retain more access than they need. As
|
||||
we have seen with ping(8), if we pull opening of the raw socket before
|
||||
option parsing we can drop root privileges before touching untrusted
|
||||
data[fn::With a set-user-ID root program command line options are
|
||||
untrusted data!].
|
||||
|
||||
Since pledge(2) is internal to the program we can call it once we are
|
||||
done with option parsing and pledge different things depending on
|
||||
given options. [[https://github.com/openbsd/src/blob/master/sbin/ping/ping.c#L770][For example, ping(8) retains the ability to do DNS
|
||||
lookups depending on the =-n= flag]]:
|
||||
#+begin_src C
|
||||
if (options & F_HOSTNAME) {
|
||||
if (pledge("stdio inet dns", NULL) == -1)
|
||||
err(1, "pledge");
|
||||
} else {
|
||||
if (pledge("stdio inet", NULL) == -1)
|
||||
err(1, "pledge");
|
||||
}
|
||||
#+end_src
|
||||
|
||||
pledge(2) is not fine-grained. It turns out that programs fall into
|
||||
broad categories of what they want to do after initialization. There
|
||||
are not hundreds of different promises for every obscure program, it
|
||||
is not needed. To add a new promise, a rule of thumb is: At least two
|
||||
programs have been identified that need a new promise. To add a
|
||||
syscall to an existing promise, that is, to give more power to an
|
||||
existing promise, needs careful evaluation of what all the other
|
||||
programs already using the promise, gain. It is not enough to show
|
||||
that it is fine for the new program, existing programs are much more
|
||||
important. Another question is how much additional kernel attack
|
||||
surface this exposes.
|
||||
|
||||
pledge(2) does not only protect the user of the system or systems on
|
||||
the Internet from harm when a bug is found, it also protects the
|
||||
kernel from user-land. Checking if a syscall is allowed happens early,
|
||||
before a lot of kernel code runs.
|
||||
|
||||
pledge(2) can be used to gain understanding on what a program
|
||||
does. We see the following pledges in file(1):
|
||||
#+begin_src shell
|
||||
$ cat -n file.c | fgrep 'pledge("'
|
||||
171 if (pledge("stdio rpath getpw recvfd sendfd id proc", NULL) == -1)
|
||||
210 if (pledge("stdio rpath sendfd", NULL) == -1)
|
||||
374 if (pledge("stdio getpw recvfd id", NULL) == -1)
|
||||
389 if (pledge("stdio recvfd", NULL) == -1)
|
||||
#+end_src
|
||||
|
||||
The reader is encouraged to stop here and read [[https://github.com/openbsd/src/blob/master/usr.bin/file/file.c][the code]] around those
|
||||
pledge(2) calls to figure out what file does and why it pledges those
|
||||
things.
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
On line 171, file(1) pledges all the things it needs. It can
|
||||
read-write already open file descriptors[fn::It can only read if the
|
||||
file descriptor was opened for reading and only write if the file
|
||||
descriptor was opened for writing. Since file(1) cannot open files for
|
||||
writing or create new files, it cannot write to disk.], open files for
|
||||
reading, pass file descriptors around, figure out under which user it
|
||||
runs and lookup another user. It can also fork itself.
|
||||
|
||||
On line 210 it forked itself and we are in the parent process. It can
|
||||
shed all pledges that the forked child needs for its initialization as
|
||||
well as the ability to fork another instance. The parent process can
|
||||
now only open files for reading, pass those file descriptors to the
|
||||
child process and read-write already open file descriptors.
|
||||
|
||||
On line 374 we are in the child process and we shed all pledges that
|
||||
the parent needed. We need to be able to read-write open file
|
||||
descriptors and receive file descriptors from the parent
|
||||
process. During initialization we need to know if we are running
|
||||
as root and be able to look up a user. file(1) can privilege drop to a
|
||||
dedicated user when invoked as root! Once we are done with that, the
|
||||
child process, which does all the magic[fn:: Pun intended.], sheds all
|
||||
those additional privileges and on 389 it can only read-write existing
|
||||
file descriptors and receive new file descriptors from the parent
|
||||
process.
|
||||
|
||||
We see file(1) is privilege separated, running two processes. One
|
||||
process opens files but does not look at the contents. The other
|
||||
process, which is completely locked away, parses untrusted data and
|
||||
informs the parent process about the result.
|
||||
|
||||
Neither process can talk to the Internet or write to disk. They cannot
|
||||
create new files or open existing files for writing. While it can
|
||||
read ssh private keys, it cannot exfiltrate them.
|
||||
|
||||
We can find out what file(1) can and cannot do from just those four
|
||||
pledge lines, that is very powerful when starting a code review. They
|
||||
can also be used to get a quick overview how file(1) operates by
|
||||
reading the code adjacent to the pledge(2) calls, that is where
|
||||
interesting stuff happens.
|
||||
|
||||
* Privilege separation
|
||||
A single process design that pledges ~"stdio inet rpath"~ still has a
|
||||
lot of attack surface. This is not good if it is a network daemon
|
||||
running as root and enabled per default on all installations. Like
|
||||
[[https://man.openbsd.org/dhcpleased.8][dhcpleased(8)]] for example. For starters it can read and exfiltrate ssh
|
||||
private keys.
|
||||
|
||||
As we have seen with file(1), we can split up a program into multiple
|
||||
communicating processes that each pledge fewer operations than the sum
|
||||
of all pledges. We can move the risky operations of parsing untrusted
|
||||
data to a process that does not have access to the Internet, nor the
|
||||
file-system. That process will also not have any elevated privileges
|
||||
to change the system configuration like configuring network addresses
|
||||
or changing the routing table.
|
||||
|
||||
An attacker who finds a loophole into this least privileged process
|
||||
will have a hard time creating havoc. They can only talk to more
|
||||
privileged processes using a very narrow communication channel with
|
||||
easy to parse[fn::When using the imsg framework, it is not even parsing, nor
|
||||
data marshalling. It is raw C structs of a fixed, and known at compile
|
||||
time, size.] messages.
|
||||
|
||||
* OpenBSD network daemons
|
||||
We will look in detail at how dhcpleased(8) uses privilege separation
|
||||
and pledge(2) to implement a DHCP client in a safe way. Other network
|
||||
daemons follow a similar pattern and the reader should be able to
|
||||
study the source code of rad(8), slaacd(8), and unwind(8) to find out
|
||||
how those daemons work since they share a common ancestry.
|
||||
|
||||
** Overview
|
||||
dhcpleased(8) uses three communicating processes to implement
|
||||
privilege separation: /parent/, /engine/, and /frontend/. The /parent/
|
||||
process retains its root privileges to make changes to the system like
|
||||
configuring IP addresses. This process must be protected from the
|
||||
outside world and we make sure that it does not interact with
|
||||
untrusted data. The /frontend/ process interacts with the outside
|
||||
world, for example by sending and receiving DHCP messages. But it does
|
||||
not parse DHCP messages it receives because that is untrusted data. It
|
||||
is the /engine/'s job to parse DHCP messages and implement the state
|
||||
machine for the DHCP protocol. The /engine/ process is completely
|
||||
locked away.
|
||||
|
||||
dhcpleased(8) has a [[https://man.openbsd.org/dhcpleased.conf.5][configuration file]], following the typical OpenBSD
|
||||
syntax. For example, to send a custom vendor-class option in a
|
||||
~DHCPDISCOVER~ or ~DHCPREQUEST~ packet, the configuration looks like
|
||||
this:
|
||||
#+begin_src
|
||||
interface vio0 {
|
||||
send vendor class id "foobar"
|
||||
}
|
||||
#+end_src
|
||||
Most configuration files on OpenBSD use this kind of syntax, from
|
||||
[[https://man.openbsd.org/cwmrc.5][cwmrc(5)]] to [[https://man.openbsd.org/pf.conf.5][pf.conf(5)]].
|
||||
|
||||
It also comes with a control program, [[https://man.openbsd.org/dhcpleasectl.8][dhcpleasectl(8)]], to interact
|
||||
with the running daemon. Many OpenBSD daemons provide a similar tool
|
||||
to interact with them.
|
||||
|
||||
These are the features that come for free when using an existing
|
||||
OpenBSD network daemon as a template to write a new one: It will be
|
||||
privilege separated and using pledge(2) as a guide on how
|
||||
functionality should be split up between the processes. It will have a
|
||||
configuration file that uses a syntax that people familiar with
|
||||
OpenBSD will find easy to use. And there is a control process to
|
||||
interact with the running daemon. And finally there is a logging
|
||||
framework that handles logging to syslog or =stderr=. All this
|
||||
scaffolding and tooling is already there, we just need to swap out the
|
||||
specific code the old daemon uses and replace it with something new.
|
||||
|
||||
** Initialization
|
||||
dhcpleased(8) comes to life in [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.c#L144][int main(int argc, char *argv[]) in
|
||||
dhcpleased.c]]. After a bit of house keeping and argument parsing it
|
||||
ends up [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.c#L240][creating communications channels]] to talk to the
|
||||
/frontend/ and /engine/ processes and starts those two child
|
||||
processes:
|
||||
#+begin_src C
|
||||
if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK,
|
||||
PF_UNSPEC, pipe_main2frontend) == -1)
|
||||
fatal("main2frontend socketpair");
|
||||
if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK,
|
||||
PF_UNSPEC, pipe_main2engine) == -1)
|
||||
fatal("main2engine socketpair");
|
||||
|
||||
/* Start children. */
|
||||
engine_pid = start_child(PROC_ENGINE, saved_argv0, pipe_main2engine[1],
|
||||
debug, verbose);
|
||||
frontend_pid = start_child(PROC_FRONTEND, saved_argv0,
|
||||
pipe_main2frontend[1], debug, verbose);
|
||||
#+end_src
|
||||
~start_child()~ [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.c#L401][ensures]] that after forking the file descriptor the
|
||||
child process can use to talk to the parent process has number three:
|
||||
#+begin_src C
|
||||
if (fd != 3) {
|
||||
if (dup2(fd, 3) == -1)
|
||||
#+end_src
|
||||
It then [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.c#L407][sets up]] an =argv= array for [[https://man.openbsd.org/execv.3][execvp(3)]] to re-exec itself. The
|
||||
flags =-E= and =-F= control if the child process runs as /frontend/ or
|
||||
/engine/ process:
|
||||
#+begin_src C
|
||||
argv[argc++] = argv0;
|
||||
switch (p) {
|
||||
case PROC_MAIN:
|
||||
fatalx("Can not start main process");
|
||||
case PROC_ENGINE:
|
||||
argv[argc++] = "-E";
|
||||
break;
|
||||
case PROC_FRONTEND:
|
||||
argv[argc++] = "-F";
|
||||
break;
|
||||
}
|
||||
if (debug)
|
||||
argv[argc++] = "-d";
|
||||
if (verbose)
|
||||
argv[argc++] = "-v";
|
||||
if (verbose > 1)
|
||||
argv[argc++] = "-v";
|
||||
argv[argc++] = NULL;
|
||||
|
||||
execvp(argv0, argv);
|
||||
#+end_src
|
||||
Using fork & exec ensures that the child processes get a different
|
||||
memory layout. If there is an information leak in one process it
|
||||
cannot be used by an attacker to find gadgets in a different,
|
||||
potentially more privileged process.
|
||||
|
||||
Going back to the main function, [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.c#L200][after option parsing]] we know if we
|
||||
are still in the parent process or in /engine/ or /frontend/ process:
|
||||
#+begin_src C
|
||||
if (engine_flag)
|
||||
engine(debug, verbose);
|
||||
else if (frontend_flag)
|
||||
frontend(debug, verbose);
|
||||
#+end_src
|
||||
|
||||
The ~engine()~ and ~frontend()~ functions live in [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/engine.c#L177][engine.c]] and
|
||||
[[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/frontend.c#L131][frontend.c]] respectively. Neither returns to the =main()= function.
|
||||
|
||||
In the initialization of the child processes we drop privileges and
|
||||
pledge what the process needs to run. The /engine/ process pledges
|
||||
=stdio recvfd= and the /frontend/ process =stdio unix recvfd route=.
|
||||
|
||||
We then set-up the [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/frontend.c#L180][communication channel]] to the /parent/ (also known
|
||||
as the /main/) process:
|
||||
#+begin_src C
|
||||
imsg_init(&iev_main->ibuf, 3);
|
||||
iev_main->handler = frontend_dispatch_main;
|
||||
#+end_src
|
||||
As mentioned before we force the file descriptor to number 3 in
|
||||
~start_child()~ so the child process knows how it can reach the
|
||||
/parent/ process. We are using [[https://man.openbsd.org/event_init.3][libevent]] to call functions when an
|
||||
event happens on a file descriptor. Here ~frontend_dispatch_main~ is
|
||||
called when we receive a message from the /parent/ process.
|
||||
There is also a function ~frontend_dispatch_engine~ for messages from
|
||||
the /engine/ process. The naming scheme is
|
||||
~RECEIVINGPROCESS_dispatch_SENDINGPROCESS~ and there are functions in
|
||||
=engine.c= and =dhcpleased.c= to have a full mesh of communication
|
||||
channels between all three processes.
|
||||
|
||||
Now that we have started the child processes and hooked up the
|
||||
communication channels between /parent/ and /frontend/ as well as
|
||||
/parent/ and /engine/, it is time to send our first message from the
|
||||
/parent/ to both children. The child processes pledged =recvfd=, which
|
||||
allows them to receive open file descriptors over an existing open
|
||||
file descriptor. The /parent/ process calls
|
||||
[[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.c#L292][main_imsg_send_ipc_sockets()]] to create another socket pair and pass
|
||||
the end points to /engine/ and /frontend/ to create a full mesh.
|
||||
|
||||
The file descriptor is received by [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/engine.c#L387][engine_dispatch_main()]] and
|
||||
[[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/frontend.c#L232][frontend_dispatch_main()]] using a message type of =IMSG_SOCKET_IPC=.
|
||||
|
||||
After receiving this file descriptor, /engine/ can [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/engine.c#L446][drop]] the =recvfd=
|
||||
pledge and only pledges =stdio=. It no longer expects any more file
|
||||
descriptors.
|
||||
|
||||
The start-up of /frontend/ is a bit more complicated. It needs to
|
||||
receive a route socket from the /parent/ process to learn of
|
||||
interfaces gaining or losing the =autoconf= flag during runtime. Once
|
||||
it received the =route= socket [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/frontend.c#L329][from the parent process]] it can get a
|
||||
list of all interfaces that already have the =autoconf= flag and drop
|
||||
the =route= pledge in [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/frontend.c#L650][frontend_startup()]] afterwards:
|
||||
#+begin_src C
|
||||
frontend_startup(void)
|
||||
{
|
||||
if (!event_initialized(&ev_route))
|
||||
fatalx("%s: did not receive a route socket from the main "
|
||||
"process", __func__);
|
||||
|
||||
init_ifaces();
|
||||
if (pledge("stdio unix recvfd", NULL) == -1)
|
||||
fatal("pledge");
|
||||
event_add(&ev_route, NULL);
|
||||
}
|
||||
#+end_src
|
||||
|
||||
It still needs to hold on to =unix= for communication with
|
||||
dhcpleasectl(8) and =recvfd= for receiving [[https://man.openbsd.org/bpf.4][bpf(4)]] sockets when new
|
||||
interfaces are set to =autoconf= with ifconfig(8).
|
||||
|
||||
The astute reader will notice that we have not talked about pledging
|
||||
the /parent/ process. Unfortunately that is not possible because there
|
||||
is no pledge that would allow opening and programming a new bpf(4)
|
||||
socket and we need to create a new one when an interface is set to
|
||||
=autoconf= while dhcpleased(8) is already running.
|
||||
|
||||
However, not all is lost. The parent process, which has to keep
|
||||
running as root, is not touching any untrusted data. An attacker needs
|
||||
to go through the /frontend/ or /engine/ process to gain a foothold in
|
||||
the /parent/ process. And they need to do this via the
|
||||
=main_dispatch_frontend= and =main_dispatch_engine= functions. We will
|
||||
look at those in a bit to see why that is a very difficult
|
||||
proposition.
|
||||
|
||||
And there is more. We can restrict the amount of havoc an attacker
|
||||
can cause if they ever get all the way to the /parent/ process using
|
||||
[[https://man.openbsd.org/unveil.2][unveil(2)]]:
|
||||
#+begin_src C
|
||||
if (unveil(conffile, "r") == -1)
|
||||
fatal("unveil %s", conffile);
|
||||
if (unveil("/dev/bpf", "rw") == -1)
|
||||
fatal("unveil /dev/bpf");
|
||||
|
||||
if (unveil(_PATH_LEASE, "rwc") == -1) {
|
||||
no_lease_files = 1;
|
||||
log_warn("disabling lease files, unveil " _PATH_LEASE);
|
||||
}
|
||||
|
||||
if (unveil(NULL, NULL) == -1)
|
||||
fatal("unveil");
|
||||
#+end_src
|
||||
|
||||
It turns out that the parent process needs very little access to the
|
||||
file system. It needs access to the dhcpleased(8) config file, the
|
||||
bpf(4) device and the directory where lease files are stored. That
|
||||
list of files and directory does not include access to ssh private
|
||||
keys.
|
||||
|
||||
** Dispatching
|
||||
As we said the /parent/ process is not touching any untrusted data,
|
||||
that is left to the /frontend/ and /engine/ process. The /frontend/
|
||||
and /engine/ processes send imsg messages to the /parent/
|
||||
process. Let's have a look at how those messages arrive and what the
|
||||
/parent/ process does with them.
|
||||
|
||||
Messages from the /frontend/ process arrive at
|
||||
[[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.c#L431][main_dispatch_frontend()]] in dhcpleased.c:
|
||||
#+begin_src C
|
||||
void
|
||||
main_dispatch_frontend(int fd, short event, void *bula)
|
||||
{
|
||||
// [...]
|
||||
uint32_t if_index;
|
||||
#ifndef SMALL
|
||||
int verbose;
|
||||
#endif /* SMALL */
|
||||
// [...]
|
||||
for (;;) {
|
||||
if ((n = imsg_get(ibuf, &imsg)) == -1)
|
||||
fatal("imsg_get");
|
||||
if (n == 0) /* No more messages. */
|
||||
break;
|
||||
|
||||
switch (imsg.hdr.type) {
|
||||
case IMSG_OPEN_BPFSOCK:
|
||||
if (IMSG_DATA_SIZE(imsg) != sizeof(if_index))
|
||||
fatalx("%s: IMSG_OPEN_BPFSOCK wrong length: "
|
||||
"%lu", __func__, IMSG_DATA_SIZE(imsg));
|
||||
memcpy(&if_index, imsg.data, sizeof(if_index));
|
||||
open_bpfsock(if_index);
|
||||
break;
|
||||
#ifndef SMALL
|
||||
case IMSG_CTL_RELOAD:
|
||||
if (main_reload() == -1)
|
||||
log_warnx("configuration reload failed");
|
||||
else
|
||||
log_warnx("configuration reloaded");
|
||||
break;
|
||||
case IMSG_CTL_LOG_VERBOSE:
|
||||
if (IMSG_DATA_SIZE(imsg) != sizeof(verbose))
|
||||
fatalx("%s: IMSG_CTL_LOG_VERBOSE wrong length: "
|
||||
"%lu", __func__, IMSG_DATA_SIZE(imsg));
|
||||
memcpy(&verbose, imsg.data, sizeof(verbose));
|
||||
log_setverbose(verbose);
|
||||
break;
|
||||
#endif /* SMALL */
|
||||
case IMSG_UPDATE_IF:
|
||||
if (IMSG_DATA_SIZE(imsg) != sizeof(imsg_ifinfo))
|
||||
fatalx("%s: IMSG_UPDATE_IF wrong length: %lu",
|
||||
__func__, IMSG_DATA_SIZE(imsg));
|
||||
memcpy(&imsg_ifinfo, imsg.data, sizeof(imsg_ifinfo));
|
||||
read_lease_file(&imsg_ifinfo);
|
||||
main_imsg_compose_engine(IMSG_UPDATE_IF, -1,
|
||||
&imsg_ifinfo, sizeof(imsg_ifinfo));
|
||||
break;
|
||||
default:
|
||||
log_debug("%s: error handling imsg %d", __func__,
|
||||
imsg.hdr.type);
|
||||
break;
|
||||
}
|
||||
imsg_free(&imsg);
|
||||
}
|
||||
// [...]
|
||||
}
|
||||
#+end_src
|
||||
We see that it handles four message types =IMSG_OPEN_BPFSOCK=,
|
||||
=IMSG_CTL_RELOAD=, =IMSG_CTL_LOG_VERBOSE= and =IMSG_UPDATE_IF=.
|
||||
|
||||
One of them, =IMSG_CTL_RELOAD=, does not have any payload data, the
|
||||
/parent/ process just performs a predefined action. The /frontend/
|
||||
process cannot influence how the action is performed, only when it is
|
||||
performed.
|
||||
|
||||
For the other three, the /frontend/ process sends a piece of data that
|
||||
influences how the action is performed. The piece of data has a fixed,
|
||||
known at compile time, length. For example =IMSG_OPEN_BPFSOCK= is sent
|
||||
by the /frontend/ process when it learns of a new network interface
|
||||
gaining the /autoconf/ flag. It needs a new bpf(4) socket to send and
|
||||
receive DHCP messages on that interface[fn::bpf(4) sockets are bound
|
||||
to a specific interface and then locked, meaning the /frontend/
|
||||
process will not be able to change anything about the socket. That
|
||||
also means that the /frontend/ process cannot send and receive
|
||||
arbitrary messages but only those that match the bpf filter.]. It
|
||||
sends a single =uint32_t= that uniquely identifies the network
|
||||
interface in the system. If it sends something else, we assume the
|
||||
/frontend/ process has been compromised and is no longer trusted. The
|
||||
/parent/ process terminates itself:
|
||||
#+begin_src C
|
||||
if (IMSG_DATA_SIZE(imsg) != sizeof(if_index))
|
||||
fatalx("%s: IMSG_OPEN_BPFSOCK wrong length: "
|
||||
"%lu", __func__, IMSG_DATA_SIZE(imsg));
|
||||
#+end_src
|
||||
We are using the same pattern of checking the data size for all the
|
||||
other message types sent from the /frontend/ process. The same is true
|
||||
for how [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.c#L514][main_dispatch_engine()]] handles the messages sent by the
|
||||
/engine/ process.
|
||||
|
||||
The /frontend/ process sends the =IMSG_OPEN_BPFSOCK= message using
|
||||
[[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/frontend.c#L632][frontend_imsg_compose_main()]] in =frontend.c=. The sending function
|
||||
names follow a pattern of
|
||||
~SENDINGPROCESS_imsg_compose_RECEIVINGPROCESS~ and they are each a
|
||||
thin wrapper around ~imsg_compose_event()~ defined in =dhcpleased.c=
|
||||
which in turn is a thin wrapper around [[https://man.openbsd.org/imsg_init.3][imsg_compose(3)]].
|
||||
|
||||
A good way to understand what dhcpleased(8) is doing is searching
|
||||
for where all the [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/dhcpleased.h#L194][imsg_types]] are sent and received.
|
||||
|
||||
** Configuration file, control process and logging framework.
|
||||
We will not go into detail on how these features are implemented. They
|
||||
are not that relevant to the topic of privilege separation and the
|
||||
author is not an expert in LR(1) grammars or [[https://man.openbsd.org/yacc.1][yacc(1)]]. We will give
|
||||
some pointers to get interested readers started to learn on their own.
|
||||
|
||||
The parsed configuration file is stored and passed around in a ~struct
|
||||
dhcpleased_conf~ structure. The entry point into the parser is
|
||||
~parse_config()~ in [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/parse.y#L722][parse.y]]. parse.y contains a hand written lexer as
|
||||
well as the yacc(1) grammar. Changing or adding to the grammar of the
|
||||
configuration file is done in three places:
|
||||
1. New tokens are added to the grammar at the beginning of the file
|
||||
using the =%token= keyword. Tokens are in all caps.
|
||||
2. Adding rules to the grammar, using the defined tokens
|
||||
3. Teaching the lexer about new tokens. This is done by adding to the
|
||||
~keywords~ array in the [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/parse.y#L357][lookup()]] function.
|
||||
|
||||
Passing =-nv= flags to dhcpleased(8) runs a configuration test and
|
||||
prints out the canonical form of the parsed configuration file. The
|
||||
code for this lives in [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/printconf.c][printconf.c]]. Care should be taken to print a
|
||||
valid configuration, i.e. one that can be passed to dhcpleased(8)
|
||||
again.
|
||||
|
||||
The source code for the control process can be found in
|
||||
[[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/usr.sbin/dhcpleasectl/dhcpleasectl.c][usr.sbin/dhcpleasectl/dhcpleasectl.c]]. Communication is done using
|
||||
[[https://man.openbsd.org/unix.4][UNIX-domain sockets]] and connections are accepted by the /frontend/
|
||||
process. Communication uses imsgs, and messages from the control
|
||||
process are handled by [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/control.c#L221][control_dispatch_imsg()]] in control.c. The main
|
||||
difference to the dispatch function between the dhcpleased(8)
|
||||
processes is that messages with invalid data sizes are ignored instead
|
||||
of exiting the daemon. This is done because everyone in the =wheel=
|
||||
group can send messages to the daemon and we do not want them to be
|
||||
able to make the daemon crash[fn::Or even everyone on the system in
|
||||
case of unwind(8).]:
|
||||
#+begin_src C
|
||||
case IMSG_CTL_LOG_VERBOSE:
|
||||
if (IMSG_DATA_SIZE(imsg) != sizeof(verbose))
|
||||
break;
|
||||
#+end_src
|
||||
Here dhcpleased(8) expects a single integer to set the verbosity of the
|
||||
daemon. If we get something else we just ignore the message using
|
||||
~break~.
|
||||
|
||||
Finally, the network daemons come with a logging framework that handles
|
||||
logging to syslog or =stderr= when the daemon runs in the foreground
|
||||
using =-d=. The code for this can be found in [[https://github.com/openbsd/src/blob/3c46ceeaef274bbef234dac63245c4b6567168d7/sbin/dhcpleased/log.c][log.c]].
|
||||
|
||||
* Epilogue
|
||||
Writing software in C with security in mind can be a lot of fun when
|
||||
standing on the shoulders of giants and having things like privilege
|
||||
separation and restricted-service operating mode in your toolbox.
|
||||
|
||||
I wrote two daemons, dhcpleased(8) and slaacd(8), that are enabled by
|
||||
default on every OpenBSD installation. We might eventually add a third
|
||||
one. Having the mitigations shown here as well as all the other
|
||||
mitigations constantly being added and enabled by default is what
|
||||
lets me sleep at night. I am cautiously optimistic that when a bug is
|
||||
found in dhcpleased(8) or slaacd(8) an attacker will have a hard time
|
||||
pivoting to arbitrary code execution as root.
|
Loading…
Reference in New Issue
Block a user