1{ config, lib, pkgs, ... }:
2
3with lib;
4
5let
6
# Build the container's init script: a small wrapper around the regular
# NixOS stage-2 init script.  When PRIVATE_NETWORK=1 the wrapper renames
# the container side of the veth pair to eth0, assigns the addresses and
# routes given by LOCAL_ADDRESS*/HOST_ADDRESS*, configures every extra
# veth listed in cfg.extraVeths, and finally execs the script passed as
# its first argument.
containerInit = (cfg:
  let
    # Shell fragment that brings up the container side of one extra veth
    # pair.  `veth` is that pair's entry in cfg.extraVeths; it is named
    # differently from the outer `cfg` so the two cannot be confused.
    renderExtraVeth = (name: veth:
      ''
        echo "Bringing ${name} up"
        ip link set dev ${name} up
        ${optionalString (veth.localAddress != null) ''
          echo "Setting ip for ${name}"
          ip addr add ${veth.localAddress} dev ${name}
        ''}
        ${optionalString (veth.localAddress6 != null) ''
          echo "Setting ip6 for ${name}"
          ip -6 addr add ${veth.localAddress6} dev ${name}
        ''}
        ${optionalString (veth.hostAddress != null) ''
          echo "Setting route to host for ${name}"
          ip route add ${veth.hostAddress} dev ${name}
        ''}
        ${optionalString (veth.hostAddress6 != null) ''
          echo "Setting route6 to host for ${name}"
          ip -6 route add ${veth.hostAddress6} dev ${name}
        ''}
      ''
    );
  in
    pkgs.writeScript "container-init"
    ''
      #! ${pkgs.stdenv.shell} -e

      # Initialise the container side of the veth pair.
      if [ "$PRIVATE_NETWORK" = 1 ]; then

        ip link set host0 name eth0
        ip link set dev eth0 up

        if [ -n "$LOCAL_ADDRESS" ]; then
          ip addr add $LOCAL_ADDRESS dev eth0
        fi
        if [ -n "$LOCAL_ADDRESS6" ]; then
          ip -6 addr add $LOCAL_ADDRESS6 dev eth0
        fi
        if [ -n "$HOST_ADDRESS" ]; then
          ip route add $HOST_ADDRESS dev eth0
          ip route add default via $HOST_ADDRESS
        fi
        if [ -n "$HOST_ADDRESS6" ]; then
          ip -6 route add $HOST_ADDRESS6 dev eth0
          ip -6 route add default via $HOST_ADDRESS6
        fi

        ${concatStringsSep "\n" (mapAttrsToList renderExtraVeth cfg.extraVeths)}
      fi

      # Start the regular stage 2 script.
      exec "$1"
    ''
);
66
# Turn one extraVeths entry into the matching systemd-nspawn flag.  Only
# the attribute name (the interface name) is used; the entry's value is
# ignored.
nspawnExtraVethArgs = ifaceName: _veth:
  "--network-veth-extra=" + ifaceName;
68
# Shell body of the container service's main script.  It prepares the
# container root directory and the per-container profile/gcroot
# directories, accumulates systemd-nspawn flags in $extraFlags from the
# environment (PRIVATE_NETWORK, HOST_BRIDGE, HOST_PORT, INTERFACES,
# MACVLANS) and from cfg.extraVeths, and finally execs systemd-nspawn
# with the init wrapper produced by containerInit.
#
# cfg attributes read here: extraVeths, additionalCapabilities, tmpfs.
startScript = cfg:
  ''
    mkdir -p -m 0755 "$root/etc" "$root/var/lib"
    mkdir -p -m 0700 "$root/var/lib/private" "$root/root" /run/containers
    if ! [ -e "$root/etc/os-release" ]; then
      touch "$root/etc/os-release"
    fi

    if ! [ -e "$root/etc/machine-id" ]; then
      touch "$root/etc/machine-id"
    fi

    mkdir -p -m 0755 \
      "/nix/var/nix/profiles/per-container/$INSTANCE" \
      "/nix/var/nix/gcroots/per-container/$INSTANCE"

    cp --remove-destination /etc/resolv.conf "$root/etc/resolv.conf"

    if [ "$PRIVATE_NETWORK" = 1 ]; then
      extraFlags+=" --network-veth"
      if [ -n "$HOST_BRIDGE" ]; then
        extraFlags+=" --network-bridge=$HOST_BRIDGE"
      fi
      if [ -n "$HOST_PORT" ]; then
        OIFS=$IFS
        IFS=","
        for i in $HOST_PORT
        do
            extraFlags+=" --port=$i"
        done
        IFS=$OIFS
      fi
    fi

    extraFlags+=" ${concatStringsSep " " (mapAttrsToList nspawnExtraVethArgs cfg.extraVeths)}"

    for iface in $INTERFACES; do
      extraFlags+=" --network-interface=$iface"
    done

    for iface in $MACVLANS; do
      extraFlags+=" --network-macvlan=$iface"
    done

    # If the host is 64-bit and the container is 32-bit, add a
    # --personality flag.
    ${optionalString (config.nixpkgs.system == "x86_64-linux") ''
      if [ "$(< ''${SYSTEM_PATH:-/nix/var/nix/profiles/per-container/$INSTANCE/system}/system)" = i686-linux ]; then
        extraFlags+=" --personality=x86"
      fi
    ''}

    # Run systemd-nspawn without startup notification (we'll
    # wait for the container systemd to signal readiness).
    exec ${config.systemd.package}/bin/systemd-nspawn \
      --keep-unit \
      -M "$INSTANCE" -D "$root" $extraFlags \
      $EXTRA_NSPAWN_FLAGS \
      --notify-ready=yes \
      --bind-ro=/nix/store \
      --bind-ro=/nix/var/nix/db \
      --bind-ro=/nix/var/nix/daemon-socket \
      --bind="/nix/var/nix/profiles/per-container/$INSTANCE:/nix/var/nix/profiles" \
      --bind="/nix/var/nix/gcroots/per-container/$INSTANCE:/nix/var/nix/gcroots" \
      --setenv PRIVATE_NETWORK="$PRIVATE_NETWORK" \
      --setenv HOST_BRIDGE="$HOST_BRIDGE" \
      --setenv HOST_ADDRESS="$HOST_ADDRESS" \
      --setenv LOCAL_ADDRESS="$LOCAL_ADDRESS" \
      --setenv HOST_ADDRESS6="$HOST_ADDRESS6" \
      --setenv LOCAL_ADDRESS6="$LOCAL_ADDRESS6" \
      --setenv HOST_PORT="$HOST_PORT" \
      --setenv PATH="$PATH" \
      ${optionalString (cfg.additionalCapabilities != null && cfg.additionalCapabilities != [])
        ''--capability="${concatStringsSep "," cfg.additionalCapabilities}"''
      } \
      ${optionalString (cfg.tmpfs != null && cfg.tmpfs != [])
        ''--tmpfs=${concatStringsSep " --tmpfs=" cfg.tmpfs}''
      } \
      ${containerInit cfg} "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init"
  '';
149
# Shell body run before the container starts.  It terminates any stale
# machined registration for $INSTANCE and deletes leftover host-side
# interfaces: ve-$INSTANCE / vb-$INSTANCE when PRIVATE_NETWORK=1, plus one
# interface per name in cfg.extraVeths.  Every command is best-effort
# (errors are discarded and the exit status is forced to success).
preStartScript = cfg:
  let
    # Only the interface names are needed to delete the extra veths.
    extraVethNames = attrNames cfg.extraVeths;
    deleteLink = name: "ip link del dev ${name} 2> /dev/null || true ";
  in
  ''
    # Clean up existing machined registration and interfaces.
    machinectl terminate "$INSTANCE" 2> /dev/null || true

    if [ "$PRIVATE_NETWORK" = 1 ]; then
      ip link del dev "ve-$INSTANCE" 2> /dev/null || true
      ip link del dev "vb-$INSTANCE" 2> /dev/null || true
    fi

    ${concatMapStringsSep "\n" deleteLink extraVethNames}
  '';
166
# Shell body run after the container has started.  With PRIVATE_NETWORK=1
# and no HOST_BRIDGE it brings up ve-$INSTANCE and assigns the host-side
# addresses and routes; it then configures the host side of every extra
# veth in cfg.extraVeths, and finally records the container leader PID in
# /run/containers/$INSTANCE.pid.
postStartScript = (cfg:
  let
    # Emit "<ipcmd> add <value> dev $ifaceHost".  If the attribute is set
    # in the configuration its value is baked into the script; otherwise
    # the script tests the given shell variable at run time.
    hostIpCmd = settings: ipcmd: variable: attribute:
      if settings.${attribute} != null then
        "${ipcmd} add ${settings.${attribute}} dev $ifaceHost"
      else
        ''
          if [ -n "${variable}" ]; then
            ${ipcmd} add ${variable} dev $ifaceHost
          fi
        '';

    # Host-side setup for one extra veth: either enslave it to the
    # configured bridge, or assign addresses and routes directly.
    hostSideOfExtraVeth = name: veth:
      if veth.hostBridge == null then
        ''
          # Set IPs and routes for ${name}
          ${optionalString (veth.hostAddress != null) ''
            ip addr add ${veth.hostAddress} dev ${name}
          ''}
          ${optionalString (veth.hostAddress6 != null) ''
            ip -6 addr add ${veth.hostAddress6} dev ${name}
          ''}
          ${optionalString (veth.localAddress != null) ''
            ip route add ${veth.localAddress} dev ${name}
          ''}
          ${optionalString (veth.localAddress6 != null) ''
            ip -6 route add ${veth.localAddress6} dev ${name}
          ''}
        ''
      else
        ''
          # Add ${name} to bridge ${veth.hostBridge}
          ip link set dev ${name} master ${veth.hostBridge} up
        '';

    extraVethSetup =
      concatStringsSep "\n" (mapAttrsToList hostSideOfExtraVeth cfg.extraVeths);
  in
    ''
      if [ "$PRIVATE_NETWORK" = 1 ]; then
        if [ -z "$HOST_BRIDGE" ]; then
          ifaceHost=ve-$INSTANCE
          ip link set dev $ifaceHost up

          ${hostIpCmd cfg "ip addr" "$HOST_ADDRESS" "hostAddress"}
          ${hostIpCmd cfg "ip -6 addr" "$HOST_ADDRESS6" "hostAddress6"}
          ${hostIpCmd cfg "ip route" "$LOCAL_ADDRESS" "localAddress"}
          ${hostIpCmd cfg "ip -6 route" "$LOCAL_ADDRESS6" "localAddress6"}
        fi
        ${extraVethSetup}
      fi

      # Get the leader PID so that we can signal it in
      # preStop. We can't use machinectl there because D-Bus
      # might be shutting down. FIXME: in systemd 219 we can
      # just signal systemd-nspawn to do a clean shutdown.
      machinectl show "$INSTANCE" | sed 's/Leader=\(.*\)/\1/;t;d' > "/run/containers/$INSTANCE.pid"
    ''
);
222
# systemd [Service] settings for a container unit.  The only part that
# depends on `cfg` is DeviceAllow, built from cfg.allowedDevices.
serviceDirectives = cfg:
  let
    # Reloading runs "switch-to-configuration test" inside the container.
    reloadScript = pkgs.writeScript "reload-container"
      ''
        #! ${pkgs.stdenv.shell} -e
        ${pkgs.nixos-container}/bin/nixos-container run "$INSTANCE" -- \
          bash --login -c "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/bin/switch-to-configuration test"
      '';

    # One "<node> <modifier>" entry per allowed device.
    deviceRule = dev: dev.node + " " + dev.modifier;
  in {
    Type = "notify";

    ExecReload = reloadScript;

    EnvironmentFile = "-/etc/containers/%i.conf";

    SyslogIdentifier = "container %i";

    Restart = "on-failure";

    # systemd-nspawn exits with 133 when the container reboots, so that
    # status forces a restart of the unit.  A poweroff exits with 0 and
    # leaves the unit stopped.
    RestartForceExitStatus = "133";
    SuccessExitStatus = "133";

    # Hack: systemd-nspawn must not be killed, because preStop already
    # asks the container to shut down cleanly.  systemd nevertheless
    # requires a signal to be sent (at least if leftover processes are
    # to be killed after the timeout), so send one that is ignored.
    KillMode = "mixed";
    KillSignal = "WINCH";

    DevicePolicy = "closed";
    DeviceAllow = map deviceRule cfg.allowedDevices;
  };
256
257
# Platform string of the host configuration; passed on to eval-config.nix
# when a declarative container's configuration is evaluated.
inherit (config.nixpkgs) system;
259
# Submodule describing one bind mount.  The attribute name under which
# the entry is declared is used as the default mount point.
bindMountOpts = { name, config, ... }: {

  config.mountPoint = mkDefault name;

  options = {
    mountPoint = mkOption {
      type = types.str;
      example = "/mnt/usb";
      description = "Mount point on the container file system.";
    };

    hostPath = mkOption {
      type = types.nullOr types.str;
      default = null;
      example = "/home/alice";
      description = "Location of the host path to be mounted.";
    };

    isReadOnly = mkOption {
      type = types.bool;
      default = true;
      description = "Determine whether the mounted path will be accessed in read-only mode.";
    };
  };

};
286
# Submodule describing one device node the container may access.  Both
# options are mandatory (neither has a default).
allowedDeviceOpts = { name, config, ... }: {
  options = {
    node = mkOption {
      type = types.str;
      example = "/dev/net/tun";
      description = "Path to device node";
    };

    modifier = mkOption {
      type = types.str;
      example = "rw";
      description = ''
        Device node access modifier. Takes a combination
        <literal>r</literal> (read), <literal>w</literal> (write), and
        <literal>m</literal> (mknod). See the
        <literal>systemd.resource-control(5)</literal> man page for more
        information.'';
    };
  };
};
306
307
# Render one bind-mount entry as a systemd-nspawn flag with a leading
# space: " --bind-ro=..." for read-only mounts, " --bind=..." otherwise.
# The argument is "hostPath:mountPoint", or just the mount point when no
# host path is given.
mkBindFlag = d:
  let
    target =
      if d.hostPath == null
      then "${d.mountPoint}"
      else "${d.hostPath}:${d.mountPoint}";
  in
    (if d.isReadOnly then " --bind-ro=" else " --bind=") + target;
312
# Concatenate the flags for every entry of a bindMounts attribute set.
# Each flag already starts with a space, so no separator is inserted.
mkBindFlags = bs: concatStrings (map mkBindFlag (attrValues bs));
314
# Network options shared by the container's primary veth pair and by each
# entry of extraVeths.  All string-valued options use types.str, so
# conflicting definitions raise an error instead of being silently
# concatenated (which is what types.string does).
networkOptions = {
  hostBridge = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "br0";
    description = ''
      Put the host-side of the veth-pair into the named bridge.
      Only one of hostAddress* or hostBridge can be given.
    '';
  };

  forwardPorts = mkOption {
    type = types.listOf (types.submodule {
      options = {
        protocol = mkOption {
          type = types.str;
          default = "tcp";
          description = "The protocol specifier for port forwarding between host and container";
        };
        hostPort = mkOption {
          type = types.int;
          description = "Source port of the external interface on host";
        };
        containerPort = mkOption {
          type = types.nullOr types.int;
          default = null;
          description = "Target port of container";
        };
      };
    });
    default = [];
    example = [ { protocol = "tcp"; hostPort = 8080; containerPort = 80; } ];
    description = ''
      List of forwarded ports from host to container. Each forwarded port
      is specified by protocol, hostPort and containerPort. By default,
      protocol is tcp and hostPort and containerPort are assumed to be
      the same if containerPort is not explicitly given.
    '';
  };


  hostAddress = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "10.231.136.1";
    description = ''
      The IPv4 address assigned to the host interface.
      (Not used when hostBridge is set.)
    '';
  };

  hostAddress6 = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "fc00::1";
    description = ''
      The IPv6 address assigned to the host interface.
      (Not used when hostBridge is set.)
    '';
  };

  localAddress = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "10.231.136.2";
    description = ''
      The IPv4 address assigned to the interface in the container.
      If a hostBridge is used, this should be given with netmask to access
      the whole network. Otherwise the default netmask is /32 and routing is
      set up from localAddress to hostAddress and back.
    '';
  };

  localAddress6 = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "fc00::2";
    description = ''
      The IPv6 address assigned to the interface in the container.
      If a hostBridge is used, this should be given with netmask to access
      the whole network. Otherwise the default netmask is /128 and routing is
      set up from localAddress6 to hostAddress6 and back.
    '';
  };

};
401
# Placeholder settings used to render the scripts of the generic
# "container@" template unit, for which no declarative configuration
# exists.  With every address left null the generated scripts fall back
# to the run-time environment variables.
dummyConfig = {
  hostAddress = null;
  hostAddress6 = null;
  localAddress = null;
  localAddress6 = null;
  extraVeths = {};
  allowedDevices = [];
  additionalCapabilities = [];
  tmpfs = null;
};
413
414in
415
416{
# Option declarations of the containers module: two boot.* switches and
# the `containers` attribute set, whose values are submodules describing
# one declarative container each.
options = {

  boot.isContainer = mkOption {
    type = types.bool;
    default = false;
    description = ''
      Whether this NixOS machine is a lightweight container running
      in another NixOS system.
    '';
  };

  # Container support is enabled by default everywhere except inside a
  # container.
  boot.enableContainers = mkOption {
    type = types.bool;
    default = !config.boot.isContainer;
    description = ''
      Whether to enable support for nixos containers.
    '';
  };

  containers = mkOption {
    type = types.attrsOf (types.submodule (
      { config, options, name, ... }:
      {
        options = {

          config = mkOption {
            description = ''
              A specification of the desired configuration of this
              container, as a NixOS module.
            '';
            # Custom type: all definitions are merged by evaluating them
            # together as a full NixOS configuration, with a few
            # container-specific settings prepended.
            type = lib.mkOptionType {
              name = "Toplevel NixOS config";
              merge = loc: defs: (import ../../lib/eval-config.nix {
                inherit system;
                modules =
                  let extraConfig =
                    { boot.isContainer = true;
                      networking.hostName = mkDefault name;
                      networking.useDHCP = false;
                    };
                  in [ extraConfig ] ++ (map (x: x.value) defs);
                prefix = [ "containers" name ];
              }).config;
            };
          };

          path = mkOption {
            type = types.path;
            example = "/nix/var/nix/profiles/containers/webserver";
            description = ''
              As an alternative to specifying
              <option>config</option>, you can specify the path to
              the evaluated NixOS system configuration, typically a
              symlink to a system profile.
            '';
          };

          additionalCapabilities = mkOption {
            type = types.listOf types.str;
            default = [];
            example = [ "CAP_NET_ADMIN" "CAP_MKNOD" ];
            description = ''
              Grant additional capabilities to the container.  See the
              capabilities(7) and systemd-nspawn(1) man pages for more
              information.
            '';
          };
          enableTun = mkOption {
            type = types.bool;
            default = false;
            description = ''
              Allows the container to create and setup tunnel interfaces
              by granting the <literal>NET_ADMIN</literal> capability and
              enabling access to <literal>/dev/net/tun</literal>.
            '';
          };

          privateNetwork = mkOption {
            type = types.bool;
            default = false;
            description = ''
              Whether to give the container its own private virtual
              Ethernet interface.  The interface is called
              <literal>eth0</literal>, and is hooked up to the interface
              <literal>ve-<replaceable>container-name</replaceable></literal>
              on the host.  If this option is not set, then the
              container shares the network interfaces of the host,
              and can bind to any port on any interface.
            '';
          };

          # types.str (not types.string) for consistency with `macvlans`
          # and to reject conflicting definitions instead of merging them.
          interfaces = mkOption {
            type = types.listOf types.str;
            default = [];
            example = [ "eth1" "eth2" ];
            description = ''
              The list of interfaces to be moved into the container.
            '';
          };

          macvlans = mkOption {
            type = types.listOf types.str;
            default = [];
            example = [ "eth1" "eth2" ];
            description = ''
              The list of host interfaces from which macvlans will be
              created. For each interface specified, a macvlan interface
              will be created and moved to the container.
            '';
          };

          extraVeths = mkOption {
            type = with types; attrsOf (submodule { options = networkOptions; });
            default = {};
            description = ''
              Extra veth-pairs to be created for the container
            '';
          };

          autoStart = mkOption {
            type = types.bool;
            default = false;
            description = ''
              Whether the container is automatically started at boot-time.
            '';
          };

          bindMounts = mkOption {
            type = with types; loaOf (submodule bindMountOpts);
            default = {};
            example = { "/home" = { hostPath = "/home/alice";
                                    isReadOnly = false; };
                      };

            description =
              ''
                An extra list of directories that is bound to the container.
              '';
          };

          allowedDevices = mkOption {
            type = with types; listOf (submodule allowedDeviceOpts);
            default = [];
            example = [ { node = "/dev/net/tun"; modifier = "rw"; } ];
            description = ''
              A list of device nodes to which the container has access.
            '';
          };

          tmpfs = mkOption {
            type = types.listOf types.str;
            default = [];
            example = [ "/var" ];
            description = ''
              Mounts a set of tmpfs file systems into the container.
              Multiple paths can be specified.
              Valid items must conform to the --tmpfs argument
              of systemd-nspawn. See systemd-nspawn(1) for details.
            '';
          };

        } // networkOptions;

        # When `config` is defined, `path` is derived from the evaluated
        # system's top-level build output.
        config = mkMerge
          [
            (mkIf options.config.isDefined {
              path = config.config.system.build.toplevel;
            })
          ];
      }));

    default = {};
    example = literalExample
      ''
        { webserver =
            { path = "/nix/var/nix/profiles/webserver";
            };
          database =
            { config =
                { config, pkgs, ... }:
                { services.postgresql.enable = true;
                  services.postgresql.package = pkgs.postgresql96;

                  system.stateVersion = "17.03";
                };
            };
        }
      '';
    description = ''
      A set of NixOS system configurations to be run as lightweight
      containers.  Each container appears as a service
      <literal>container-<replaceable>name</replaceable></literal>
      on the host system, allowing it to be started and stopped via
      <command>systemctl</command>.
    '';
  };

};
615
616
# Host-side implementation, active only when boot.enableContainers is set:
# the systemd units (the generic "container@" template plus one unit per
# declarative container), an /etc/containers/<name>.conf environment file
# per container, /etc/hosts entries, and rules that keep dhcpcd and
# NetworkManager away from the container interfaces.
config = mkIf (config.boot.enableContainers) (let

  # Unit definition shared by every container.  The template variant is
  # rendered from dummyConfig; declarative containers override the
  # scripts and service settings below.
  containerUnit = {
    description = "Container '%i'";

    unitConfig.RequiresMountsFor = [ "/var/lib/containers/%i" ];

    path = [ pkgs.iproute ];

    environment.INSTANCE = "%i";
    environment.root = "/var/lib/containers/%i";

    preStart = preStartScript dummyConfig;

    script = startScript dummyConfig;

    postStart = postStartScript dummyConfig;

    # Signal the leader PID recorded by postStart, then drop the PID file.
    preStop =
      ''
        pid="$(cat /run/containers/$INSTANCE.pid)"
        if [ -n "$pid" ]; then
          kill -RTMIN+4 "$pid"
        fi
        rm -f "/run/containers/$INSTANCE.pid"
      '';

    restartIfChanged = false;

    serviceConfig = serviceDirectives dummyConfig;
  };

  # enableTun implies access to /dev/net/tun and CAP_NET_ADMIN.
  withTunSettings = cfg: cfg // optionalAttrs cfg.enableTun {
    allowedDevices = cfg.allowedDevices
      ++ [ { node = "/dev/net/tun"; modifier = "rw"; } ];
    additionalCapabilities = cfg.additionalCapabilities
      ++ [ "CAP_NET_ADMIN" ];
  };

  # Name/value pair for the unit of one declarative container.
  mkDeclarativeUnit = name: cfg:
    let
      effective = withTunSettings cfg;
    in
      nameValuePair "container@${name}" (
        containerUnit // {
          preStart = preStartScript effective;
          script = startScript effective;
          postStart = postStartScript effective;
          serviceConfig = serviceDirectives effective;
        } // optionalAttrs effective.autoStart {
          wantedBy = [ "multi-user.target" ];
          wants = [ "network.target" ];
          after = [ "network.target" ];
          restartTriggers = [ effective.path ];
          reloadIfChanged = true;
        });

  # "<protocol>:<hostPort>:<containerPort>"; the container port falls
  # back to the host port when it is not given.
  mkPortStr = p:
    "${p.protocol}:${toString p.hostPort}:${toString (if p.containerPort == null then p.hostPort else p.containerPort)}";

  # Contents of /etc/containers/<name>.conf for one container.
  mkContainerConf = name: cfg:
    nameValuePair "containers/${name}.conf"
      { text =
          ''
            SYSTEM_PATH=${cfg.path}
            ${optionalString cfg.privateNetwork ''
              PRIVATE_NETWORK=1
              ${optionalString (cfg.hostBridge != null) ''
                HOST_BRIDGE=${cfg.hostBridge}
              ''}
              ${optionalString (cfg.forwardPorts != []) ''
                HOST_PORT=${concatStringsSep "," (map mkPortStr cfg.forwardPorts)}
              ''}
              ${optionalString (cfg.hostAddress != null) ''
                HOST_ADDRESS=${cfg.hostAddress}
              ''}
              ${optionalString (cfg.hostAddress6 != null) ''
                HOST_ADDRESS6=${cfg.hostAddress6}
              ''}
              ${optionalString (cfg.localAddress != null) ''
                LOCAL_ADDRESS=${cfg.localAddress}
              ''}
              ${optionalString (cfg.localAddress6 != null) ''
                LOCAL_ADDRESS6=${cfg.localAddress6}
              ''}
            ''}
            INTERFACES="${toString cfg.interfaces}"
            MACVLANS="${toString cfg.macvlans}"
            ${optionalString cfg.autoStart ''
              AUTO_START=1
            ''}
            EXTRA_NSPAWN_FLAGS="${mkBindFlags cfg.bindMounts}"
          '';
      };

  # "<address> <name>.containers" line; any "/prefix" suffix is dropped.
  mkHostsLine = name: cfg:
    optionalString (cfg.localAddress != null)
      "${head (splitString "/" cfg.localAddress)} ${name}.containers\n";

in {
  systemd.services = listToAttrs (filter (x: x.value != null) (
    # The generic container template used by imperative containers
    [ (nameValuePair "container@" containerUnit) ]
    # declarative containers
    ++ mapAttrsToList mkDeclarativeUnit config.containers
  ));

  # Generate a configuration file in /etc/containers for each
  # container so that container@.target can get the container
  # configuration.
  environment.etc = mapAttrs' mkContainerConf config.containers;

  # Generate /etc/hosts entries for the containers.
  networking.extraHosts =
    concatStrings (mapAttrsToList mkHostsLine config.containers);

  networking.dhcpcd.denyInterfaces = [ "ve-*" "vb-*" ];

  services.udev.extraRules = optionalString config.networking.networkmanager.enable ''
    # Don't manage interfaces created by nixos-container.
    ENV{INTERFACE}=="v[eb]-*", ENV{NM_UNMANAGED}="1"
  '';

  environment.systemPackages = [ pkgs.nixos-container ];
});
736}