1{
2 config,
3 lib,
4 pkgs,
5 ...
6}@host:
7
8with lib;
9
10let
11
# Systems whose stateVersion is at least 22.05 keep container files under
# "nixos-containers"; older state versions use plain "containers".
configurationPrefix =
  if lib.versionAtLeast config.system.stateVersion "22.05" then "nixos-" else "";
configurationDirectoryName = configurationPrefix + "containers";
configurationDirectory = "/etc/" + configurationDirectoryName;
stateDirectory = "/var/lib/" + configurationDirectoryName;

# The nixos-container package, rebuilt to use the directories computed above.
nixos-container = pkgs.nixos-container.override {
  stateDirectory = stateDirectory;
  configurationDirectory = configurationDirectory;
};
20
# The container's init script, a small wrapper around the regular
# NixOS stage-2 init script.
#
# Takes the container configuration `cfg` (only `cfg.extraVeths` is read
# here) and returns the script built by `pkgs.writeScript`. At run time the
# script configures the container side of the network interfaces from the
# HOST_*/LOCAL_* environment variables and then sources the script passed
# as its first argument.
containerInit = (
  cfg:
  let
    # Shell snippet that brings one extra veth interface up and applies
    # whichever addresses and routes are set for it. `name` is the interface
    # name; the second argument is that veth's option set (it shadows the
    # outer `cfg`).
    renderExtraVeth = (
      name: cfg: ''
        echo "Bringing ${name} up"
        ip link set dev ${name} up
        ${optionalString (cfg.localAddress != null) ''
          echo "Setting ip for ${name}"
          ip addr add ${cfg.localAddress} dev ${name}
        ''}
        ${optionalString (cfg.localAddress6 != null) ''
          echo "Setting ip6 for ${name}"
          ip -6 addr add ${cfg.localAddress6} dev ${name}
        ''}
        ${optionalString (cfg.hostAddress != null) ''
          echo "Setting route to host for ${name}"
          ip route add ${cfg.hostAddress} dev ${name}
        ''}
        ${optionalString (cfg.hostAddress6 != null) ''
          echo "Setting route6 to host for ${name}"
          ip -6 route add ${cfg.hostAddress6} dev ${name}
        ''}
      ''
    );
  in
  pkgs.writeScript "container-init" ''
    #! ${pkgs.runtimeShell} -e

    # Exit early if we're asked to shut down.
    trap "exit 0" SIGRTMIN+3

    # Initialise the container side of the veth pair.
    if [ -n "$HOST_ADDRESS" ] || [ -n "$HOST_ADDRESS6" ] ||
       [ -n "$LOCAL_ADDRESS" ] || [ -n "$LOCAL_ADDRESS6" ] ||
       [ -n "$HOST_BRIDGE" ]; then
      ip link set host0 name eth0
      ip link set dev eth0 up

      if [ -n "$LOCAL_ADDRESS" ]; then
        ip addr add $LOCAL_ADDRESS dev eth0
      fi
      if [ -n "$LOCAL_ADDRESS6" ]; then
        ip -6 addr add $LOCAL_ADDRESS6 dev eth0
      fi
      if [ -n "$HOST_ADDRESS" ]; then
        ip route add $HOST_ADDRESS dev eth0
        ip route add default via $HOST_ADDRESS
      fi
      if [ -n "$HOST_ADDRESS6" ]; then
        ip -6 route add $HOST_ADDRESS6 dev eth0
        ip -6 route add default via $HOST_ADDRESS6
      fi
    fi

    ${concatStringsSep "\n" (mapAttrsToList renderExtraVeth cfg.extraVeths)}

    # Start the regular stage 2 script.
    # We source instead of exec to not lose an early stop signal, which is
    # also the only _reliable_ shutdown signal we have since early stop
    # does not execute ExecStop* commands.
    set +e
    . "$1"
  ''
);
88
# Builds the systemd-nspawn flag requesting the extra veth pair `name`; the
# veth's own option set (second argument) is not needed for the flag.
nspawnExtraVethArgs = name: _veth: "--network-veth-extra=" + name;
90
# Shell script body used as the `script` of the container units.
#
# `cfg` is a container configuration; this function reads `cfg.extraVeths`,
# `cfg.ephemeral`, `cfg.additionalCapabilities` and `cfg.tmpfs` at evaluation
# time. Everything else is taken from environment variables when the script
# runs (`root`, `INSTANCE`, `FLAKE`, `PRIVATE_NETWORK`, `PRIVATE_USERS`,
# `HOST_*`, `LOCAL_*`, `INTERFACES`, `MACVLANS`, `SYSTEM_PATH`,
# `EXTRA_NSPAWN_FLAGS`, ...).
#
# The script prepares the container root directory and the per-container
# profile/gcroot directories, collects the systemd-nspawn flags in the
# `extraFlags` array, and finally replaces itself with systemd-nspawn, which
# runs the `containerInit` wrapper with the system's `init` as its argument.
startScript = cfg: ''
  # Declare root explicitly to avoid shellcheck warnings, it comes from the env
  declare root

  mkdir -p "$root/etc" "$root/var/lib"
  chmod 0755 "$root/etc" "$root/var/lib"
  mkdir -p "$root/var/lib/private" "$root/root" /run/nixos-containers
  chmod 0700 "$root/var/lib/private" "$root/root" /run/nixos-containers
  if ! [ -e "$root/etc/os-release" ] && ! [ -h "$root/etc/os-release" ]; then
    touch "$root/etc/os-release"
  fi

  if ! [ -e "$root/etc/machine-id" ]; then
    touch "$root/etc/machine-id"
  fi

  mkdir -p \
    "/nix/var/nix/profiles/per-container/$INSTANCE" \
    "/nix/var/nix/gcroots/per-container/$INSTANCE"
  chmod 0755 \
    "/nix/var/nix/profiles/per-container/$INSTANCE" \
    "/nix/var/nix/gcroots/per-container/$INSTANCE"

  cp --remove-destination /etc/resolv.conf "$root/etc/resolv.conf"

  if [ -n "$FLAKE" ] && [ ! -e "/nix/var/nix/profiles/per-container/$INSTANCE/system" ]; then
    # we create the etc/nixos-container config file, then if we utilize the update function, we can then build all the necessary system files for the container
    ${lib.getExe nixos-container} update "$INSTANCE"
  fi

  declare -a extraFlags

  if [ "$PRIVATE_NETWORK" = 1 ]; then
    extraFlags+=("--private-network")
  fi

  NIX_BIND_OPT=""
  if [ -n "$PRIVATE_USERS" ]; then
    extraFlags+=("--private-users=$PRIVATE_USERS")
    if [[
      "$PRIVATE_USERS" = "pick"
      || ("$PRIVATE_USERS" =~ ^[[:digit:]]+$ && "$PRIVATE_USERS" -gt 0)
    ]]; then
      # when user namespacing is enabled, we use `idmap` mount option so that
      # bind mounts under /nix get proper owner (and not nobody/nogroup).
      NIX_BIND_OPT=":idmap"
    fi
  fi

  if [ -n "$HOST_ADDRESS" ] || [ -n "$LOCAL_ADDRESS" ] ||
     [ -n "$HOST_ADDRESS6" ] || [ -n "$LOCAL_ADDRESS6" ]; then
    extraFlags+=("--network-veth")
  fi

  if [ -n "$HOST_PORT" ]; then
    OIFS=$IFS
    IFS=","
    for i in $HOST_PORT
    do
      extraFlags+=("--port=$i")
    done
    IFS=$OIFS
  fi

  if [ -n "$HOST_BRIDGE" ]; then
    extraFlags+=("--network-bridge=$HOST_BRIDGE")
  fi

  if [ -n "$NETWORK_NAMESPACE_PATH" ]; then
    extraFlags+=("--network-namespace-path=$NETWORK_NAMESPACE_PATH")
  fi

  extraFlags+=(${lib.escapeShellArgs (mapAttrsToList nspawnExtraVethArgs cfg.extraVeths)})

  for iface in $INTERFACES; do
    extraFlags+=("--network-interface=$iface")
  done

  for iface in $MACVLANS; do
    extraFlags+=("--network-macvlan=$iface")
  done

  # If the host is 64-bit and the container is 32-bit, add a
  # --personality flag.
  ${optionalString (pkgs.stdenv.hostPlatform.system == "x86_64-linux") ''
    if [ "$(< "''${SYSTEM_PATH:-/nix/var/nix/profiles/per-container/$INSTANCE/system}/system")" = i686-linux ]; then
      extraFlags+=("--personality=x86")
    fi
  ''}

  export SYSTEMD_NSPAWN_UNIFIED_HIERARCHY=1

  # Run systemd-nspawn without startup notification (we'll
  # wait for the container systemd to signal readiness)
  # Kill signal handling means systemd-nspawn will pass a system-halt signal
  # to the container systemd when it receives SIGTERM for container shutdown;
  # containerInit and stage2 have to handle this as well.
  # TODO: fix shellcheck issue properly
  # shellcheck disable=SC2086
  exec ${config.systemd.package}/bin/systemd-nspawn \
    --keep-unit \
    -M "$INSTANCE" -D "$root" "''${extraFlags[@]}" \
    --notify-ready=yes \
    --kill-signal=SIGRTMIN+3 \
    --bind-ro=/nix/store:/nix/store$NIX_BIND_OPT \
    --bind-ro=/nix/var/nix/db:/nix/var/nix/db$NIX_BIND_OPT \
    --bind-ro=/nix/var/nix/daemon-socket:/nix/var/nix/daemon-socket$NIX_BIND_OPT \
    --bind="/nix/var/nix/profiles/per-container/$INSTANCE:/nix/var/nix/profiles$NIX_BIND_OPT" \
    --bind="/nix/var/nix/gcroots/per-container/$INSTANCE:/nix/var/nix/gcroots$NIX_BIND_OPT" \
    ${optionalString (!cfg.ephemeral) "--link-journal=try-guest"} \
    --setenv PRIVATE_NETWORK="$PRIVATE_NETWORK" \
    --setenv PRIVATE_USERS="$PRIVATE_USERS" \
    --setenv HOST_BRIDGE="$HOST_BRIDGE" \
    --setenv HOST_ADDRESS="$HOST_ADDRESS" \
    --setenv LOCAL_ADDRESS="$LOCAL_ADDRESS" \
    --setenv HOST_ADDRESS6="$HOST_ADDRESS6" \
    --setenv LOCAL_ADDRESS6="$LOCAL_ADDRESS6" \
    --setenv HOST_PORT="$HOST_PORT" \
    --setenv PATH="$PATH" \
    ${optionalString cfg.ephemeral "--ephemeral"} \
    ${
      optionalString (
        cfg.additionalCapabilities != null && cfg.additionalCapabilities != [ ]
      ) ''--capability="${concatStringsSep "," cfg.additionalCapabilities}"''
    } \
    ${
      optionalString (
        cfg.tmpfs != null && cfg.tmpfs != [ ]
      ) ''--tmpfs=${concatStringsSep " --tmpfs=" cfg.tmpfs}''
    } \
    $EXTRA_NSPAWN_FLAGS \
    ${containerInit cfg} "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init"
'';
224
# Shell snippet run before the container starts: drops any stale machined
# registration and leftover host-side interfaces. Only the names of
# `cfg.extraVeths` are used.
preStartScript =
  cfg:
  let
    # One best-effort removal command per extra veth interface.
    dropExtraVeths = map (name: "ip link del dev ${name} 2> /dev/null || true ") (
      attrNames cfg.extraVeths
    );
  in
  ''
    # Clean up existing machined registration and interfaces.
    machinectl terminate "$INSTANCE" 2> /dev/null || true

    if [ -n "$HOST_ADDRESS" ] || [ -n "$LOCAL_ADDRESS" ] ||
       [ -n "$HOST_ADDRESS6" ] || [ -n "$LOCAL_ADDRESS6" ]; then
      ip link del dev "ve-$INSTANCE" 2> /dev/null || true
      ip link del dev "vb-$INSTANCE" 2> /dev/null || true
    fi

    ${concatStringsSep "\n" dropExtraVeths}
  '';
239
# Shell snippet run after the container has started: configures the host
# side of the primary veth pair (unless a bridge is used) and of every
# extra veth declared in `cfg.extraVeths`.
postStartScript =
  cfg:
  let
    # Emits an `<ipcmd> add ... dev "$ifaceHost"` command. When the option
    # `attribute` is set in `cfg`, its value is written into the script;
    # otherwise the command is guarded at run time on the shell expression
    # `variable` being non-empty.
    ipcall =
      ipcmd: variable: attribute:
      let
        fixed = cfg.${attribute};
      in
      if fixed != null then
        ''${ipcmd} add ${fixed} dev "$ifaceHost"''
      else
        ''
          if [ -n "${variable}" ]; then
            ${ipcmd} add "${variable}" dev "$ifaceHost"
          fi
        '';

    # Host-side setup for one extra veth: either enslave it to the configured
    # bridge, or bring it up and assign the addresses/routes that are set.
    hostSideVeth =
      name: veth:
      if veth.hostBridge == null then
        ''
          echo "Bring ${name} up"
          ip link set dev "${name}" up
          # Set IPs and routes for ${name}
          ${optionalString (veth.hostAddress != null) ''
            ip addr add ${veth.hostAddress} dev "${name}"
          ''}
          ${optionalString (veth.hostAddress6 != null) ''
            ip -6 addr add ${veth.hostAddress6} dev "${name}"
          ''}
          ${optionalString (veth.localAddress != null) ''
            ip route add ${veth.localAddress} dev "${name}"
          ''}
          ${optionalString (veth.localAddress6 != null) ''
            ip -6 route add ${veth.localAddress6} dev "${name}"
          ''}
        ''
      else
        ''
          # Add ${name} to bridge ${veth.hostBridge}
          ip link set dev "${name}" master "${veth.hostBridge}" up
        '';
  in
  ''
    if [ -n "$HOST_ADDRESS" ] || [ -n "$LOCAL_ADDRESS" ] ||
       [ -n "$HOST_ADDRESS6" ] || [ -n "$LOCAL_ADDRESS6" ]; then
      if [ -z "$HOST_BRIDGE" ]; then
        ifaceHost=ve-$INSTANCE
        ip link set dev "$ifaceHost" up

        ${ipcall "ip addr" "$HOST_ADDRESS" "hostAddress"}
        ${ipcall "ip -6 addr" "$HOST_ADDRESS6" "hostAddress6"}
        ${ipcall "ip route" "$LOCAL_ADDRESS" "localAddress"}
        ${ipcall "ip -6 route" "$LOCAL_ADDRESS6" "localAddress6"}
      fi
    fi
    ${concatStringsSep "\n" (mapAttrsToList hostSideVeth cfg.extraVeths)}
  '';
295
# systemd service settings for a container unit. `cfg` supplies
# `ephemeral`, `timeoutStartSec` and `allowedDevices`.
serviceDirectives =
  cfg:
  let
    # Runs `switch-to-configuration test` inside the running container.
    reloadScript = pkgs.writeScript "reload-container" ''
      #! ${pkgs.runtimeShell} -e
      ${nixos-container}/bin/nixos-container run "$INSTANCE" -- \
        bash --login -c "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/bin/switch-to-configuration test"
    '';
  in
  {
    Type = "notify";
    ExecReload = reloadScript;
    SyslogIdentifier = "container %i";

    # The leading "-" makes a missing per-container file non-fatal.
    EnvironmentFile = "-${configurationDirectory}/%i.conf";
    RuntimeDirectory = lib.optional cfg.ephemeral "${configurationDirectoryName}/%i";

    Slice = "machine.slice";
    Delegate = true;

    # Some containers take long to start
    # especially when you automatically start many at once
    TimeoutStartSec = cfg.timeoutStartSec;

    # Note that on reboot, systemd-nspawn returns 133, so this
    # unit will be restarted. On poweroff, it returns 0, so the
    # unit won't be restarted.
    Restart = "on-failure";
    RestartForceExitStatus = "133";
    SuccessExitStatus = "133";

    # We rely on systemd-nspawn turning a SIGTERM to itself into a shutdown
    # signal (SIGRTMIN+3) for the inner container.
    KillMode = "mixed";
    KillSignal = "TERM";

    DevicePolicy = "closed";
    DeviceAllow = map ({ node, modifier, ... }: "${node} ${modifier}") cfg.allowedDevices;
  };
334
# Version string of the kernel configured for the host; compared against
# "5.8" by the container-name length assertion.
kernelVersion = config.boot.kernelPackages.kernel.version;
336
# Submodule describing a single bind mount. The attribute name under which
# the mount is declared is used as the default mount point.
bindMountOpts =
  { name, ... }:
  {
    config.mountPoint = lib.mkDefault name;

    options = {
      hostPath = lib.mkOption {
        type = lib.types.nullOr lib.types.str;
        default = null;
        example = "/home/alice";
        description = "Location of the host path to be mounted.";
      };

      mountPoint = lib.mkOption {
        type = lib.types.str;
        example = "/mnt/usb";
        description = "Mount point on the container file system.";
      };

      isReadOnly = lib.mkOption {
        type = lib.types.bool;
        default = true;
        description = "Determine whether the mounted path will be accessed in read-only mode.";
      };
    };
  };
365
# Submodule describing one device node the container may access, together
# with the access modifier.
allowedDeviceOpts =
  { ... }:
  {
    options.node = lib.mkOption {
      type = lib.types.str;
      example = "/dev/net/tun";
      description = "Path to device node";
    };

    options.modifier = lib.mkOption {
      type = lib.types.str;
      example = "rw";
      description = ''
        Device node access modifier. Takes a combination
        `r` (read), `w` (write), and
        `m` (mknod). See the
        {manpage}`systemd.resource-control(5)` man page for more
        information.'';
    };
  };
387
# Renders one bind mount `d` as a systemd-nspawn flag with a leading space:
# `--bind-ro=` for read-only mounts, `--bind=` otherwise. The host path is
# prepended to the mount point only when one is given.
mkBindFlag =
  d:
  let
    flag = if d.isReadOnly then "--bind-ro" else "--bind";
    target = if d.hostPath == null then d.mountPoint else "${d.hostPath}:${d.mountPoint}";
  in
  " ${flag}=${target}";
395
# Concatenates the flags for every bind mount in the attribute set `bs`.
mkBindFlags = bs: concatStrings (map mkBindFlag (attrValues bs));
397
# Network-related options. They are declared both on each container and on
# each of its extra veth pairs.
networkOptions = {
  hostBridge = lib.mkOption {
    description = ''
      Put the host-side of the veth-pair into the named bridge.
      Only one of hostAddress* or hostBridge can be given.
    '';
    type = lib.types.nullOr lib.types.str;
    default = null;
    example = "br0";
  };

  forwardPorts = lib.mkOption {
    description = ''
      List of forwarded ports from host to container. Each forwarded port
      is specified by protocol, hostPort and containerPort. By default,
      protocol is tcp and hostPort and containerPort are assumed to be
      the same if containerPort is not explicitly given.
    '';
    type = lib.types.listOf (
      lib.types.submodule {
        options = {
          protocol = lib.mkOption {
            description = "The protocol specifier for port forwarding between host and container";
            type = lib.types.str;
            default = "tcp";
          };
          hostPort = lib.mkOption {
            description = "Source port of the external interface on host";
            type = lib.types.port;
          };
          containerPort = lib.mkOption {
            description = "Target port of container";
            type = lib.types.nullOr lib.types.port;
            default = null;
          };
        };
      }
    );
    default = [ ];
    example = [
      {
        protocol = "tcp";
        hostPort = 8080;
        containerPort = 80;
      }
    ];
  };

  hostAddress = lib.mkOption {
    description = ''
      The IPv4 address assigned to the host interface.
      (Not used when hostBridge is set.)
    '';
    type = lib.types.nullOr lib.types.str;
    default = null;
    example = "10.231.136.1";
  };

  hostAddress6 = lib.mkOption {
    description = ''
      The IPv6 address assigned to the host interface.
      (Not used when hostBridge is set.)
    '';
    type = lib.types.nullOr lib.types.str;
    default = null;
    example = "fc00::1";
  };

  localAddress = lib.mkOption {
    description = ''
      The IPv4 address assigned to the interface in the container.
      If a hostBridge is used, this should be given with netmask to access
      the whole network. Otherwise the default netmask is /32 and routing is
      set up from localAddress to hostAddress and back.
    '';
    type = lib.types.nullOr lib.types.str;
    default = null;
    example = "10.231.136.2";
  };

  localAddress6 = lib.mkOption {
    description = ''
      The IPv6 address assigned to the interface in the container.
      If a hostBridge is used, this should be given with netmask to access
      the whole network. Otherwise the default netmask is /128 and routing is
      set up from localAddress6 to hostAddress6 and back.
    '';
    type = lib.types.nullOr lib.types.str;
    default = null;
    example = "fc00::2";
  };
};
491
# Stand-in container configuration carrying the attributes read by the
# script and service generators; used where no declarative container
# definition is available (the generic `container@` unit).
dummyConfig = {
  # lifecycle
  ephemeral = false;
  timeoutStartSec = "1min";

  # privileges and devices
  additionalCapabilities = [ ];
  allowedDevices = [ ];
  tmpfs = null;

  # networking
  extraVeths = { };
  hostAddress = null;
  hostAddress6 = null;
  localAddress = null;
  localAddress6 = null;
};
504
505in
506
507{
508 options = {
509
boot.isContainer = lib.mkOption {
  description = ''
    Whether this NixOS machine is a lightweight container running
    in another NixOS system.
  '';
  type = lib.types.bool;
  default = false;
};

# Container support is switched on automatically as soon as at least one
# declarative container is defined.
boot.enableContainers = lib.mkOption {
  description = ''
    Whether to enable support for NixOS containers.
  '';
  type = lib.types.bool;
  default = config.containers != { };
  defaultText = lib.literalExpression "config.containers != { }";
};
527
528 containers = mkOption {
529 type = types.attrsOf (
530 types.submodule (
531 {
532 config,
533 options,
534 name,
535 ...
536 }:
537 {
538 options = {
# The container's NixOS configuration. The option type is custom: merging
# its definitions evaluates a complete NixOS system and yields that
# system's `config`.
config = mkOption {
  description = ''
    A specification of the desired configuration of this
    container, as a NixOS module.
  '';
  type = lib.mkOptionType {
    name = "Toplevel NixOS config";
    # Evaluates all definitions of this option, plus `extraConfig`, with the
    # eval-config.nix of the nixpkgs tree selected by the container's
    # `nixpkgs` option.
    merge =
      loc: defs:
      (import "${toString config.nixpkgs}/nixos/lib/eval-config.nix" {
        modules =
          let
            # Module prepended to the user's definitions: inherits the host
            # platform, marks the system as a container and adds
            # container-specific assertions.
            extraConfig =
              { options, ... }:
              {
                _file = "module at ${__curPos.file}:${toString __curPos.line}";
                config = {
                  # Use `hostPlatform` when the evaluated nixpkgs module
                  # declares it, and `localSystem` otherwise.
                  nixpkgs =
                    if options.nixpkgs ? hostPlatform then
                      { inherit (host.pkgs.stdenv) hostPlatform; }
                    else
                      { localSystem = host.pkgs.stdenv.hostPlatform; };
                  boot.isContainer = true;
                  networking.hostName = mkDefault name;
                  networking.useDHCP = false;
                  assertions = [
                    {
                      # Only kernels strictly older than 5.8 are subject to
                      # the name length limit; 5.8 itself is accepted, as the
                      # message below states.
                      assertion =
                        (builtins.compareVersions kernelVersion "5.8" < 0)
                        -> config.privateNetwork
                        -> stringLength name <= 11;
                      message = ''
                        Container name `${name}` is too long: When `privateNetwork` is enabled, container names can
                        not be longer than 11 characters, because the container's interface name is derived from it.
                        You should either make the container name shorter or upgrade to a more recent kernel that
                        supports interface altnames (i.e. at least Linux 5.8 - please see https://github.com/NixOS/nixpkgs/issues/38509
                        for details).
                      '';
                    }
                    {
                      assertion = !lib.strings.hasInfix "_" name;
                      message = ''
                        Names containing underscores are not allowed in nixos-containers. Please rename the container '${name}'
                      '';
                    }
                  ];
                };
              };
          in
          [ extraConfig ] ++ (map (x: x.value) defs);
        prefix = [
          "containers"
          name
        ];
        inherit (config) specialArgs;

        # The system is inherited from the host above.
        # Set it to null, to remove the "legacy" entrypoint's non-hermetic default.
        system = null;
      }).config;
  };
};
601
path = lib.mkOption {
  description = ''
    As an alternative to specifying
    {option}`config`, you can specify the path to
    the evaluated NixOS system configuration, typically a
    symlink to a system profile.
  '';
  type = lib.types.path;
  example = "/nix/var/nix/profiles/per-container/webserver";
};

additionalCapabilities = lib.mkOption {
  description = ''
    Grant additional capabilities to the container. See the
    {manpage}`capabilities(7)` and {manpage}`systemd-nspawn(1)` man pages for more
    information.
  '';
  type = lib.types.listOf lib.types.str;
  default = [ ];
  example = [
    "CAP_NET_ADMIN"
    "CAP_MKNOD"
  ];
};

nixpkgs = lib.mkOption {
  description = ''
    A path to the nixpkgs that provide the modules, pkgs and lib for evaluating the container.

    To only change the `pkgs` argument used inside the container modules,
    set the `nixpkgs.*` options in the container {option}`config`.
    Setting `config.nixpkgs.pkgs = pkgs` speeds up the container evaluation
    by reusing the system pkgs, but the `nixpkgs.config` option in the
    container config is ignored in this case.
  '';
  type = lib.types.path;
  default = pkgs.path;
  defaultText = lib.literalExpression "pkgs.path";
};

specialArgs = lib.mkOption {
  description = ''
    A set of special arguments to be passed to NixOS modules.
    This will be merged into the `specialArgs` used to evaluate
    the NixOS configurations.
  '';
  type = lib.types.attrsOf lib.types.unspecified;
  default = { };
};
651
ephemeral = lib.mkOption {
  description = ''
    Runs container in ephemeral mode with the empty root filesystem at boot.
    This way container will be bootstrapped from scratch on each boot
    and will be cleaned up on shutdown leaving no traces behind.
    Useful for completely stateless, reproducible containers.

    Note that this option might require to do some adjustments to the container configuration,
    e.g. you might want to set
    {var}`systemd.network.networks.$interface.dhcpV4Config.ClientIdentifier` to "mac"
    if you use {var}`macvlans` option.
    This way dhcp client identifier will be stable between the container restarts.

    Note that the container journal will not be linked to the host if this option is enabled.
  '';
  type = lib.types.bool;
  default = false;
};

enableTun = lib.mkOption {
  description = ''
    Allows the container to create and setup tunnel interfaces
    by granting the `NET_ADMIN` capability and
    enabling access to `/dev/net/tun`.
  '';
  type = lib.types.bool;
  default = false;
};

privateNetwork = lib.mkOption {
  description = ''
    Whether to give the container its own private virtual
    Ethernet interface. The interface is called
    `eth0`, and is hooked up to the interface
    `ve-«container-name»`
    on the host. If this option is not set, then the
    container shares the network interfaces of the host,
    and can bind to any port on any interface.
  '';
  type = lib.types.bool;
  default = false;
};

networkNamespace = lib.mkOption {
  description = ''
    Takes the path to a file representing a kernel network namespace that the container
    shall run in. The specified path should refer to a (possibly bind-mounted) network
    namespace file, as exposed by the kernel below /proc/<PID>/ns/net. This makes the
    container enter the given network namespace. One of the typical use cases is to give
    a network namespace under /run/netns created by {manpage}`ip-netns(8)`.
    Note that this option cannot be used together with other network-related options,
    such as --private-network or --network-interface=.
  '';
  type = lib.types.nullOr lib.types.path;
  default = null;
};

# Either a numeric base for the container's UID/GID range or one of the
# named modes.
privateUsers = lib.mkOption {
  description = ''
    Whether to give the container its own private UIDs/GIDs space (user namespacing).
    Disabled by default (`no`).

    If set to a number (usually above host's UID/GID range: 65536),
    user namespacing is enabled and the container UID/GIDs will start at that number.

    If set to `identity`, mostly equivalent to `0`, this will only provide
    process capability isolation (no UID/GID isolation, as they are the same as host).

    If set to `pick`, user namespacing is enabled and the UID/GID range is automatically chosen,
    so that no overlapping UID/GID ranges are assigned to multiple containers.
    This is the recommanded option as it enhances container security massively and operates fully automatically in most cases.

    See <https://www.freedesktop.org/software/systemd/man/latest/systemd-nspawn.html#--private-users=> for details.
  '';
  type = lib.types.either lib.types.ints.u32 (
    lib.types.enum [
      "no"
      "identity"
      "pick"
    ]
  );
  default = "no";
};
735
interfaces = lib.mkOption {
  description = ''
    The list of interfaces to be moved into the container.
  '';
  type = lib.types.listOf lib.types.str;
  default = [ ];
  example = [
    "eth1"
    "eth2"
  ];
};

macvlans = lib.mkOption {
  description = ''
    The list of host interfaces from which macvlans will be
    created. For each interface specified, a macvlan interface
    will be created and moved to the container.
  '';
  type = lib.types.listOf lib.types.str;
  default = [ ];
  example = [
    "eth1"
    "eth2"
  ];
};

# Every extra veth pair takes the same network options as the container.
extraVeths = lib.mkOption {
  description = ''
    Extra veth-pairs to be created for the container.
  '';
  type = lib.types.attrsOf (
    lib.types.submodule {
      options = networkOptions;
    }
  );
  default = { };
};

autoStart = lib.mkOption {
  description = ''
    Whether the container is automatically started at boot-time.
  '';
  type = lib.types.bool;
  default = false;
};

restartIfChanged = lib.mkOption {
  description = ''
    Whether the container should be restarted during a NixOS
    configuration switch if its definition has changed.
  '';
  type = lib.types.bool;
  default = true;
};

timeoutStartSec = lib.mkOption {
  description = ''
    Time for the container to start. In case of a timeout,
    the container processes get killed.
    See {manpage}`systemd.time(7)`
    for more information about the format.
  '';
  type = lib.types.str;
  default = "1min";
};

bindMounts = lib.mkOption {
  description = ''
    An extra list of directories that is bound to the container.
  '';
  type = lib.types.attrsOf (lib.types.submodule bindMountOpts);
  default = { };
  example = lib.literalExpression ''
    { "/home" = { hostPath = "/home/alice";
    isReadOnly = false; };
    }
  '';
};

allowedDevices = lib.mkOption {
  description = ''
    A list of device nodes to which the containers has access to.
  '';
  type = lib.types.listOf (lib.types.submodule allowedDeviceOpts);
  default = [ ];
  example = [
    {
      node = "/dev/net/tun";
      modifier = "rwm";
    }
  ];
};

tmpfs = lib.mkOption {
  description = ''
    Mounts a set of tmpfs file systems into the container.
    Multiple paths can be specified.
    Valid items must conform to the --tmpfs argument
    of systemd-nspawn. See {manpage}`systemd-nspawn(1)` for details.
  '';
  type = lib.types.listOf lib.types.str;
  default = [ ];
  example = [ "/var" ];
};

extraFlags = lib.mkOption {
  description = ''
    Extra flags passed to the systemd-nspawn command.
    See {manpage}`systemd-nspawn(1)` for details.
  '';
  type = lib.types.listOf lib.types.str;
  default = [ ];
  example = [ "--drop-capability=CAP_SYS_CHROOT" ];
};

flake = lib.mkOption {
  description = ''
    The Flake URI of the NixOS configuration to use for the container.
    Replaces the option {option}`containers.<name>.path`.
  '';
  type = lib.types.nullOr lib.types.str;
  default = null;
  example = "github:NixOS/nixpkgs/master";
};

# Removed option. See `checkAssertion` below for the accompanying error message.
pkgs = lib.mkOption { visible = false; };
864 }
865 // networkOptions;
866
# Derives `path` from whichever of `config` / `flake` is set, after
# rejecting unsupported option combinations.
config =
  let
    # Throw an error when removed option `pkgs` is used.
    # Because this is a submodule we cannot use `mkRemovedOptionModule` or option `assertions`.
    optionPath = "containers.${name}.pkgs";
    files = showFiles options.pkgs.files;
    # Evaluates to null when the definition is acceptable and throws
    # otherwise; it is forced through `builtins.seq` below.
    checkAssertion =
      if options.pkgs.isDefined then
        throw ''
          The option definition `${optionPath}' in ${files} no longer has any effect; please remove it.

          Alternatively, you can use the following options:
          - containers.${name}.nixpkgs
          This sets the nixpkgs (and thereby the modules, pkgs and lib) that
          are used for evaluating the container.

          - containers.${name}.config.nixpkgs.pkgs
          This only sets the `pkgs` argument used inside the container modules.
        ''
      else if options.config.isDefined && (options.flake.value != null) then
        # The conflict tested here is between `config` and `flake` (both
        # would define `path` below), so the message names those two.
        throw ''
          The options 'containers.${name}.config' and 'containers.${name}.flake' cannot both be set.
        ''
      else
        null;
  in
  {
    path = builtins.seq checkAssertion (mkMerge [
      (mkIf options.config.isDefined config.config.system.build.toplevel)
      (mkIf (config.flake != null) "/nix/var/nix/profiles/per-container/${name}")
    ]);
  };
899 }
900 )
901 );
902
description = ''
  A set of NixOS system configurations to be run as lightweight
  containers. Each container appears as a service
  `container-«name»`
  on the host system, allowing it to be started and stopped via
  {command}`systemctl`.
'';
default = { };
# Note: `lib.trivial.release` is interpolated when this module is evaluated,
# so the rendered example shows the current release.
example = lib.literalExpression ''
  { webserver =
  { path = "/nix/var/nix/profiles/webserver";
  };
  database =
  { config =
  { config, pkgs, ... }:
  { services.postgresql.enable = true;
  services.postgresql.package = pkgs.postgresql_14;

  system.stateVersion = "${lib.trivial.release}";
  };
  };
  }
'';
926 };
927
928 };
929
930 config = mkMerge [
{
  # Warn when containers are declared while container support is disabled.
  warnings =
    optional (!config.boot.enableContainers && config.containers != { })
      "containers.<name> is used, but boot.enableContainers is false. To use containers.<name>, set boot.enableContainers to true.";

  # One assertion per container. Every entry must be an
  # `{ assertion, message }` attribute set; a bare string here would make a
  # violation fail with a type error instead of printing the message.
  assertions = mapAttrsToList (name: cfg: {
    # A network namespace cannot be combined with a private network or with
    # interfaces moved into the container.
    assertion = cfg.networkNamespace == null || !(cfg.privateNetwork || cfg.interfaces != [ ]);
    message = "containers.${name}.networkNamespace is mutually exclusive to containers.${name}.privateNetwork and containers.${name}.interfaces.";
  }) config.containers;
}
945
946 (mkIf (config.boot.enableContainers) (
947 let
# Base unit definition, generated from `dummyConfig`. It is used as-is for
# the `container@` template and as the starting point for every
# declarative container.
unit = {
  description = "Container '%i'";
  restartIfChanged = false;

  unitConfig.RequiresMountsFor = "${stateDirectory}/%i";

  environment.INSTANCE = "%i";
  environment.root = "${stateDirectory}/%i";

  path = [
    pkgs.iproute2
    config.nix.package
  ];

  preStart = preStartScript dummyConfig;
  script = startScript dummyConfig;
  postStart = postStartScript dummyConfig;

  serviceConfig = serviceDirectives dummyConfig;
};
973 in
974 {
warnings =
  let
    # State versions before 22.05 use the un-prefixed "containers" directories.
    legacyLayout = lib.versionOlder config.system.stateVersion "22.05";
  in
  lib.optional (config.virtualisation.containers.enable && legacyLayout) ''
    Enabling both boot.enableContainers & virtualisation.containers on system.stateVersion < 22.05 is unsupported.
  '';

systemd.targets.multi-user.wants = [ "machines.target" ];
984
# The `container@` template plus one `container@<name>` service for every
# declarative container.
systemd.services =
  let
    # Device entry granting access to /dev/net/tun.
    tunDevice = {
      node = "/dev/net/tun";
      modifier = "rwm";
    };

    # systemd device unit names for a list of host interface names.
    interfaceUnits = map (i: "sys-subsystem-net-devices-${i}.device");

    # The generic container template used by imperative containers
    templateService = nameValuePair "container@" unit;

    # Service definition for the declarative container `name`.
    declarativeService =
      name: cfg:
      let
        userNamespaced =
          cfg.privateUsers == "pick" || (builtins.isInt cfg.privateUsers && cfg.privateUsers > 0);

        # /dev/net/tun is allowed when requested explicitly, and also for
        # private-network containers running in a user namespace.
        needsTun = cfg.enableTun || (cfg.privateNetwork && userNamespaced);

        containerConfig =
          cfg
          // optionalAttrs needsTun {
            allowedDevices = cfg.allowedDevices ++ [ tunDevice ];
          }
          // optionalAttrs cfg.enableTun {
            additionalCapabilities = cfg.additionalCapabilities ++ [ "CAP_NET_ADMIN" ];
          };

        networkDeps = [ "network.target" ] ++ interfaceUnits cfg.interfaces;
      in
      recursiveUpdate unit {
        preStart = preStartScript containerConfig;
        script = startScript containerConfig;
        postStart = postStartScript containerConfig;
        serviceConfig = serviceDirectives containerConfig;
        unitConfig.RequiresMountsFor =
          lib.optional (!containerConfig.ephemeral) "${stateDirectory}/%i"
          ++ builtins.map (d: if d.hostPath != null then d.hostPath else d.mountPoint) (
            builtins.attrValues cfg.bindMounts
          );
        environment.root =
          if containerConfig.ephemeral then "/run/nixos-containers/%i" else "${stateDirectory}/%i";
      }
      // optionalAttrs containerConfig.autoStart {
        wantedBy = [ "machines.target" ];
        wants = networkDeps;
        after = networkDeps;
        restartTriggers = [
          containerConfig.path
          config.environment.etc."${configurationDirectoryName}/${name}.conf".source
        ];
        restartIfChanged = containerConfig.restartIfChanged;
      };

    # declarative containers
    declarativeServices = mapAttrsToList (
      name: cfg: nameValuePair "container@${name}" (declarativeService name cfg)
    ) config.containers;
  in
  listToAttrs (filter (x: x.value != null) ([ templateService ] ++ declarativeServices));
1053
# Generate a configuration file named
# /etc/<configurationDirectoryName>/<name>.conf for each declarative
# container. The container units load it through their `EnvironmentFile`
# setting, which is how the start scripts obtain the container
# configuration as environment variables.
environment.etc =
  let
    # Renders one forwarded port as "protocol:hostPort:containerPort";
    # the container port falls back to the host port when it is unset.
    mkPortStr =
      p:
      p.protocol
      + ":"
      + (toString p.hostPort)
      + ":"
      + (if p.containerPort == null then toString p.hostPort else toString p.containerPort);
  in
  mapAttrs' (
    name: cfg:
    nameValuePair "${configurationDirectoryName}/${name}.conf" {
      # SYSTEM_PATH and FLAKE are mutually exclusive; the network address
      # variables are only written for containers with a private network.
      text = ''
        ${optionalString (cfg.flake == null) ''
          SYSTEM_PATH=${cfg.path}
        ''}
        ${optionalString (cfg.flake != null) ''
          FLAKE=${cfg.flake}
        ''}
        ${optionalString cfg.privateNetwork ''
          PRIVATE_NETWORK=1
          ${optionalString (cfg.hostBridge != null) ''
            HOST_BRIDGE=${cfg.hostBridge}
          ''}
          ${optionalString (length cfg.forwardPorts > 0) ''
            HOST_PORT=${concatStringsSep "," (map mkPortStr cfg.forwardPorts)}
          ''}
          ${optionalString (cfg.hostAddress != null) ''
            HOST_ADDRESS=${cfg.hostAddress}
          ''}
          ${optionalString (cfg.hostAddress6 != null) ''
            HOST_ADDRESS6=${cfg.hostAddress6}
          ''}
          ${optionalString (cfg.localAddress != null) ''
            LOCAL_ADDRESS=${cfg.localAddress}
          ''}
          ${optionalString (cfg.localAddress6 != null) ''
            LOCAL_ADDRESS6=${cfg.localAddress6}
          ''}
        ''}
        ${optionalString (cfg.networkNamespace != null) ''
          NETWORK_NAMESPACE_PATH=${cfg.networkNamespace}
        ''}
        PRIVATE_USERS=${toString cfg.privateUsers}
        INTERFACES="${toString cfg.interfaces}"
        MACVLANS="${toString cfg.macvlans}"
        ${optionalString cfg.autoStart ''
          AUTO_START=1
        ''}
        EXTRA_NSPAWN_FLAGS="${
          mkBindFlags cfg.bindMounts
          + optionalString (cfg.extraFlags != [ ]) (" " + concatStringsSep " " cfg.extraFlags)
        }"
      '';
    }
  ) config.containers;
1114
# Generate /etc/hosts entries for the containers.
networking.extraHosts =
  let
    # "<address> <name>.containers" for a container with an IPv4 local
    # address (any "/prefix" suffix is dropped); empty otherwise.
    hostLine =
      name: cfg:
      lib.optionalString (cfg.localAddress != null) ''
        ${lib.head (lib.splitString "/" cfg.localAddress)} ${name}.containers
      '';
  in
  lib.concatStrings (lib.mapAttrsToList hostLine config.containers);

# Keep dhcpcd away from the host-side container interfaces.
networking.dhcpcd.denyInterfaces = [
  "ve-*"
  "vb-*"
];

services.udev.extraRules =
  if config.networking.networkmanager.enable then
    ''
      # Don't manage interfaces created by nixos-container.
      ENV{INTERFACE}=="v[eb]-*", ENV{NM_UNMANAGED}="1"
    ''
  else
    "";

environment.systemPackages = [ nixos-container ];

boot.kernelModules = [
  "bridge"
  "macvlan"
  "tap"
  "tun"
];
1145 }
1146 ))
1147 ];
1148
# NOTE(review): presumably disabled because option metadata in this module
# (e.g. the `containers` example) is computed from `lib`/`config` at
# evaluation time — confirm against the documentation build requirements.
meta.buildDocsInSandbox = false;
1150}