1{ config, lib, pkgs, ... }:
2
3with lib;
4
5let
6
# Builds the script that is started as the container's init.  It sets up
# the container side of the network interfaces and then hands over to the
# regular NixOS stage-2 init script, which is passed as the first argument.
containerInit = cfg:
  let
    # Extra veth pairs configured for this container (none when unset).
    extraVeths = cfg.extraVeths or {};

    # Shell fragment for one extra veth interface: bring it up, then apply
    # whichever addresses and host routes are configured for it.
    renderExtraVeth = name: veth:
      ''
        echo "Bringing ${name} up"
        ip link set dev ${name} up
        ${optionalString ((veth.localAddress or null) != null) ''
          echo "Setting ip for ${name}"
          ip addr add ${veth.localAddress} dev ${name}
        ''}
        ${optionalString ((veth.localAddress6 or null) != null) ''
          echo "Setting ip6 for ${name}"
          ip -6 addr add ${veth.localAddress6} dev ${name}
        ''}
        ${optionalString ((veth.hostAddress or null) != null) ''
          echo "Setting route to host for ${name}"
          ip route add ${veth.hostAddress} dev ${name}
        ''}
        ${optionalString ((veth.hostAddress6 or null) != null) ''
          echo "Setting route6 to host for ${name}"
          ip -6 route add ${veth.hostAddress6} dev ${name}
        ''}
      '';

    # One fragment per extra veth pair, joined by newlines.
    extraVethSetup = concatStringsSep "\n" (mapAttrsToList renderExtraVeth extraVeths);
  in
  pkgs.writeScript "container-init"
    ''
      #! ${pkgs.stdenv.shell} -e

      # Initialise the container side of the veth pair.
      if [ "$PRIVATE_NETWORK" = 1 ]; then

        ip link set host0 name eth0
        ip link set dev eth0 up

        if [ -n "$LOCAL_ADDRESS" ]; then
          ip addr add $LOCAL_ADDRESS dev eth0
        fi
        if [ -n "$LOCAL_ADDRESS6" ]; then
          ip -6 addr add $LOCAL_ADDRESS6 dev eth0
        fi
        if [ -n "$HOST_ADDRESS" ]; then
          ip route add $HOST_ADDRESS dev eth0
          ip route add default via $HOST_ADDRESS
        fi
        if [ -n "$HOST_ADDRESS6" ]; then
          ip -6 route add $HOST_ADDRESS6 dev eth0
          ip -6 route add default via $HOST_ADDRESS6
        fi

        ${extraVethSetup}
        ip a
        ip r
      fi

      # Start the regular stage 1 script.
      exec "$1"
    '';
68
# systemd-nspawn flag requesting one additional veth pair; only the
# interface name is used, the per-veth settings are ignored here.
nspawnExtraVethArgs = vethName: _vethCfg: "--network-veth-extra=" + vethName;
# Builds the main service script: it prepares the container root
# directory, collects the networking flags and finally replaces itself
# with systemd-nspawn running the container init wrapper.
startScript = cfg:
  let
    # Shell line appending one --network-veth-extra flag per extra veth
    # pair, or a placeholder comment when the option is absent.
    extraVethFlags =
      if (cfg.extraVeths or null) != null then
        ''extraFlags+=" ${concatStringsSep " " (mapAttrsToList nspawnExtraVethArgs (cfg.extraVeths or {}))}"''
      else
        ''# No extra veth pairs to create'';

    # Only emitted on x86_64 hosts: run an i686 container system with the
    # x86 personality.
    personalityCheck = optionalString (config.nixpkgs.system == "x86_64-linux") ''
      if [ "$(< ''${SYSTEM_PATH:-/nix/var/nix/profiles/per-container/$INSTANCE/system}/system)" = i686-linux ]; then
        extraFlags+=" --personality=x86"
      fi
    '';

    # Wrapper script that is executed as the container's init.
    initScript = containerInit cfg;
  in
  ''
    mkdir -p -m 0755 "$root/etc" "$root/var/lib"
    mkdir -p -m 0700 "$root/var/lib/private" "$root/root" /run/containers
    if ! [ -e "$root/etc/os-release" ]; then
      touch "$root/etc/os-release"
    fi

    if ! [ -e "$root/etc/machine-id" ]; then
      touch "$root/etc/machine-id"
    fi

    mkdir -p -m 0755 \
      "/nix/var/nix/profiles/per-container/$INSTANCE" \
      "/nix/var/nix/gcroots/per-container/$INSTANCE"

    cp --remove-destination /etc/resolv.conf "$root/etc/resolv.conf"

    if [ "$PRIVATE_NETWORK" = 1 ]; then
      extraFlags+=" --network-veth"
      if [ -n "$HOST_BRIDGE" ]; then
        extraFlags+=" --network-bridge=$HOST_BRIDGE"
      fi
    fi

    ${extraVethFlags}

    for iface in $INTERFACES; do
      extraFlags+=" --network-interface=$iface"
    done

    for iface in $MACVLANS; do
      extraFlags+=" --network-macvlan=$iface"
    done

    # If the host is 64-bit and the container is 32-bit, add a
    # --personality flag.
    ${personalityCheck}

    # Run systemd-nspawn without startup notification (we'll
    # wait for the container systemd to signal readiness).
    EXIT_ON_REBOOT=1 \
    exec ${config.systemd.package}/bin/systemd-nspawn \
      --keep-unit \
      -M "$INSTANCE" -D "$root" $extraFlags \
      $EXTRA_NSPAWN_FLAGS \
      --notify-ready=yes \
      --bind-ro=/nix/store \
      --bind-ro=/nix/var/nix/db \
      --bind-ro=/nix/var/nix/daemon-socket \
      --bind="/nix/var/nix/profiles/per-container/$INSTANCE:/nix/var/nix/profiles" \
      --bind="/nix/var/nix/gcroots/per-container/$INSTANCE:/nix/var/nix/gcroots" \
      --setenv PRIVATE_NETWORK="$PRIVATE_NETWORK" \
      --setenv HOST_BRIDGE="$HOST_BRIDGE" \
      --setenv HOST_ADDRESS="$HOST_ADDRESS" \
      --setenv LOCAL_ADDRESS="$LOCAL_ADDRESS" \
      --setenv HOST_ADDRESS6="$HOST_ADDRESS6" \
      --setenv LOCAL_ADDRESS6="$LOCAL_ADDRESS6" \
      --setenv PATH="$PATH" \
      ${initScript} "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init"
  '';
140
# Builds the pre-start script, which removes whatever a previous run of
# the same container may have left behind: its machined registration,
# the ve-/vb- host interfaces and any extra veth interfaces.
preStartScript = cfg:
  let
    # Best-effort removal of one extra veth interface; failures are ignored.
    deleteVeth = vethName: _vethCfg:
      ''ip link del dev ${vethName} 2> /dev/null || true '';

    extraVethCleanup =
      concatStringsSep "\n" (mapAttrsToList deleteVeth (cfg.extraVeths or {}));
  in
  ''
    # Clean up existing machined registration and interfaces.
    machinectl terminate "$INSTANCE" 2> /dev/null || true

    if [ "$PRIVATE_NETWORK" = 1 ]; then
      ip link del dev "ve-$INSTANCE" 2> /dev/null || true
      ip link del dev "vb-$INSTANCE" 2> /dev/null || true
    fi

    ${extraVethCleanup}
  '';
# Builds the post-start script, which configures the host side of the
# container's network interfaces and records the container leader PID in
# /run/containers/$INSTANCE.pid for use by preStop.
postStartScript = cfg:
  let
    # Emits an "<ipcmd> add <address> dev $ifaceHost" command.
    #
    #   cfg:       container configuration ({} for the generic template)
    #   ipcmd:     command prefix, e.g. "ip addr" or "ip -6 route"
    #   variable:  shell variable reference used when the address is not
    #              known at evaluation time, e.g. "$HOST_ADDRESS"
    #   attribute: name of the attribute in cfg that holds the address
    #
    # The attribute must be selected dynamically with cfg.${attribute}:
    # a plain `cfg . attribute` looks up an attribute literally called
    # "attribute", which never exists, so the configured address would
    # never be used.
    ipcall = cfg: ipcmd: variable: attribute:
      if (cfg.${attribute} or null) == null then
        # Address unknown at evaluation time: decide at run time based
        # on the environment variable.
        ''
          if [ -n "${variable}" ]; then
            ${ipcmd} add ${variable} dev $ifaceHost
          fi
        ''
      else
        ''${ipcmd} add ${cfg.${attribute}} dev $ifaceHost'';

    # Host-side setup for one extra veth pair: either enslave it to the
    # configured bridge, or assign the host addresses and add routes to
    # the container addresses.
    renderExtraVeth = name: cfg:
      if (cfg.hostBridge or null) != null then
        ''
          # Add ${name} to bridge ${cfg.hostBridge}
          ip link set dev ${name} master ${cfg.hostBridge} up
        ''
      else
        ''
          # Set IPs and routes for ${name}
          ${optionalString ((cfg.hostAddress or null) != null) ''
            ip addr add ${cfg.hostAddress} dev ${name}
          ''}
          ${optionalString ((cfg.hostAddress6 or null) != null) ''
            ip -6 addr add ${cfg.hostAddress6} dev ${name}
          ''}
          ${optionalString ((cfg.localAddress or null) != null) ''
            ip route add ${cfg.localAddress} dev ${name}
          ''}
          ${optionalString ((cfg.localAddress6 or null) != null) ''
            ip -6 route add ${cfg.localAddress6} dev ${name}
          ''}
        '';
  in
  ''
    if [ "$PRIVATE_NETWORK" = 1 ]; then
      if [ -z "$HOST_BRIDGE" ]; then
        ifaceHost=ve-$INSTANCE
        ip link set dev $ifaceHost up

        ${ipcall cfg "ip addr" "$HOST_ADDRESS" "hostAddress"}
        ${ipcall cfg "ip -6 addr" "$HOST_ADDRESS6" "hostAddress6"}
        ${ipcall cfg "ip route" "$LOCAL_ADDRESS" "localAddress"}
        ${ipcall cfg "ip -6 route" "$LOCAL_ADDRESS6" "localAddress6"}
      fi
      ${concatStringsSep "\n" (mapAttrsToList renderExtraVeth (cfg.extraVeths or {}))}
    fi

    # Get the leader PID so that we can signal it in
    # preStop. We can't use machinectl there because D-Bus
    # might be shutting down. FIXME: in systemd 219 we can
    # just signal systemd-nspawn to do a clean shutdown.
    machinectl show "$INSTANCE" | sed 's/Leader=\(.*\)/\1/;t;d' > "/run/containers/$INSTANCE.pid"
  '';
215
# Platform of the host configuration; declarative container configurations
# are evaluated for this same system.
inherit (config.nixpkgs) system;
217
# Option set describing a single bind mount of a container.  The attribute
# name of the entry is used as the default mount point.
bindMountOpts = { name, config, ... }: {

  options = {
    hostPath = mkOption {
      type = types.nullOr types.str;
      default = null;
      example = "/home/alice";
      description = "Location of the host path to be mounted.";
    };

    isReadOnly = mkOption {
      type = types.bool;
      default = true;
      example = true;
      description = "Determine whether the mounted path will be accessed in read-only mode.";
    };

    mountPoint = mkOption {
      type = types.str;
      example = "/mnt/usb";
      description = "Mount point on the container file system.";
    };
  };

  # Fall back to the attribute name when no mount point is given.
  config.mountPoint = mkDefault name;

};
245
# Renders one bind mount as a systemd-nspawn flag with a leading space:
# " --bind-ro=..." for read-only mounts and " --bind=..." otherwise.  The
# value is "hostPath:mountPoint", or just the mount point when no host
# path is set.
mkBindFlag = d:
  let
    option = if d.isReadOnly then "--bind-ro" else "--bind";
    target =
      if d.hostPath == null
      then d.mountPoint
      else "${d.hostPath}:${d.mountPoint}";
  in
  " ${option}=${target}";
250
# Concatenates the systemd-nspawn flags of all bind mounts in the set.
mkBindFlags = bs: concatStrings (map mkBindFlag (attrValues bs));
252
# Network options shared by the container itself and by each of its extra
# veth pairs.  All of them are optional and default to null.  Every option
# uses types.str: the deprecated types.string merges multiple definitions
# by concatenation, which is never meaningful for an address or a bridge
# name.
networkOptions = {
  hostBridge = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "br0";
    description = ''
      Put the host-side of the veth-pair into the named bridge.
      Only one of hostAddress* or hostBridge can be given.
    '';
  };

  hostAddress = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "10.231.136.1";
    description = ''
      The IPv4 address assigned to the host interface.
      (Not used when hostBridge is set.)
    '';
  };

  hostAddress6 = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "fc00::1";
    description = ''
      The IPv6 address assigned to the host interface.
      (Not used when hostBridge is set.)
    '';
  };

  localAddress = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "10.231.136.2";
    description = ''
      The IPv4 address assigned to the interface in the container.
      If a hostBridge is used, this should be given with netmask to access
      the whole network. Otherwise the default netmask is /32 and routing is
      set up from localAddress to hostAddress and back.
    '';
  };

  localAddress6 = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "fc00::2";
    description = ''
      The IPv6 address assigned to the interface in the container.
      If a hostBridge is used, this should be given with netmask to access
      the whole network. Otherwise the default netmask is /128 and routing is
      set up from localAddress6 to hostAddress6 and back.
    '';
  };

};
309
310in
311
312{
313 options = {
314
# Flag telling a NixOS configuration that it runs inside a container.
boot.isContainer = mkOption {
  description = ''
    Whether this NixOS machine is a lightweight container running
    in another NixOS system.
  '';
  type = types.bool;
  default = false;
};
323
# Container support is on by default, except on systems that are
# themselves containers.
boot.enableContainers = mkOption {
  description = ''
    Whether to enable support for nixos containers.
  '';
  type = types.bool;
  default = !config.boot.isContainer;
};
331
# Declarative containers, keyed by container name.  Each entry is a
# submodule combining the container-specific options below with the
# shared network options.
containers = mkOption {
  type = types.attrsOf (types.submodule (
    { config, options, name, ... }:
    {
      options = {

        config = mkOption {
          description = ''
            A specification of the desired configuration of this
            container, as a NixOS module.
          '';
          type = lib.mkOptionType {
            name = "Toplevel NixOS config";
            # All definitions are evaluated together as one NixOS system,
            # with a few container-specific settings prepended.
            merge = loc: defs: (import ../../lib/eval-config.nix {
              inherit system;
              modules =
                let extraConfig =
                  { boot.isContainer = true;
                    networking.hostName = mkDefault name;
                    networking.useDHCP = false;
                  };
                in [ extraConfig ] ++ (map (x: x.value) defs);
              prefix = [ "containers" name ];
            }).config;
          };
        };

        path = mkOption {
          type = types.path;
          example = "/nix/var/nix/profiles/containers/webserver";
          description = ''
            As an alternative to specifying
            <option>config</option>, you can specify the path to
            the evaluated NixOS system configuration, typically a
            symlink to a system profile.
          '';
        };

        privateNetwork = mkOption {
          type = types.bool;
          default = false;
          description = ''
            Whether to give the container its own private virtual
            Ethernet interface.  The interface is called
            <literal>eth0</literal>, and is hooked up to the interface
            <literal>ve-<replaceable>container-name</replaceable></literal>
            on the host.  If this option is not set, then the
            container shares the network interfaces of the host,
            and can bind to any port on any interface.
          '';
        };

        interfaces = mkOption {
          # types.str rather than the deprecated types.string, which
          # merges definitions by concatenation.
          type = types.listOf types.str;
          default = [];
          example = [ "eth1" "eth2" ];
          description = ''
            The list of interfaces to be moved into the container.
          '';
        };

        extraVeths = mkOption {
          type = types.attrsOf types.optionSet;
          default = {};
          options = networkOptions;
          description = ''
            Extra veth-pairs to be created for the container
          '';
        };

        autoStart = mkOption {
          type = types.bool;
          default = false;
          description = ''
            Whether the container is automatically started at boot-time.
          '';
        };

        bindMounts = mkOption {
          type = types.loaOf types.optionSet;
          options = [ bindMountOpts ];
          default = {};
          example = { "/home" = { hostPath = "/home/alice";
                                  isReadOnly = false; };
                    };

          description =
            ''
              An extra list of directories that is bound to the container.
            '';
        };

      } // networkOptions;

      # When a configuration is given, the system path is derived from it.
      config = mkMerge
        [
          (mkIf options.config.isDefined {
            path = config.config.system.build.toplevel;
          })
        ];
    }));

  default = {};
  example = literalExample
    ''
      { webserver =
          { path = "/nix/var/nix/profiles/webserver";
          };
        database =
          { config =
              { config, pkgs, ... }:
              { services.postgresql.enable = true;
                services.postgresql.package = pkgs.postgresql92;
              };
          };
      }
    '';
  description = ''
    A set of NixOS system configurations to be run as lightweight
    containers.  Each container appears as a service
    <literal>container-<replaceable>name</replaceable></literal>
    on the host system, allowing it to be started and stopped via
    <command>systemctl</command>.
  '';
};
457
458 };
459
460
461 config = mkIf (config.boot.enableContainers) (let
462
# Service definition shared by all containers.  Used as-is for the
# "container@" template; declarative containers override the three
# scripts with versions rendered from their own configuration.
unit = {
  description = "Container '%i'";

  path = [ pkgs.iproute ];

  unitConfig = {
    RequiresMountsFor = [ "/var/lib/containers/%i" ];
  };

  environment = {
    INSTANCE = "%i";
    root = "/var/lib/containers/%i";
  };

  # The template knows no declarative settings, so its scripts are
  # rendered from an empty configuration.
  preStart = preStartScript {};
  script = startScript {};
  postStart = postStartScript {};

  # Signal the leader process recorded by postStart, then drop the
  # PID file.
  preStop =
    ''
      pid="$(cat /run/containers/$INSTANCE.pid)"
      if [ -n "$pid" ]; then
        kill -RTMIN+4 "$pid"
      fi
      rm -f "/run/containers/$INSTANCE.pid"
    '';

  restartIfChanged = false;

  serviceConfig = {
    Type = "notify";

    SyslogIdentifier = "container %i";

    # The leading "-" makes the per-container configuration file optional.
    EnvironmentFile = "-/etc/containers/%i.conf";

    ExecReload = pkgs.writeScript "reload-container"
      ''
        #! ${pkgs.stdenv.shell} -e
        ${pkgs.nixos-container}/bin/nixos-container run "$INSTANCE" -- \
          bash --login -c "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/bin/switch-to-configuration test"
      '';

    # systemd-nspawn exits with 133 when the container reboots and
    # with 0 when it powers off: 133 is treated as success and
    # forces a restart of the unit, a poweroff does not.
    RestartForceExitStatus = "133";
    SuccessExitStatus = "133";

    Restart = "on-failure";

    # Hack: systemd-nspawn must not be killed directly, because
    # preStop already signals the container leader to shut down.
    # systemd still needs to send some signal (so that remaining
    # processes get killed after the timeout), hence one that is
    # ignored.
    KillMode = "mixed";
    KillSignal = "WINCH";

    DevicePolicy = "closed";
  };
};
523 in {
systemd.services =
  let
    # The generic container template used by imperative containers.
    templateUnit = nameValuePair "container@" unit;

    # One instance per declarative container: the shared unit with its
    # scripts rendered from the container's configuration, plus boot-time
    # activation settings when autoStart is enabled.
    declarativeUnit = name: cfg:
      nameValuePair "container@${name}" (
        unit
        // {
          preStart = preStartScript cfg;
          script = startScript cfg;
          postStart = postStartScript cfg;
        }
        // optionalAttrs cfg.autoStart {
          wantedBy = [ "multi-user.target" ];
          wants = [ "network.target" ];
          after = [ "network.target" ];
          restartTriggers = [ cfg.path ];
          reloadIfChanged = true;
        }
      );

    allUnits = [ templateUnit ] ++ mapAttrsToList declarativeUnit config.containers;
  in
  listToAttrs (filter (entry: entry.value != null) allUnits);
545
# Write one /etc/containers/<name>.conf per declarative container.  The
# file holds shell-style variable assignments describing the container
# (system path, networking, interfaces, bind-mount flags); the container
# unit reads it through its EnvironmentFile setting.
environment.etc =
  let
    mkContainerConf = name: cfg:
      nameValuePair "containers/${name}.conf" {
        text =
          ''
            SYSTEM_PATH=${cfg.path}
            ${optionalString cfg.privateNetwork ''
              PRIVATE_NETWORK=1
              ${optionalString (cfg.hostBridge != null) ''
                HOST_BRIDGE=${cfg.hostBridge}
              ''}
              ${optionalString (cfg.hostAddress != null) ''
                HOST_ADDRESS=${cfg.hostAddress}
              ''}
              ${optionalString (cfg.hostAddress6 != null) ''
                HOST_ADDRESS6=${cfg.hostAddress6}
              ''}
              ${optionalString (cfg.localAddress != null) ''
                LOCAL_ADDRESS=${cfg.localAddress}
              ''}
              ${optionalString (cfg.localAddress6 != null) ''
                LOCAL_ADDRESS6=${cfg.localAddress6}
              ''}
            ''}
            INTERFACES="${toString cfg.interfaces}"
            ${optionalString cfg.autoStart ''
              AUTO_START=1
            ''}
            EXTRA_NSPAWN_FLAGS="${mkBindFlags cfg.bindMounts}"
          '';
      };
  in
  mapAttrs' mkContainerConf config.containers;
578
# Generate /etc/hosts entries for the containers.  localAddress may carry
# a netmask suffix (e.g. "10.231.136.2/24" when a host bridge is used),
# which is not valid in /etc/hosts, so only the part before the first "/"
# is written.
networking.extraHosts = concatStrings (mapAttrsToList (name: cfg:
  optionalString (cfg.localAddress != null)
    ''
      ${head (splitString "/" cfg.localAddress)} ${name}.containers
    '') config.containers);
584
# Keep dhcpcd away from the host side of container veth pairs: "ve-*" for
# plain veth links and "vb-*" for links attached to a host bridge (the
# pre-start script cleans up both prefixes as well).
networking.dhcpcd.denyInterfaces = [ "ve-*" "vb-*" ];
586
# Make the nixos-container command available on the host.
environment.systemPackages = with pkgs; [ nixos-container ];
588 });
589}