1{ config, pkgs, lib, utils, ... }:
2
3let
4 toplevelConfig = config;
5 inherit (lib) types;
6 inherit (utils.systemdUtils.lib) mkPathSafeName;
7in {
8 options.systemd.services = lib.mkOption {
9 type = types.attrsOf (types.submodule ({ name, config, ... }: {
10 options.confinement.enable = lib.mkOption {
11 type = types.bool;
12 default = false;
13 description = lib.mdDoc ''
14 If set, all the required runtime store paths for this service are
15 bind-mounted into a `tmpfs`-based
16 {manpage}`chroot(2)`.
17 '';
18 };
19
20 options.confinement.fullUnit = lib.mkOption {
21 type = types.bool;
22 default = false;
23 description = lib.mdDoc ''
24 Whether to include the full closure of the systemd unit file into the
25 chroot, instead of just the dependencies for the executables.
26
27 ::: {.warning}
28 While it may be tempting to just enable this option to
29 make things work quickly, please be aware that this might add paths
30 to the closure of the chroot that you didn't anticipate. It's better
31 to use {option}`confinement.packages` to **explicitly** add additional store paths to the
32 chroot.
33 :::
34 '';
35 };
36
37 options.confinement.packages = lib.mkOption {
38 type = types.listOf (types.either types.str types.package);
39 default = [];
40 description = let
41 mkScOption = optName: "{option}`serviceConfig.${optName}`";
42 in lib.mdDoc ''
43 Additional packages or strings with context to add to the closure of
44 the chroot. By default, this includes all the packages from the
45 ${lib.concatMapStringsSep ", " mkScOption [
46 "ExecReload" "ExecStartPost" "ExecStartPre" "ExecStop"
47 "ExecStopPost"
48 ]} and ${mkScOption "ExecStart"} options. If you want to have all the
49 dependencies of this systemd unit, you can use
50 {option}`confinement.fullUnit`.
51
52 ::: {.note}
53 The store paths listed in {option}`path` are
54 **not** included in the closure as
55 well as paths from other options except those listed
56 above.
57 :::
58 '';
59 };
60
61 options.confinement.binSh = lib.mkOption {
62 type = types.nullOr types.path;
63 default = toplevelConfig.environment.binsh;
64 defaultText = lib.literalExpression "config.environment.binsh";
65 example = lib.literalExpression ''"''${pkgs.dash}/bin/dash"'';
66 description = lib.mdDoc ''
67 The program to make available as {file}`/bin/sh` inside
68 the chroot. If this is set to `null`, no
69 {file}`/bin/sh` is provided at all.
70
71 This is useful for some applications, which for example use the
72 {manpage}`system(3)` library function to execute commands.
73 '';
74 };
75
76 options.confinement.mode = lib.mkOption {
77 type = types.enum [ "full-apivfs" "chroot-only" ];
78 default = "full-apivfs";
79 description = lib.mdDoc ''
80 The value `full-apivfs` (the default) sets up
81 private {file}`/dev`, {file}`/proc`,
82 {file}`/sys` and {file}`/tmp` file systems in a separate user
83 name space.
84
85 If this is set to `chroot-only`, only the file
86 system name space is set up along with the call to
87 {manpage}`chroot(2)`.
88
89 ::: {.note}
90 This doesn't cover network namespaces and is solely for
91 file system level isolation.
92 :::
93 '';
94 };
95
96 config = let
97 rootName = "${mkPathSafeName name}-chroot";
98 inherit (config.confinement) binSh fullUnit;
99 wantsAPIVFS = lib.mkDefault (config.confinement.mode == "full-apivfs");
100 in lib.mkIf config.confinement.enable {
101 serviceConfig = {
102 RootDirectory = "/var/empty";
103 TemporaryFileSystem = "/";
104 PrivateMounts = lib.mkDefault true;
105
106 # https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt
107 # to change some of these to default to true.
108 #
109 # If we run in chroot-only mode, having something like PrivateDevices
110 # set to true by default will mount /dev within the chroot, whereas
111 # with "chroot-only" it's expected that there are no /dev, /proc and
112 # /sys file systems available.
113 #
114 # However, if this suddenly becomes true, the attack surface will
115 # increase, so let's explicitly set these options to true/false
116 # depending on the mode.
117 MountAPIVFS = wantsAPIVFS;
118 PrivateDevices = wantsAPIVFS;
119 PrivateTmp = wantsAPIVFS;
120 PrivateUsers = wantsAPIVFS;
121 ProtectControlGroups = wantsAPIVFS;
122 ProtectKernelModules = wantsAPIVFS;
123 ProtectKernelTunables = wantsAPIVFS;
124 };
125 confinement.packages = let
126 execOpts = [
127 "ExecReload" "ExecStart" "ExecStartPost" "ExecStartPre" "ExecStop"
128 "ExecStopPost"
129 ];
130 execPkgs = lib.concatMap (opt: let
131 isSet = config.serviceConfig ? ${opt};
132 in lib.flatten (lib.optional isSet config.serviceConfig.${opt})) execOpts;
133 unitAttrs = toplevelConfig.systemd.units."${name}.service";
134 allPkgs = lib.singleton (builtins.toJSON unitAttrs);
135 unitPkgs = if fullUnit then allPkgs else execPkgs;
136 in unitPkgs ++ lib.optional (binSh != null) binSh;
137 };
138 }));
139 };
140
141 config.assertions = lib.concatLists (lib.mapAttrsToList (name: cfg: let
142 whatOpt = optName: "The 'serviceConfig' option '${optName}' for"
143 + " service '${name}' is enabled in conjunction with"
144 + " 'confinement.enable'";
145 in lib.optionals cfg.confinement.enable [
146 { assertion = !cfg.serviceConfig.RootDirectoryStartOnly or false;
147 message = "${whatOpt "RootDirectoryStartOnly"}, but right now systemd"
148 + " doesn't support restricting bind-mounts to 'ExecStart'."
149 + " Please either define a separate service or find a way to run"
150 + " commands other than ExecStart within the chroot.";
151 }
152 { assertion = !cfg.serviceConfig.DynamicUser or false;
153 message = "${whatOpt "DynamicUser"}. Please create a dedicated user via"
154 + " the 'users.users' option instead as this combination is"
155 + " currently not supported.";
156 }
157 { assertion = cfg.serviceConfig ? ProtectSystem -> cfg.serviceConfig.ProtectSystem == false;
158 message = "${whatOpt "ProtectSystem"}. ProtectSystem is not compatible"
159 + " with service confinement as it fails to remount /usr within"
160 + " our chroot. Please disable the option.";
161 }
162 ]) config.systemd.services);
163
164 config.systemd.packages = lib.concatLists (lib.mapAttrsToList (name: cfg: let
165 rootPaths = let
166 contents = lib.concatStringsSep "\n" cfg.confinement.packages;
167 in pkgs.writeText "${mkPathSafeName name}-string-contexts.txt" contents;
168
169 chrootPaths = pkgs.runCommand "${mkPathSafeName name}-chroot-paths" {
170 closureInfo = pkgs.closureInfo { inherit rootPaths; };
171 serviceName = "${name}.service";
172 excludedPath = rootPaths;
173 } ''
174 mkdir -p "$out/lib/systemd/system/$serviceName.d"
175 serviceFile="$out/lib/systemd/system/$serviceName.d/confinement.conf"
176
177 echo '[Service]' > "$serviceFile"
178
179 # /bin/sh is special here, because the option value could contain a
180 # symlink and we need to properly resolve it.
181 ${lib.optionalString (cfg.confinement.binSh != null) ''
182 binsh=${lib.escapeShellArg cfg.confinement.binSh}
183 realprog="$(readlink -e "$binsh")"
184 echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile"
185 ''}
186
187 while read storePath; do
188 if [ -L "$storePath" ]; then
189 # Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths,
190 # so let's just bind-mount the target to that location.
191 echo "BindReadOnlyPaths=$(readlink -e "$storePath"):$storePath"
192 elif [ "$storePath" != "$excludedPath" ]; then
193 echo "BindReadOnlyPaths=$storePath"
194 fi
195 done < "$closureInfo/store-paths" >> "$serviceFile"
196 '';
197 in lib.optional cfg.confinement.enable chrootPaths) config.systemd.services);
198}