1{
2 config,
3 pkgs,
4 lib,
5 utils,
6 ...
7}:
8
9let
10 toplevelConfig = config;
11 inherit (lib) types;
12 inherit (utils.systemdUtils.lib) mkPathSafeName;
13in
14{
15 options.systemd.services = lib.mkOption {
16 type = types.attrsOf (
17 types.submodule (
18 { name, config, ... }:
19 {
# Per-service switch for confinement; defaults to off. The rest of this
# module only contributes settings for services where this is true.
options.confinement.enable = lib.mkOption {
  default = false;
  type = lib.types.bool;
  description = ''
    If set, all the required runtime store paths for this service are
    bind-mounted into a `tmpfs`-based
    {manpage}`chroot(2)`.
  '';
};
29
# Selects which closure ends up in the chroot: the whole rendered unit
# (true) or only what the Exec* commands reference (false, the default).
options.confinement.fullUnit = lib.mkOption {
  default = false;
  type = lib.types.bool;
  description = ''
    Whether to include the full closure of the systemd unit file into the
    chroot, instead of just the dependencies for the executables.

    ::: {.warning}
    While it may be tempting to just enable this option to
    make things work quickly, please be aware that this might add paths
    to the closure of the chroot that you didn't anticipate. It's better
    to use {option}`confinement.packages` to **explicitly** add additional store paths to the
    chroot.
    :::
  '';
};
46
# Extra roots (packages, or strings carrying store-path context) whose
# closure is made available inside the chroot.
options.confinement.packages = lib.mkOption {
  default = [ ];
  type = types.listOf (types.either types.str types.package);
  description =
    let
      # Render a `serviceConfig.<directive>` option reference for the manual.
      scRef = directive: "{option}`serviceConfig.${directive}`";

      # Directives enumerated first in the text; ExecStart is appended
      # separately after the word "and".
      leadingDirectives = [
        "ExecReload"
        "ExecStartPost"
        "ExecStartPre"
        "ExecStop"
        "ExecStopPost"
      ];
      leadingRefs = lib.concatStringsSep ", " (map scRef leadingDirectives);
    in
    ''
      Additional packages or strings with context to add to the closure of
      the chroot. By default, this includes all the packages from the
      ${leadingRefs} and ${scRef "ExecStart"} options. If you want to have all the
      dependencies of this systemd unit, you can use
      {option}`confinement.fullUnit`.

      ::: {.note}
      The store paths listed in {option}`path` are
      **not** included in the closure as
      well as paths from other options except those listed
      above.
      :::
    '';
};
77
# Program exposed as /bin/sh inside the chroot; `null` means none.
# The default follows the system-wide `environment.binsh` setting.
options.confinement.binSh = lib.mkOption {
  type = types.nullOr types.path;
  default = toplevelConfig.environment.binsh;
  defaultText = lib.literalExpression "config.environment.binsh";
  # Rendered in the manual verbatim, including the surrounding quotes.
  example = lib.literalExpression "\"\${pkgs.dash}/bin/dash\"";
  description = ''
    The program to make available as {file}`/bin/sh` inside
    the chroot. If this is set to `null`, no
    {file}`/bin/sh` is provided at all.

    This is useful for some applications, which for example use the
    {manpage}`system(3)` library function to execute commands.
  '';
};
92
# Confinement flavour. The service-level `config` of this submodule derives
# the defaults of several sandboxing directives from this value.
options.confinement.mode = lib.mkOption {
  default = "full-apivfs";
  type = types.enum [
    "full-apivfs"
    "chroot-only"
  ];
  description = ''
    The value `full-apivfs` (the default) sets up
    private {file}`/dev`, {file}`/proc`,
    {file}`/sys`, {file}`/tmp` and {file}`/var/tmp` file systems
    in a separate user name space.

    If this is set to `chroot-only`, only the file
    system name space is set up along with the call to
    {manpage}`chroot(2)`.

    In all cases, unless `serviceConfig.PrivateTmp=true` is set,
    both {file}`/tmp` and {file}`/var/tmp` paths are added to `InaccessiblePaths=`.
    This is to overcome options like `DynamicUser=true`
    implying `PrivateTmp=true` without letting it being turned off.
    Beware however that giving processes the `CAP_SYS_ADMIN` and `@mount` privileges
    can let them undo the effects of `InaccessiblePaths=`.

    ::: {.note}
    This doesn't cover network namespaces and is solely for
    file system level isolation.
    :::
  '';
};
122
# Settings contributed to a service once `confinement.enable` is set:
# the chroot location and sandboxing directives in `serviceConfig`, plus
# the default set of closure roots in `confinement.packages`.
config = lib.mkIf config.confinement.enable (
  let
    confinement = config.confinement;

    # Directory below /run that serves both as RuntimeDirectory and as
    # the root of the chroot.
    runtimeSubdir = "confinement/${mkPathSafeName name}";

    # true in "full-apivfs" mode, false in "chroot-only" mode; wrapped in
    # mkDefault so a service can still override individual directives.
    apivfsDefault = lib.mkDefault (confinement.mode == "full-apivfs");

    # https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt
    # to change some of these to default to true.
    #
    # In chroot-only mode, something like PrivateDevices defaulting to
    # true would mount /dev within the chroot, whereas that mode is
    # expected to have no /dev, /proc and /sys file systems available.
    #
    # Should those defaults ever flip, the attack surface would grow, so
    # every one of these is set explicitly according to the mode.
    modeDependentDirectives = [
      "MountAPIVFS"
      "PrivateDevices"
      "PrivateTmp"
      "PrivateUsers"
      "ProtectControlGroups"
      "ProtectKernelModules"
      "ProtectKernelTunables"
    ];

    # Directives whose command lines contribute closure roots.
    execDirectives = [
      "ExecReload"
      "ExecStart"
      "ExecStartPost"
      "ExecStartPre"
      "ExecStop"
      "ExecStopPost"
    ];

    # Values of one directive as a flat list; empty when it is unset.
    commandsOf =
      directive:
      if config.serviceConfig ? ${directive} then
        lib.flatten config.serviceConfig.${directive}
      else
        [ ];

    rootsFromExec = lib.concatMap commandsOf execDirectives;

    # The JSON rendering of the unit definition carries the string
    # context of everything the unit refers to.
    rootsFromUnit = [
      (builtins.toJSON toplevelConfig.systemd.units."${name}.service")
    ];

    shellRoot = if confinement.binSh == null then [ ] else [ confinement.binSh ];
  in
  {
    serviceConfig = {
      RootDirectory = "/run/${runtimeSubdir}";
      RuntimeDirectory = [ runtimeSubdir ];
      ReadOnlyPaths = [ "+/" ];
      InaccessiblePaths = [
        "-+/run/${runtimeSubdir}"
      ];
      PrivateMounts = lib.mkDefault true;
    }
    // lib.genAttrs modeDependentDirectives (_: apivfsDefault);

    confinement.packages =
      (if confinement.fullUnit then rootsFromUnit else rootsFromExec) ++ shellRoot;
  }
);
180 }
181 )
182 );
183 };
184
# One assertion per confined service: RootDirectoryStartOnly must not be
# enabled together with confinement.
config.assertions = lib.pipe config.systemd.services [
  # Only services with confinement switched on are checked.
  (lib.filterAttrs (_: svc: svc.confinement.enable))
  (lib.mapAttrsToList (
    name: svc:
    let
      # Common opening of the message, naming the offending directive.
      conflictPrefix =
        directive:
        lib.concatStringsSep " " [
          "The 'serviceConfig' option '${directive}' for"
          "service '${name}' is enabled in conjunction with"
          "'confinement.enable'"
        ];
    in
    {
      # An unset directive counts as disabled.
      assertion = !(svc.serviceConfig.RootDirectoryStartOnly or false);
      message = lib.concatStringsSep " " [
        "${conflictPrefix "RootDirectoryStartOnly"}, but right now systemd"
        "doesn't support restricting bind-mounts to 'ExecStart'."
        "Please either define a separate service or find a way to run"
        "commands other than ExecStart within the chroot."
      ];
    }
  ))
];
207
# For every confined service, add a package to `systemd.packages` that ships
# a `<name>.service.d/confinement.conf` drop-in. The drop-in bind-mounts the
# closure of `confinement.packages` (and optionally /bin/sh) read-only into
# the chroot and, when no private /tmp is requested, hides /tmp and /var/tmp.
config.systemd.packages = lib.concatLists (
  lib.mapAttrsToList (
    name: cfg:
    let
      # Text file listing all closure roots. Its string context is what lets
      # closureInfo compute the full set of store paths to bind-mount.
      rootPaths =
        let
          contents = lib.concatStringsSep "\n" cfg.confinement.packages;
        in
        pkgs.writeText "${mkPathSafeName name}-string-contexts.txt" contents;

      # `serviceConfig` values need not be Nix booleans: systemd booleans may
      # be spelled as strings ("yes", "off", ...) and PrivateTmp= additionally
      # accepts "disconnected". Negating such a string directly would abort
      # evaluation with a type error, so strings are classified explicitly.
      # Only the spellings systemd parses as "false" count as disabled;
      # booleans keep their previous meaning.
      privateTmp = cfg.serviceConfig.PrivateTmp;
      privateTmpDisabled =
        if builtins.isString privateTmp then
          lib.elem (lib.toLower privateTmp) [
            "0"
            "no"
            "n"
            "false"
            "f"
            "off"
          ]
        else
          !privateTmp;

      chrootPaths =
        pkgs.runCommand "${mkPathSafeName name}-chroot-paths"
          {
            closureInfo = pkgs.closureInfo { inherit rootPaths; };
            serviceName = "${name}.service";
            # The roots file itself is part of the closure but is of no use
            # inside the chroot, so the loop below skips it.
            excludedPath = rootPaths;
          }
          ''
            mkdir -p "$out/lib/systemd/system/$serviceName.d"
            serviceFile="$out/lib/systemd/system/$serviceName.d/confinement.conf"

            echo '[Service]' > "$serviceFile"

            # /bin/sh is special here, because the option value could contain a
            # symlink and we need to properly resolve it.
            ${lib.optionalString (cfg.confinement.binSh != null) ''
              binsh=${lib.escapeShellArg cfg.confinement.binSh}
              realprog="$(readlink -e "$binsh")"
              echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile"
            ''}

            # DynamicUser=true implies PrivateTmp=true (and that cannot be turned
            # off), so make /tmp and /var/tmp inaccessible unless PrivateTmp= is
            # explicitly enabled.
            ${lib.optionalString privateTmpDisabled ''
              echo "InaccessiblePaths=-+/tmp" >> "$serviceFile"
              echo "InaccessiblePaths=-+/var/tmp" >> "$serviceFile"
            ''}

            while IFS= read -r storePath; do
              if [ -L "$storePath" ]; then
                # Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths,
                # so let's just bind-mount the target to that location.
                echo "BindReadOnlyPaths=$(readlink -e "$storePath"):$storePath"
              elif [ "$storePath" != "$excludedPath" ]; then
                echo "BindReadOnlyPaths=$storePath"
              fi
            done < "$closureInfo/store-paths" >> "$serviceFile"
          '';
    in
    # Laziness matters: nothing above is evaluated for services without
    # confinement, which may not define PrivateTmp at all.
    lib.optional cfg.confinement.enable chrootPaths
  ) config.systemd.services
);
260}