{ lib, config, helpers, pkgs, ... }:

with lib;
let
  cfg = config.modules.apps;
  ollama = cfg.ollama.package;
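  # Argument vector for `ollama serve`, shared by the systemd service and
  # the launchd agent below.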
  ollamaArgs = [
    "${ollama}/bin/ollama"
    "serve"
  ];

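  # Render an attribute set as the "KEY=value" strings expected by systemd's
  # Environment= setting, shell-quoting each value.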
  toEnvironmentCfg = vars: mapAttrsToList (k: v: "${k}=${escapeShellArg v}") vars;

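  # Environment for the server process; both service managers expect string
  # values, so booleans are rendered as "1"/"0" and integers via toString.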
  env = {
    OLLAMA_HOST = cfg.ollama.host;
    OLLAMA_FLASH_ATTENTION = if cfg.ollama.flashAttention then "1" else "0";
    OLLAMA_SCHED_SPREAD = if cfg.ollama.schedSpread then "1" else "0";
    OLLAMA_INTEL_GPU = if cfg.ollama.intelGpu then "1" else "0";
    OLLAMA_NEW_ENGINE = if cfg.ollama.newEngine then "1" else "0";
    OLLAMA_KV_CACHE_TYPE = cfg.ollama.kvCacheType;
    OLLAMA_CONTEXT_LENGTH = toString cfg.ollama.defaultContextLength;
    OLLAMA_MAX_LOADED_MODELS = toString cfg.ollama.maxLoadedModels;
  };
in {
  options.modules.apps.ollama = {
    enable = mkOption {
      default = false;
      description = "Whether to enable Ollama.";
      type = types.bool;
    };

    enableServer = mkOption {
      default = true;
      description = "Whether to enable Ollama's server.";
      type = types.bool;
    };

    package = mkOption {
      default = pkgs.ollama;
      description = "The Ollama package to use.";
      type = types.package;
    };

    host = mkOption {
      default = "http://0.0.0.0:11434";
      description = "Determines the host and port the server listens on.";
      type = types.str;
    };

    maxLoadedModels = mkOption {
      default = 3;
      description = "Maximum number of models that can be loaded concurrently.";
      type = types.int;
    };

    defaultContextLength = mkOption {
      default = 32768;
      description = "Default context length, in tokens, for loaded models.";
      type = types.int;
    };

    flashAttention = mkOption {
      default = true;
      description = ''
        Enables the experimental flash attention feature.
        Effect: Activates an experimental optimization for attention mechanisms.
        Scenario: Can potentially improve performance on compatible hardware but may introduce instability.
      '';
      type = types.bool;
    };

    kvCacheType = mkOption {
      default = "q8_0";
      type = types.enum [ "f16" "q8_0" "q4_0" ];
      description = ''
        Determines the K/V cache quantization type.
        Effect: Activates quantization of the K/V cache, reducing memory usage when used with flash attention.
        Scenario: Can lead to reduced VRAM usage at the cost of accuracy.
        Models with a higher Grouped Query Attention (GQA) count (e.g. Qwen 2) will see a larger negative impact.
      '';
    };

    schedSpread = mkOption {
      default = false;
      description = ''
        Allows scheduling models across all GPUs.
        Effect: Enables multi-GPU usage for model inference.
        Scenario: Beneficial in high-performance computing environments with multiple GPUs to maximize hardware utilization.
      '';
      type = types.bool;
    };

    newEngine = mkOption {
      default = true;
      description = "Whether to enable Ollama's new model engine.";
      type = types.bool;
    };

    intelGpu = mkOption {
      default = false;
      description = ''
        Enables experimental Intel GPU detection.
        Effect: Allows usage of Intel GPUs for model inference.
        Scenario: Useful for organizations leveraging Intel GPU hardware for AI workloads.
      '';
      type = types.bool;
    };
  };

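  # A minimal usage sketch (hypothetical host configuration; the parent
  # toggle `modules.apps.enable`, defined elsewhere in this module tree,
  # must be set as well):
  #
  #   modules.apps = {
  #     enable = true;
  #     ollama = {
  #       enable = true;
  #       kvCacheType = "q8_0";
  #       maxLoadedModels = 2;
  #     };
  #   };
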
  config = mkIf (cfg.enable && cfg.ollama.enable) (mkMerge [
    {
      home.packages = [ ollama ];
    }

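    # Linux: run the server as a systemd user service; systemd restarts it
    # automatically on failure.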
    (helpers.mkIfLinux {
      systemd.user.services.ollama = mkIf cfg.ollama.enableServer {
        Unit = {
          Description = "Ollama";
          Documentation = "https://github.com/jmorganca/ollama";
        };
        Install.WantedBy = [ "default.target" ];
        Service = {
          Environment = toEnvironmentCfg env;
          ExecStart = escapeShellArgs ollamaArgs;
          Restart = "on-failure";
          RestartSec = 5;
        };
      };
    })

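    # Darwin: run the server as a launchd agent; KeepAlive relaunches it
    # after a crash but not after a clean exit.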
    (helpers.mkIfDarwin {
      launchd.agents.ollama = mkIf cfg.ollama.enableServer {
        enable = true;
        config = {
          EnvironmentVariables = env;
          ProcessType = "Background";
          ProgramArguments = ollamaArgs;
          KeepAlive = {
            Crashed = true;
            SuccessfulExit = false;
          };
        };
      };
    })
  ]);
}