Personal Nix setup
{ lib, config, helpers, pkgs, ... }:

with lib;
let
  cfg = config.modules.apps;
  ollama = cfg.ollama.package;
  ollamaArgs = [
    "${ollama}/bin/ollama"
    "serve"
  ];

  # Render an attribute set as "KEY=value" strings for systemd's Environment= setting.
  toEnvironmentCfg = vars: mapAttrsToList (k: v: "${k}=${escapeShellArg v}") vars;

  # Environment variables consumed by `ollama serve`.
  env = {
    OLLAMA_HOST = cfg.ollama.host;
    OLLAMA_FLASH_ATTENTION = if cfg.ollama.flashAttention then "1" else "0";
    OLLAMA_SCHED_SPREAD = if cfg.ollama.schedSpread then "1" else "0";
    OLLAMA_INTEL_GPU = if cfg.ollama.intelGpu then "1" else "0";
    OLLAMA_NEW_ENGINE = if cfg.ollama.newEngine then "1" else "0";
    OLLAMA_KV_CACHE_TYPE = cfg.ollama.kvCacheType;
    OLLAMA_CONTEXT_LENGTH = toString cfg.ollama.defaultContextLength;
    OLLAMA_MAX_LOADED_MODELS = toString cfg.ollama.maxLoadedModels;
  };
in {
  options.modules.apps.ollama = {
    enable = mkOption {
      default = false;
      description = "Whether to enable Ollama.";
      type = types.bool;
    };

    enableServer = mkOption {
      default = true;
      description = "Whether to enable Ollama's server.";
      type = types.bool;
    };

    package = mkOption {
      default = pkgs.ollama;
      description = "The Ollama package to use.";
      type = types.package;
    };

    host = mkOption {
      default = "http://0.0.0.0:11434";
      description = "The host and port the Ollama server listens on.";
      type = types.str;
    };

    maxLoadedModels = mkOption {
      default = 3;
      description = "Maximum number of models kept loaded concurrently.";
      type = types.int;
    };

    defaultContextLength = mkOption {
      default = 32768;
      description = "Default context length (in tokens) for loaded models.";
      type = types.int;
    };

    flashAttention = mkOption {
      default = true;
      description = ''
        Enables the experimental flash attention feature.
        Effect: Activates an experimental optimization of the attention mechanism.
        Scenario: Can improve performance on compatible hardware but may introduce instability.
      '';
      type = types.bool;
    };

    kvCacheType = mkOption {
      default = "q8_0";
      type = types.enum [ "f16" "q8_0" "q4_0" ];
      description = ''
        Determines the K/V cache quantization type.
        Effect: Quantizes the K/V cache, reducing memory usage when flash attention is enabled.
        Scenario: Can lead to reduced VRAM usage at the cost of accuracy.
        Models with a higher Grouped Query Attention (GQA) count (e.g. Qwen 2) will see a larger negative impact.
      '';
    };

    schedSpread = mkOption {
      default = false;
      description = ''
        Allows scheduling models across all GPUs.
        Effect: Enables multi-GPU usage for model inference.
        Scenario: Beneficial on systems with multiple GPUs to maximize hardware utilization.
      '';
      type = types.bool;
    };

    newEngine = mkOption {
      default = true;
      description = "Whether to use Ollama's new model engine.";
      type = types.bool;
    };

    intelGpu = mkOption {
      default = false;
      description = ''
        Enables experimental Intel GPU detection.
        Effect: Allows usage of Intel GPUs for model inference.
        Scenario: Useful when running AI workloads on Intel GPU hardware.
      '';
      type = types.bool;
    };
  };

  config = mkIf (cfg.enable && cfg.ollama.enable) (mkMerge [
    {
      home.packages = [ ollama ];
    }

    # On Linux, run the server as a systemd user service.
    (helpers.mkIfLinux {
      systemd.user.services.ollama = mkIf cfg.ollama.enableServer {
        Unit = {
          Description = "Ollama";
          Documentation = "https://github.com/jmorganca/ollama";
        };
        Install.WantedBy = [ "default.target" ];
        Service = {
          Environment = toEnvironmentCfg env;
          ExecStart = escapeShellArgs ollamaArgs;
          Restart = "on-failure";
          RestartSec = 5;
        };
      };
    })

    # On Darwin, run the server as a launchd agent instead.
    (helpers.mkIfDarwin {
      launchd.agents.ollama = mkIf cfg.ollama.enableServer {
        enable = true;
        config = {
          EnvironmentVariables = env;
          ProcessType = "Background";
          ProgramArguments = ollamaArgs;
          KeepAlive = {
            Crashed = true;
            SuccessfulExit = false;
          };
        };
      };
    })
  ]);
}
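For reference, a minimal sketch of how this module might be enabled from a consuming home-manager configuration. The option names come from the module above (note that both modules.apps.enable and modules.apps.ollama.enable must be set, per the mkIf condition); the specific values are illustrative, and it assumes the repo imports this file into its home-manager module set:

{ ... }:
{
  modules.apps = {
    enable = true;

    ollama = {
      enable = true;
      # Illustrative overrides; the defaults above are already sensible.
      defaultContextLength = 65536;
      maxLoadedModels = 1;
      kvCacheType = "f16";
    };
  };
}

With enableServer left at its default of true, the server starts as a systemd user service on Linux and as a launchd agent on macOS.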