{ lib, config, helpers, pkgs, ... }:

with lib;

let
  cfg = config.modules.apps;
  ollama = cfg.ollama.package;
  ollamaArgs = [ "${ollama}/bin/ollama" "serve" ];

  # Render an attrset of environment variables as "KEY=value" strings,
  # shell-escaped for systemd's Environment= directive.
  toEnvironmentCfg = vars: mapAttrsToList (k: v: "${k}=${escapeShellArg v}") vars;

  # Environment shared by the Linux (systemd) and Darwin (launchd) services.
  env = {
    OLLAMA_HOST = cfg.ollama.host;
    OLLAMA_FLASH_ATTENTION = if cfg.ollama.flashAttention then "1" else "0";
    OLLAMA_SCHED_SPREAD = if cfg.ollama.schedSpread then "1" else "0";
    OLLAMA_INTEL_GPU = if cfg.ollama.intelGpu then "1" else "0";
    OLLAMA_NEW_ENGINE = if cfg.ollama.newEngine then "1" else "0";
    OLLAMA_KV_CACHE_TYPE = cfg.ollama.kvCacheType;
    OLLAMA_CONTEXT_LENGTH = toString cfg.ollama.defaultContextLength;
    OLLAMA_MAX_LOADED_MODELS = toString cfg.ollama.maxLoadedModels;
  };
in
{
  options.modules.apps.ollama = {
    enable = mkOption {
      default = false;
      description = "Whether to enable Ollama.";
      type = types.bool;
    };

    enableServer = mkOption {
      default = true;
      description = "Whether to enable Ollama's server.";
      type = types.bool;
    };

    package = mkOption {
      default = pkgs.ollama;
      description = "The Ollama package to use.";
      type = types.package;
    };

    host = mkOption {
      default = "http://0.0.0.0:11434";
      description = "Determines the host and port to listen on.";
      type = types.str;
    };

    maxLoadedModels = mkOption {
      default = 3;
      description = "Maximum number of models that may be loaded concurrently (OLLAMA_MAX_LOADED_MODELS).";
      type = types.int;
    };

    defaultContextLength = mkOption {
      default = 32768;
      description = "Default context length, in tokens, for loaded models (OLLAMA_CONTEXT_LENGTH).";
      type = types.int;
    };

    flashAttention = mkOption {
      default = true;
      description = ''
        Enables the experimental flash attention feature.

        Effect: Activates an experimental optimization for attention mechanisms.
        Scenario: Can potentially improve performance on compatible hardware but
        may introduce instability.
      '';
      type = types.bool;
    };

    kvCacheType = mkOption {
      default = "q8_0";
      type = types.enum [ "f16" "q8_0" "q4_0" ];
      description = ''
        Determines the K/V cache quantization type.

        Effect: Activates quantization of the K/V cache, reducing memory usage
        when flash attention is enabled.
        Scenario: Can lead to reduced VRAM usage at the cost of accuracy. Models
        with a higher Grouped Query Attention (GQA) count (e.g. Qwen 2) will see
        a larger negative impact.
      '';
    };

    schedSpread = mkOption {
      default = false;
      description = ''
        Allows scheduling models across all GPUs.

        Effect: Enables multi-GPU usage for model inference.
        Scenario: Beneficial in high-performance computing environments with
        multiple GPUs to maximize hardware utilization.
      '';
      type = types.bool;
    };

    newEngine = mkOption {
      default = true;
      description = "Whether to enable Ollama's new engine (sets OLLAMA_NEW_ENGINE).";
      type = types.bool;
    };

    intelGpu = mkOption {
      default = false;
      description = ''
        Enables experimental Intel GPU detection.

        Effect: Allows usage of Intel GPUs for model inference.
        Scenario: Useful for organizations leveraging Intel GPU hardware for AI
        workloads.
      '';
      type = types.bool;
    };
  };

  config = mkIf (cfg.enable && cfg.ollama.enable) (mkMerge [
    { home.packages = [ ollama ]; }

    # On Linux, run the server as a systemd user service.
    (helpers.mkIfLinux {
      systemd.user.services.ollama = mkIf cfg.ollama.enableServer {
        Unit = {
          Description = "Ollama";
          Documentation = "https://github.com/jmorganca/ollama";
        };
        Install.WantedBy = [ "default.target" ];
        Service = {
          Environment = toEnvironmentCfg env;
          ExecStart = escapeShellArgs ollamaArgs;
          Restart = "on-failure";
          RestartSec = 5;
        };
      };
    })

    # On Darwin, run the server as a launchd agent kept alive across crashes.
    (helpers.mkIfDarwin {
      launchd.agents.ollama = mkIf cfg.ollama.enableServer {
        enable = true;
        config = {
          EnvironmentVariables = env;
          ProcessType = "Background";
          ProgramArguments = ollamaArgs;
          KeepAlive = {
            Crashed = true;
            SuccessfulExit = false;
          };
        };
      };
    })
  ]);
}
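
# Example usage: a minimal sketch, assuming this file is imported into a
# home-manager configuration and that the parent `modules.apps.enable` toggle
# (checked by the mkIf guard above) is defined by a sibling module. Only the
# `modules.apps.ollama.*` option names come from this file; the values shown
# here are illustrative.
#
#   {
#     modules.apps = {
#       enable = true;                      # hypothetical parent toggle
#       ollama = {
#         enable = true;
#         host = "http://127.0.0.1:11434";  # listen on loopback only
#         maxLoadedModels = 2;
#         kvCacheType = "f16";              # trade VRAM for accuracy
#       };
#     };
#   }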