1{
2 lib,
3 buildPythonPackage,
4 fetchFromGitHub,
5
6 # build-system
7 setuptools-scm,
8
9 # dependencies
10 accelerate,
11 datasets,
12 dill,
13 evaluate,
14 jsonlines,
15 more-itertools,
16 numexpr,
17 peft,
18 pybind11,
19 pytablewriter,
20 rouge-score,
21 sacrebleu,
22 scikit-learn,
23 sqlitedict,
24 torch,
25 tqdm-multiprocess,
26 transformers,
27 word2number,
28 zstandard,
29
30 # optional-dependencies
31 # api
32 aiohttp,
33 requests,
34 tenacity,
35 tiktoken,
36 tqdm,
37 # hf_transfer
38 hf-transfer,
39 # ifeval
40 immutabledict,
41 langdetect,
42 nltk,
43 # neuronx
44 optimum,
45 # mamba
46 causal-conv1d,
47 mamba-ssm,
48 # math
49 antlr4-python3-runtime,
50 sympy,
51 # sentencepiece
52 sentencepiece,
53 # vllm
54 vllm,
55 # wandb
56 numpy,
57 pandas,
58 wandb,
59
60 # tests
61 pytestCheckHook,
62 writableTmpDirAsHomeHook,
63}:
64
# Build the EleutherAI LM Evaluation Harness from the tagged GitHub release.
buildPythonPackage rec {
  pname = "lm-eval";
  version = "0.4.9.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "EleutherAI";
    repo = "lm-evaluation-harness";
    tag = "v${version}";
    hash = "sha256-N5NRRabjWxPchwOIkjqYTCKInCmVSY6T5cAmdxNbCkU=";
  };

  build-system = [
    setuptools-scm
  ];

  dependencies = [
    accelerate
    datasets
    dill
    evaluate
    jsonlines
    more-itertools
    numexpr
    peft
    pybind11
    pytablewriter
    rouge-score
    sacrebleu
    scikit-learn
    sqlitedict
    torch
    tqdm-multiprocess
    transformers
    word2number
    zstandard
  ];

  # Mirrors upstream's [project.optional-dependencies] extras table; only
  # extras whose dependencies are packaged in nixpkgs are exposed here.
  optional-dependencies = {
    api = [
      aiohttp
      requests
      tenacity
      tiktoken
      tqdm
    ];
    hf_transfer = [ hf-transfer ];
    ifeval = [
      immutabledict
      langdetect
      nltk
    ];
    neuronx = [ optimum ] ++ optimum.optional-dependencies.neuronx;
    mamba = [
      causal-conv1d
      mamba-ssm
    ];
    math = [
      antlr4-python3-runtime
      sympy
    ];
    optimum = [ optimum ] ++ optimum.optional-dependencies.openvino;
    sentencepiece = [ sentencepiece ];
    vllm = [ vllm ];
    wandb = [
      numpy
      pandas
      wandb
    ];
    # Still missing dependencies for the following:
    # deepsparse, gptq, ibm_watsonx_ai, multilingual, promptsource, sparseml,
    # zeno, gptqmodel, japanese_leaderboard; all = [...];
  };

  # Loosen upstream's version pin on datasets so the nixpkgs version is accepted.
  pythonRelaxDeps = [ "datasets" ];

  pythonImportsCheck = [ "lm_eval" ];

  nativeCheckInputs = [
    pytestCheckHook
    writableTmpDirAsHomeHook
  ]
  # The test suite imports the API-client extras (aiohttp, tenacity, ...).
  ++ optional-dependencies.api;

  disabledTests = [
    "test_deepsparse" # deepsparse is not available

    # download models from the internet
    "test_get_batched_requests_with_no_ssl"
    "test_model_tokenized_call_usage"
  ];

  disabledTestPaths = [
    # attempts to download models
    "tests/models/test_huggingface.py"
    "tests/test_evaluator.py"
    "tests/test_include_path.py"
    "tests/test_prompt.py"
    "tests/test_task_manager.py"
    "tests/test_tasks.py"
    "tests/test_unitxt_tasks.py"

    # optimum-intel is not available
    "tests/models/test_openvino.py"

    # zeno-client is not packaged
    "tests/scripts/test_zeno_visualize.py"
  ];

  meta = {
    changelog = "https://github.com/EleutherAI/lm-evaluation-harness/releases/tag/${src.tag}";
    description = "Framework for few-shot evaluation of language models";
    homepage = "https://github.com/EleutherAI/lm-evaluation-harness";
    # Single license: use the license attribute directly, not a one-element list.
    license = lib.licenses.mit;
    # NOTE(review): upstream's pyproject declares "lm-eval"/"lm_eval" console
    # scripts; mainProgram enables lib.getExe — confirm against the release tag.
    mainProgram = "lm-eval";
    maintainers = [ lib.maintainers.booxter ];
  };
}