{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools-scm,

  # dependencies
  accelerate,
  datasets,
  dill,
  evaluate,
  jsonlines,
  more-itertools,
  numexpr,
  peft,
  pybind11,
  pytablewriter,
  rouge-score,
  sacrebleu,
  scikit-learn,
  sqlitedict,
  torch,
  tqdm-multiprocess,
  transformers,
  word2number,
  zstandard,

  # optional-dependencies
  # api
  aiohttp,
  requests,
  tenacity,
  tiktoken,
  tqdm,
  # hf_transfer
  hf-transfer,
  # ifeval
  immutabledict,
  langdetect,
  nltk,
  # neuronx
  optimum,
  # mamba
  causal-conv1d,
  mamba-ssm,
  # math
  antlr4-python3-runtime,
  sympy,
  # sentencepiece
  sentencepiece,
  # vllm
  vllm,
  # wandb
  numpy,
  pandas,
  wandb,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "lm-eval";
  version = "0.4.9.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "EleutherAI";
    repo = "lm-evaluation-harness";
    tag = "v${version}";
    hash = "sha256-N5NRRabjWxPchwOIkjqYTCKInCmVSY6T5cAmdxNbCkU=";
  };

  build-system = [
    setuptools-scm
  ];

  dependencies = [
    accelerate
    datasets
    dill
    evaluate
    jsonlines
    more-itertools
    numexpr
    peft
    pybind11
    pytablewriter
    rouge-score
    sacrebleu
    scikit-learn
    sqlitedict
    torch
    tqdm-multiprocess
    transformers
    word2number
    zstandard
  ];

  optional-dependencies = {
    api = [
      aiohttp
      requests
      tenacity
      tiktoken
      tqdm
    ];
    hf_transfer = [ hf-transfer ];
    ifeval = [
      immutabledict
      langdetect
      nltk
    ];
    neuronx = [ optimum ] ++ optimum.optional-dependencies.neuronx;
    mamba = [
      causal-conv1d
      mamba-ssm
    ];
    math = [
      antlr4-python3-runtime
      sympy
    ];
    optimum = [ optimum ] ++ optimum.optional-dependencies.openvino;
    sentencepiece = [ sentencepiece ];
    vllm = [ vllm ];
    wandb = [
      numpy
      pandas
      wandb
    ];
    # Still missing dependencies for the following:
    # deepsparse, gptq, ibm_watsonx_ai, multilingual, promptsource, sparseml,
    # zeno, gptqmodel, japanese_leaderboard; all = [...];
  };

  pythonRelaxDeps = [ "datasets" ];

  pythonImportsCheck = [ "lm_eval" ];

  nativeCheckInputs = [
    pytestCheckHook
    writableTmpDirAsHomeHook
  ]
  ++ optional-dependencies.api;

  disabledTests = [
    "test_deepsparse" # deepsparse is not available

    # download models from the internet
    "test_get_batched_requests_with_no_ssl"
    "test_model_tokenized_call_usage"
  ];

  disabledTestPaths = [
    # attempts to download models
    "tests/models/test_huggingface.py"
    "tests/test_evaluator.py"
    "tests/test_include_path.py"
    "tests/test_prompt.py"
    "tests/test_task_manager.py"
    "tests/test_tasks.py"
    "tests/test_unitxt_tasks.py"

    # optimum-intel is not available
    "tests/models/test_openvino.py"

    # zeno-client is not packaged
    "tests/scripts/test_zeno_visualize.py"
  ];

  meta = {
    changelog = "https://github.com/EleutherAI/lm-evaluation-harness/releases/tag/${src.tag}";
    description = "Framework for few-shot evaluation of language models";
    homepage = "https://github.com/EleutherAI/lm-evaluation-harness";
    license = [ lib.licenses.mit ];
    maintainers = [ lib.maintainers.booxter ];
  };
}