{
  lib,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,

  # nativeBuildInputs
  cargo,
  pkg-config,
  rustPlatform,
  rustc,
  setuptools-rust,

  # buildInputs
  openssl,

  # dependencies
  huggingface-hub,

  # tests
  datasets,
  numpy,
  pytest-asyncio,
  pytestCheckHook,
  requests,
  tiktoken,
  writableTmpDirAsHomeHook,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "tokenizer-llama3.json" = fetchurl {
      url = "https://huggingface.co/Narsil/llama-tokenizer/resolve/main/tokenizer.json";
      hash = "sha256-eePlImNfMXEwCRO7QhRkqH3mIiGCoFcLmyzLoqlksrQ=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.22.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-1ijP16Fw/dRgNXXX9qEymXNaamZmlNFqbfZee82Qz6c=";
  };

  cargoDeps = rustPlatform.fetchCargoVendor {
    inherit
      pname
      version
      src
      sourceRoot
      ;
    hash = "sha256-CKbnFtwsEtJ11Wnn8JFpHd7lnUzQMTwJ1DmmB44qciM=";
  };

  sourceRoot = "${src.name}/bindings/python";

  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    numpy
    pytest-asyncio
    pytestCheckHook
    requests
    tiktoken
    writableTmpDirAsHomeHook
  ];

  postUnpack =
    # Add data files for tests, otherwise tests attempt network access
    ''
      mkdir $sourceRoot/tests/data
      ln -s ${test-data}/* $sourceRoot/tests/data/
    '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # Require downloading from huggingface
    # huggingface_hub.errors.LocalEntryNotFoundError
    "test_async_methods_existence"
    "test_basic_encoding"
    "test_concurrency"
    "test_decode"
    "test_decode_skip_special_tokens"
    "test_decode_stream_fallback"
    "test_encode"
    "test_error_handling"
    "test_large_batch"
    "test_numpy_inputs"
    "test_performance_comparison"
    "test_various_input_formats"
    "test_with_special_tokens"
    "test_with_truncation_padding"

    # These tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}