{
  lib,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,

  # nativeBuildInputs
  cargo,
  pkg-config,
  rustPlatform,
  rustc,
  setuptools-rust,

  # buildInputs
  openssl,

  # dependencies
  huggingface-hub,

  # tests
  datasets,
  numpy,
  pytest-asyncio,
  pytestCheckHook,
  requests,
  tiktoken,
  writableTmpDirAsHomeHook,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
  # for details about URLs and file names
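  # linkFarm assembles the fixed-output downloads below into a single
  # directory of symlinks, which postUnpack links into tests/data.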
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "tokenizer-llama3.json" = fetchurl {
      url = "https://huggingface.co/Narsil/llama-tokenizer/resolve/main/tokenizer.json";
      hash = "sha256-eePlImNfMXEwCRO7QhRkqH3mIiGCoFcLmyzLoqlksrQ=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.22.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-1ijP16Fw/dRgNXXX9qEymXNaamZmlNFqbfZee82Qz6c=";
  };

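  # Vendor the Rust crate dependencies pinned by the Cargo.lock in sourceRoot;
  # this hash must be updated together with version.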
  cargoDeps = rustPlatform.fetchCargoVendor {
    inherit
      pname
      version
      src
      sourceRoot
      ;
    hash = "sha256-CKbnFtwsEtJ11Wnn8JFpHd7lnUzQMTwJ1DmmB44qciM=";
  };

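  # The Python bindings live in a subdirectory of the upstream repository.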
  sourceRoot = "${src.name}/bindings/python";

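  # cargoSetupHook points cargo at the vendored dependencies above;
  # maturinBuildHook builds the wheel with maturin.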
  nativeBuildInputs = [
    cargo
    pkg-config
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    rustc
    setuptools-rust
  ];

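  # openssl is located via pkg-config when the vendored crates link against it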
  buildInputs = [
    openssl
  ];

  dependencies = [
    huggingface-hub
  ];

  nativeCheckInputs = [
    datasets
    numpy
    pytest-asyncio
    pytestCheckHook
    requests
    tiktoken
    # provides a writable $HOME so the Hugging Face cache can be created
    writableTmpDirAsHomeHook
  ];

  postUnpack =
    # Add local data files for the tests; otherwise they attempt network access
    ''
      mkdir $sourceRoot/tests/data
      ln -s ${test-data}/* $sourceRoot/tests/data/
    '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Downloads data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # Require downloading from the Hugging Face Hub
    # huggingface_hub.errors.LocalEntryNotFoundError
    "test_async_methods_existence"
    "test_basic_encoding"
    "test_concurrency"
    "test_decode"
    "test_decode_skip_special_tokens"
    "test_decode_stream_fallback"
    "test_encode"
    "test_error_handling"
    "test_large_batch"
    "test_numpy_inputs"
    "test_performance_comparison"
    "test_various_input_formats"
    "test_with_special_tokens"
    "test_with_truncation_padding"

    # These tests require additional data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}