# Package definition for the `curated-tokenizers` Python library.
#
# The arguments are supplied by the caller: `lib` for metadata helpers,
# the builder and fetcher functions, the build-time and runtime Python
# dependencies, and the pytest hook used in the check phase.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  cython,
  setuptools,
  regex,
  pytestCheckHook,
}:

# `rec` lets `src.tag` refer to `version` below.
buildPythonPackage rec {
  pname = "curated-tokenizers";
  version = "2.0.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "explosion";
    repo = "curated-tokenizers";
    tag = "v${version}";
    hash = "sha256-VkDV/9c5b8TzYlthCZ38ufbrne4rihtkmkZ/gyAQXLE=";
    # NOTE(review): presumably the vendored sentencepiece sources mentioned
    # below come in through a git submodule -- confirm against upstream.
    fetchSubmodules = true;
  };

  build-system = [
    cython
    setuptools
  ];

  dependencies = [
    regex
  ];

  nativeCheckInputs = [
    pytestCheckHook
  ];

  # Explicitly set the path to avoid running vendored
  # sentencepiece tests.
  enabledTestPaths = [ "tests" ];

  # Move the test suite to a top-level `tests` directory (the path selected
  # by `enabledTestPaths` above) and delete the `curated_tokenizers` source
  # directory from the build tree before the tests run.
  preCheck = ''
    # avoid local paths, relative imports wont resolve correctly
    mv curated_tokenizers/tests tests
    rm -r curated_tokenizers
  '';

  pythonImportsCheck = [ "curated_tokenizers" ];

  # `lib` is referenced explicitly instead of wrapping the whole set in
  # `with lib;`, so every name used here has a visible origin.
  meta = {
    description = "Lightweight piece tokenization library";
    homepage = "https://github.com/explosion/curated-tokenizers";
    changelog = "https://github.com/explosion/curated-tokenizers/releases/tag/${src.tag}";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ danieldk ];
  };
}