# Package expression for the Hugging Face `datasets` Python library.
#
# Takes its build helpers and Python dependencies as function arguments and
# returns the result of `buildPythonPackage` for the pinned upstream release.
{
  lib,
  aiohttp,
  buildPythonPackage,
  dill,
  fetchFromGitHub,
  fsspec,
  huggingface-hub,
  multiprocess,
  numpy,
  packaging,
  pandas,
  pyarrow,
  requests,
  responses,
  setuptools,
  tqdm,
  xxhash,
}:

buildPythonPackage rec {
  pname = "datasets";
  version = "4.0.0";
  pyproject = true;

  # Sources come from the upstream GitHub repository, at the tag named
  # exactly like `version`.
  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "datasets";
    tag = version;
    hash = "sha256-Cr25PgLNGX/KcFZE5h1oiaDW9J50ccMqA5z3q4sITus=";
  };

  build-system = [
    setuptools
  ];

  dependencies = [
    aiohttp
    dill
    fsspec
    huggingface-hub
    multiprocess
    numpy
    packaging
    pandas
    pyarrow
    requests
    responses
    tqdm
    xxhash
  ];

  # Upstream pins these packages; the reasons quoted from its setup.py are
  # kept next to each entry so the relaxations can be revisited later.
  pythonRelaxDeps = [
    # https://github.com/huggingface/datasets/blob/a256b85cbc67aa3f0e75d32d6586afc507cf535b/setup.py#L117
    # "pin until dill has official support for determinism"
    "dill"
    "multiprocess"
    # https://github.com/huggingface/datasets/blob/a256b85cbc67aa3f0e75d32d6586afc507cf535b/setup.py#L129
    # "to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`"
    "fsspec"
  ];

  # Tests require pervasive internet access
  doCheck = false;

  # Module import will attempt to create a cache directory, so the cache
  # location is redirected to $TMPDIR.
  # NOTE(review): presumably this is what lets the `pythonImportsCheck` import
  # below succeed inside the build sandbox -- confirm the hook ordering.
  postFixup = "export HF_MODULES_CACHE=$TMPDIR";

  pythonImportsCheck = [ "datasets" ];

  meta = {
    description = "Open-access datasets and evaluation metrics for natural language processing";
    mainProgram = "datasets-cli";
    homepage = "https://github.com/huggingface/datasets";
    changelog = "https://github.com/huggingface/datasets/releases/tag/${src.tag}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ osbm ];
  };
}