# Python package expression for `mlcroissant`, fetched from the
# mlcommons/croissant GitHub repository at tag `v${version}`.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # dependencies
  absl-py,
  etils,
  jsonpath-rw,
  networkx,
  pandas,
  pandas-stubs,
  python-dateutil,
  rdflib,
  requests,
  scipy,
  tqdm,

  # tests
  apache-beam,
  gitpython,
  librosa,
  pillow,
  pytestCheckHook,
  pyyaml,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "mlcroissant";
  version = "1.0.22";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "mlcommons";
    repo = "croissant";
    tag = "v${version}";
    hash = "sha256-uJOxKNrK3eN2wyPFEQr2J4+vZeSK1KPyFDag2jcyWZw=";
  };

  # Build from the `python/mlcroissant` subdirectory of the fetched source
  # rather than from the repository root.
  sourceRoot = "${src.name}/python/mlcroissant";

  build-system = [
    setuptools
  ];

  dependencies = [
    absl-py
    etils
    jsonpath-rw
    networkx
    pandas
    pandas-stubs
    python-dateutil
    rdflib
    requests
    scipy
    tqdm
  ]
  # Also pull in the packages listed under the `epath` extra of etils.
  ++ etils.optional-dependencies.epath;

  pythonImportsCheck = [ "mlcroissant" ];

  nativeCheckInputs = [
    apache-beam
    gitpython
    librosa
    pillow
    pytestCheckHook
    pyyaml
    writableTmpDirAsHomeHook
  ];

  disabledTests = [
    # Requires internet access
    "test_hermetic_loading_1_1"
    "test_load_from_huggingface"
    "test_nonhermetic_loading"
    "test_nonhermetic_loading_1_0"

    # AssertionError: assert {'records/aud...t32), 22050)'} == {'records/aud...t32), 22050)'}
    "test_hermetic_loading"

    # AttributeError: 'MaybeReshuffle' object has no attribute 'side_inputs'
    "test_beam_hermetic_loading"
  ];

  meta = {
    description = "High-level format for machine learning datasets that brings together four rich layers";
    homepage = "https://github.com/mlcommons/croissant";
    changelog = "https://github.com/mlcommons/croissant/releases/tag/${src.tag}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.all;
    mainProgram = "mlcroissant";
  };
}