# Package expression for the `mlcroissant` Python library, built from the
# `python/mlcroissant` subdirectory of the mlcommons/croissant repository.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # dependencies
  absl-py,
  etils,
  jsonpath-rw,
  networkx,
  pandas,
  pandas-stubs,
  python-dateutil,
  rdflib,
  requests,
  scipy,
  tqdm,

  # tests
  apache-beam,
  gitpython,
  librosa,
  pillow,
  pytestCheckHook,
  pyyaml,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "mlcroissant";
  version = "1.0.22";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "mlcommons";
    repo = "croissant";
    tag = "v${version}";
    hash = "sha256-uJOxKNrK3eN2wyPFEQr2J4+vZeSK1KPyFDag2jcyWZw=";
  };

  # The Python project is not at the repository root.
  sourceRoot = "${src.name}/python/mlcroissant";

  build-system = [
    setuptools
  ];

  dependencies = [
    absl-py
    etils
    jsonpath-rw
    networkx
    pandas
    pandas-stubs
    python-dateutil
    rdflib
    requests
    scipy
    tqdm
  ]
  # Also pull in the dependencies of etils' `epath` extra.
  ++ etils.optional-dependencies.epath;

  pythonImportsCheck = [ "mlcroissant" ];

  nativeCheckInputs = [
    apache-beam
    gitpython
    librosa
    pillow
    pytestCheckHook
    pyyaml
    writableTmpDirAsHomeHook
  ];

  # NOTE(review): pytestCheckHook presumably deselects these via substring
  # matching (`pytest -k`), so some entries may overlap (e.g.
  # "test_hermetic_loading" vs. "test_hermetic_loading_1_1") -- confirm before
  # pruning.
  disabledTests = [
    # Requires internet access
    "test_hermetic_loading_1_1"
    "test_load_from_huggingface"
    "test_nonhermetic_loading"
    "test_nonhermetic_loading_1_0"

    # AssertionError: assert {'records/aud...t32), 22050)'} == {'records/aud...t32), 22050)'}
    "test_hermetic_loading"

    # AttributeError: 'MaybeReshuffle' object has no attribute 'side_inputs'
    "test_beam_hermetic_loading"
  ];

  meta = {
    description = "High-level format for machine learning datasets that brings together four rich layers";
    homepage = "https://github.com/mlcommons/croissant";
    changelog = "https://github.com/mlcommons/croissant/releases/tag/${src.tag}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.all;
    mainProgram = "mlcroissant";
  };
}