at master 3.6 kB view raw
# Package expression for docling, built with buildPythonPackage from the
# upstream GitHub sources using the poetry-core build backend.
#
# The function takes the nixpkgs library, the Python builder/fetcher, and one
# argument per Python package referenced below; it returns the derivation.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build system
  poetry-core,

  # dependencies
  accelerate,
  beautifulsoup4,
  certifi,
  docling-core,
  docling-ibm-models,
  docling-parse,
  easyocr,
  filetype,
  huggingface-hub,
  lxml,
  marko,
  # ocrmac # not yet packaged
  onnxruntime,
  openpyxl,
  pandas,
  pillow,
  pluggy,
  pydantic,
  pydantic-settings,
  pylatexenc,
  pypdfium2,
  python-docx,
  python-pptx,
  rapidocr,
  requests,
  rtree,
  scipy,
  tesserocr,
  tqdm,
  transformers,
  typer,

  # optional dependencies
  # mkdocs-click # not yet packaged
  mkdocs-jupyter,
  mkdocs-material,
  mkdocstrings,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "docling";
  version = "2.47.1";
  pyproject = true;

  # The release tag is derived from `version`, so bumping the version also
  # requires refreshing `hash`.
  src = fetchFromGitHub {
    owner = "docling-project";
    repo = "docling";
    tag = "v${version}";
    hash = "sha256-U82hGvWXkKwZ4um0VevVoYiIfzswu5hLDYvxtqJqmHU=";
  };

  build-system = [
    poetry-core
  ];

  dependencies = [
    accelerate
    beautifulsoup4
    certifi
    docling-core
    docling-ibm-models
    docling-parse
    easyocr
    filetype
    huggingface-hub
    lxml
    marko
    # ocrmac # not yet packaged
    onnxruntime
    openpyxl
    pandas
    pillow
    pluggy
    pydantic
    pydantic-settings
    pylatexenc
    pypdfium2
    python-docx
    python-pptx
    rapidocr
    requests
    rtree
    scipy
    tesserocr
    tqdm
    transformers
    typer
  ];

  # Version constraints declared upstream for these packages are relaxed so
  # the versions available in the package set are accepted.
  pythonRelaxDeps = [
    "lxml"
    "pypdfium2"
    "pillow"
  ];

  # Extras groups. Note that onnxruntime, rapidocr and tesserocr are also
  # listed unconditionally in `dependencies` above.
  optional-dependencies = {
    ocrmac = [
      # ocrmac # not yet packaged
    ];
    rapidocr = [
      onnxruntime
      rapidocr
    ];
    tesserocr = [
      tesserocr
    ];

    docs = [
      # mkdocs-click # not yet packaged
      mkdocs-jupyter
      mkdocs-material
      mkdocstrings
      # griffe-pydantic
    ];
  };

  # writableTmpDirAsHomeHook gives the test run a writable $HOME.
  nativeCheckInputs = [
    pytestCheckHook
    writableTmpDirAsHomeHook
  ];

  pythonImportsCheck = [
    "docling"
  ];

  # Tests skipped during the check phase. The comment next to or above each
  # group records the failure that was observed for it.
  disabledTests = [
    "test_e2e_pdfs_conversions" # AssertionError: ## TableFormer: Table Structure Understanding with Transf
    "test_e2e_conversions" # RuntimeError: Tesseract is not available

    # AssertionError
    # assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
    "test_ordered_lists"

    # AssertionError: export to md
    "test_e2e_html_conversions"

    # AssertionError: assert 'Unordered li...d code block:' == 'Unordered li...d code block:'
    "test_convert_valid"

    # AssertionError: Markdown file mismatch against groundtruth pftaps057006474.md
    "test_patent_groundtruth"

    # huggingface_hub.errors.LocalEntryNotFoundError: An error happened
    "test_cli_convert"
    "test_code_and_formula_conversion"
    "test_picture_classifier"
    "test_convert_path"
    "test_convert_stream"
    "test_compare_legacy_output"
    "test_ocr_coverage_threshold"
    "test_formula_conversion_with_page_range"

    # requires network access
    "test_page_range"
    "test_parser_backends"
    "test_confidence"
    "test_e2e_webp_conversions"
    "test_asr_pipeline_conversion"
    "test_threaded_pipeline"
    "test_pipeline_comparison"

    # AssertionError: pred_itxt==true_itxt
    "test_e2e_valid_csv_conversions"
  ];

  meta = {
    description = "Get your documents ready for gen AI";
    # Upstream URLs use the same GitHub owner as `src` (docling-project).
    homepage = "https://github.com/docling-project/docling";
    changelog = "https://github.com/docling-project/docling/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ happysalada ];
    mainProgram = "docling";
  };
}