# Python package expression for docling: builds the library and its
# `docling` command-line program from the tagged GitHub source.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build system
  poetry-core,

  # dependencies
  accelerate,
  beautifulsoup4,
  certifi,
  docling-core,
  docling-ibm-models,
  docling-parse,
  easyocr,
  filetype,
  huggingface-hub,
  lxml,
  marko,
  # ocrmac # not yet packaged
  onnxruntime,
  openpyxl,
  pandas,
  pillow,
  pluggy,
  pydantic,
  pydantic-settings,
  pylatexenc,
  pypdfium2,
  python-docx,
  python-pptx,
  rapidocr,
  requests,
  rtree,
  scipy,
  tesserocr,
  tqdm,
  transformers,
  typer,

  # optional dependencies
  # mkdocs-click # not yet packaged
  mkdocs-jupyter,
  mkdocs-material,
  mkdocstrings,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "docling";
  version = "2.47.1";
  pyproject = true;

  # The source is fetched by tag; `meta.changelog` below reuses `src.tag`
  # so both always refer to the same release.
  src = fetchFromGitHub {
    owner = "docling-project";
    repo = "docling";
    tag = "v${version}";
    hash = "sha256-U82hGvWXkKwZ4um0VevVoYiIfzswu5hLDYvxtqJqmHU=";
  };

  build-system = [
    poetry-core
  ];

  dependencies = [
    accelerate
    beautifulsoup4
    certifi
    docling-core
    docling-ibm-models
    docling-parse
    easyocr
    filetype
    huggingface-hub
    lxml
    marko
    # ocrmac # not yet packaged
    onnxruntime
    openpyxl
    pandas
    pillow
    pluggy
    pydantic
    pydantic-settings
    pylatexenc
    pypdfium2
    python-docx
    python-pptx
    rapidocr
    requests
    rtree
    scipy
    tesserocr
    tqdm
    transformers
    typer
  ];

  # Drop the upstream version constraints on these packages so the versions
  # provided by the package set are accepted by the runtime-deps check.
  pythonRelaxDeps = [
    "lxml"
    "pypdfium2"
    "pillow"
  ];

  # Extras are listed even when they cannot be filled yet, so each
  # missing package stays visible next to the extra it belongs to.
  optional-dependencies = {
    ocrmac = [
      # ocrmac # not yet packaged
    ];
    rapidocr = [
      onnxruntime
      rapidocr
    ];
    tesserocr = [
      tesserocr
    ];

    docs = [
      # mkdocs-click # not yet packaged
      mkdocs-jupyter
      mkdocs-material
      mkdocstrings
      # griffe-pydantic # NOTE(review): not wired in here; confirm whether it is packaged
    ];
  };

  nativeCheckInputs = [
    pytestCheckHook
    # Gives the test run a writable temporary directory as $HOME.
    writableTmpDirAsHomeHook
  ];

  pythonImportsCheck = [
    "docling"
  ];

  # Tests skipped during the check phase. The comment attached to each entry
  # or group records the failure that was observed.
  disabledTests = [
    "test_e2e_pdfs_conversions" # AssertionError: ## TableFormer: Table Structure Understanding with Transf
    "test_e2e_conversions" # RuntimeError: Tesseract is not available

    # AssertionError
    # assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
    "test_ordered_lists"

    # AssertionError: export to md
    "test_e2e_html_conversions"

    # AssertionError: assert 'Unordered li...d code block:' == 'Unordered li...d code block:'
    "test_convert_valid"

    # AssertionError: Markdown file mismatch against groundtruth pftaps057006474.md
    "test_patent_groundtruth"

    # huggingface_hub.errors.LocalEntryNotFoundError: An error happened
    "test_cli_convert"
    "test_code_and_formula_conversion"
    "test_picture_classifier"
    "test_convert_path"
    "test_convert_stream"
    "test_compare_legacy_output"
    "test_ocr_coverage_threshold"
    "test_formula_conversion_with_page_range"

    # requires network access
    "test_page_range"
    "test_parser_backends"
    "test_confidence"
    "test_e2e_webp_conversions"
    "test_asr_pipeline_conversion"
    "test_threaded_pipeline"
    "test_pipeline_comparison"

    # AssertionError: pred_itxt==true_itxt
    "test_e2e_valid_csv_conversions"
  ];

  meta = {
    description = "Get your documents ready for gen AI";
    # Use the same GitHub organisation as `src` rather than the old DS4SD one.
    homepage = "https://github.com/docling-project/docling";
    changelog = "https://github.com/docling-project/docling/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ happysalada ];
    mainProgram = "docling";
  };
}