# Arguments of the package function. Everything listed here is referenced by
# name in the package body below, except where noted. Commented-out names are
# dependencies that are currently not passed in.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # base dependencies: networking and async
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # base dependencies: parsing and processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  html5lib,
  idna,
  joblib,
  # jsonpath-python,
  nltk,
  olefile,
  orderly-set,
  python-dateutil,
  python-iso639,
  python-magic,
  python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # base dependencies: data handling
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # base dependencies: system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support
  markdown,
  pdfminer-six,
  pdfplumber,
  # pi-heif,
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  unstructured-client,
  # unstructured-pytesseract,

  # optional dependencies: csv
  pytz,
  tzdata,

  # optional dependencies: markdown
  importlib-metadata,
  zipp,

  # optional dependencies: pdf
  opencv-python,
  # accepted but not referenced: its only use in the body is commented out
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr,

  # optional dependencies: pptx
  lxml,
  pillow,
  python-pptx,
  xlsxwriter,

  # optional dependencies: xlsx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,

  # optional dependencies: huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,

  # optional dependencies: local-inference
  unstructured-inference,

  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # label-studio-sdk,
  mypy,
  pytest-cov-stub,
  pytest-mock,
  vcrpy,
  grpcio,
}:
# Package definition for the `unstructured` Python library, built from the
# tagged GitHub source with setuptools as the build backend.
buildPythonPackage rec {
  pname = "unstructured";
  version = "0.18.15";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    # The release tag is the bare version string.
    tag = version;
    hash = "sha256-rzspozQQ+WrS3cKAGe9O7clAIDo4P/6PdZzCXIRdNn8=";
  };

  build-system = [ setuptools ];

  dependencies = [
    # Base dependencies
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    python-iso639
    python-magic
    python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  # `rec` lets `all-docs` be assembled from the per-format lists defined below.
  optional-dependencies = rec {
    # Union of the per-format extras; `paddleocr` and `huggingface` are not part of it.
    all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [ pypandoc ];
    # Not named `markdown`: inside this `rec` set that name would shadow the
    # `markdown` package listed below and make the list refer to itself.
    req-markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    org = [
      pypandoc
    ];
    paddleocr = [
      opencv-python
      # paddlepaddle # 3.12 not supported for now
      pdf2image
      # unstructured-paddleocr
    ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
      python-pptx
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  pythonImportsCheck = [ "unstructured" ];

  # Tests are disabled because they try to download punkt from nltk.
  # TODO: figure out how to make it available so the tests can be enabled.
  doCheck = false;

  # Kept in place for when the tests above are enabled.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov-stub
    pytest-mock
    vcrpy
    grpcio
  ];

  # `lib` is referenced explicitly instead of wrapping the whole set in
  # `with lib;`, so no unrelated `lib` names leak into scope here.
  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    # NOTE(review): confirm that this release still installs an
    # `unstructured-ingest` executable; nothing in this file shows it.
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
  };
}