# Package definition for the `unstructured` Python library.
#
# The source is fetched from the upstream GitHub repository at the tag matching
# `version` and built as a pyproject package with setuptools. Arguments are
# grouped by the role they play in the dependency lists further down; entries
# that are commented out are referenced by name only and are not passed in.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # core networking and async dependencies
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # core parsing and processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  html5lib,
  idna,
  joblib,
  # jsonpath-python,
  nltk,
  olefile,
  orderly-set,
  python-dateutil,
  python-iso639,
  python-magic,
  python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # core data handling
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # core system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support
  markdown,
  pdfminer-six,
  pdfplumber,
  # pi-heif,
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  unstructured-client,
  # unstructured-pytesseract,

  # optional dependencies
  # csv
  pytz,
  tzdata,
  # markdown
  importlib-metadata,
  zipp,
  # pdf
  opencv-python,
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr,
  # pptx
  lxml,
  pillow,
  python-pptx,
  xlsxwriter,
  # xlsx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,
  # huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  # local-inference
  unstructured-inference,

  # test dependencies
  pytestCheckHook,
  black,
  coverage,
  click,
  freezegun,
  # , label-studio-sdk
  mypy,
  pytest-cov-stub,
  pytest-mock,
  vcrpy,
  grpcio,
}:

buildPythonPackage rec {
  pname = "unstructured";
  version = "0.18.15";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    tag = version;
    hash = "sha256-rzspozQQ+WrS3cKAGe9O7clAIDo4P/6PdZzCXIRdNn8=";
  };

  build-system = [ setuptools ];

  dependencies = [
    # Base dependencies
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    python-iso639
    python-magic
    python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  # `rec` lets `all-docs` be assembled from the sibling extras defined below.
  # The markdown extra is keyed `req-markdown`; inside this `rec` set a key
  # named `markdown` would shadow the `markdown` package argument.
  optional-dependencies = rec {
    all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [ pypandoc ];
    req-markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    org = [
      pypandoc
    ];
    paddleocr = [
      opencv-python
      # paddlepaddle # 3.12 not supported for now
      pdf2image
      # unstructured-paddleocr
    ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
      python-pptx
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  pythonImportsCheck = [ "unstructured" ];

  # The tests try to download punkt from nltk.
  # Figure out how to make it available in order to enable the tests.
  doCheck = false;

  # Kept so the test inputs are already listed once the tests can be enabled.
  nativeCheckInputs = [
    pytestCheckHook
    black
    coverage
    click
    freezegun
    mypy
    pytest-cov-stub
    pytest-mock
    vcrpy
    grpcio
  ];

  meta = {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    # NOTE(review): confirm that this version still installs an
    # `unstructured-ingest` executable; nothing in this file demonstrates it.
    mainProgram = "unstructured-ingest";
    homepage = "https://github.com/Unstructured-IO/unstructured";
    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
  };
}