# Package definition for MarkItDown, built as a Python package from the
# upstream GitHub repository.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  hatchling,
  beautifulsoup4,
  defusedxml,
  ffmpeg-headless,
  magika,
  mammoth,
  markdownify,
  numpy,
  openai,
  openpyxl,
  pandas,
  pathvalidate,
  pdfminer-six,
  puremagic,
  pydub,
  python-pptx,
  requests,
  speechrecognition,
  youtube-transcript-api,
  olefile,
  xlrd,
  lxml,
  pytestCheckHook,
  gitUpdater,
}:

buildPythonPackage rec {
  pname = "markitdown";
  version = "0.1.3";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "microsoft";
    repo = "markitdown";
    # Upstream release tags carry a "v" prefix; keep in sync with the
    # `rev-prefix` passed to the updater below.
    tag = "v${version}";
    hash = "sha256-bHnJsv4ln1W0lVbWwLmCzQ15KOGJZ9gF2yx4TDuBqBI=";
  };

  # Build from the `packages/markitdown` subdirectory of the fetched source
  # rather than from the repository root.
  sourceRoot = "${src.name}/packages/markitdown";

  build-system = [ hatchling ];

  dependencies = [
    beautifulsoup4
    defusedxml
    # NOTE(review): ffmpeg-headless is presumably needed by the audio stack
    # (pydub / speechrecognition) at runtime -- confirm.
    ffmpeg-headless
    lxml
    magika
    mammoth
    markdownify
    numpy
    olefile
    openai
    openpyxl
    pandas
    pathvalidate
    pdfminer-six
    puremagic
    pydub
    python-pptx
    requests
    speechrecognition
    xlrd
    youtube-transcript-api
  ];

  pythonImportsCheck = [ "markitdown" ];

  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # Require network access
    "test_markitdown_remote"
    "test_module_vectors"
    "test_cli_vectors"
    "test_module_misc"
  ];

  # The attribute consumed by the nixpkgs update tooling is `updateScript`
  # (singular). `rev-prefix` strips the leading "v" from upstream tags so the
  # updater writes a bare version number into `version`.
  passthru.updateScript = gitUpdater { rev-prefix = "v"; };

  meta = {
    description = "Python tool for converting files and office documents to Markdown";
    homepage = "https://github.com/microsoft/markitdown";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ ];
  };
}