# Package definition for PaddleOCR, built from the upstream GitHub sources
# with `buildPythonPackage`. All inputs are supplied by the caller.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  setuptools-scm,
  attrdict,
  beautifulsoup4,
  cython,
  fire,
  fonttools,
  lmdb,
  lxml,
  numpy,
  opencv-python,
  openpyxl,
  pdf2docx,
  pillow,
  pyclipper,
  pymupdf,
  python-docx,
  rapidfuzz,
  scikit-image,
  shapely,
  tqdm,
  paddlepaddle,
  lanms-neo,
  polygon3,
  paddlex,
  pyyaml,
}:

buildPythonPackage rec {
  pname = "paddleocr";
  version = "3.2.0";
  pyproject = true;

  # Sources are fetched from the upstream release tag `v<version>`.
  src = fetchFromGitHub {
    owner = "PaddlePaddle";
    repo = "PaddleOCR";
    tag = "v${version}";
    hash = "sha256-lrFwrbDzOYFzZEz+P0roTtQMxeWBCDZuEVviyUzM3M4=";
  };

  patches = [
    # The `ppocr.data.imaug` module re-exports the `IaaAugment` and
    # `CopyPaste` classes. These classes depend on the `imgaug` package, which
    # is unmaintained and has been removed from nixpkgs.
    #
    # The image OCR feature of PaddleOCR doesn't use these classes though, so
    # it keeps working even after stripping the `IaaAugment` and `CopyPaste`
    # exports. It probably breaks some of the OCR model creation tooling that
    # PaddleOCR provides, however.
    ./remove-import-imaug.patch
  ];

  # Drop the exact `==72.1.0` version pin from pyproject.toml.
  # NOTE(review): presumably this is the build-time setuptools pin; confirm
  # against upstream's pyproject.toml when bumping the version.
  postPatch = ''
    substituteInPlace pyproject.toml \
      --replace-fail "==72.1.0" ""
  '';

  build-system = [
    setuptools
    setuptools-scm
  ];

  # Relax every dependency version constraint: trying to relax only pymupdf
  # makes the whole build fail.
  pythonRelaxDeps = true;
  # Requirements removed from the package metadata. `imgaug` goes together
  # with the patch above.
  pythonRemoveDeps = [
    "imgaug"
    "visualdl"
    "opencv-contrib-python"
  ];

  dependencies = [
    attrdict
    beautifulsoup4
    cython
    fire
    fonttools
    lmdb
    lxml
    numpy
    opencv-python
    openpyxl
    pdf2docx
    pillow
    pyclipper
    pymupdf
    python-docx
    rapidfuzz
    scikit-image
    shapely
    tqdm
    paddlepaddle
    lanms-neo
    polygon3
    paddlex
    pyyaml
  ];

  # TODO: The tests depend, among possibly other things, on `cudatoolkit`.
  # But cudatoolkit fails to install.
  # Note that the sketch below uses `with pkgs;`, and `pkgs` is not an
  # argument of this function; `which` and `cudatoolkit` would have to be
  # added to the argument list before it can be enabled.
  # preCheck = "export HOME=$TMPDIR";
  # nativeCheckInputs = with pkgs; [ which cudatoolkit ];
  doCheck = false;

  meta = {
    homepage = "https://github.com/PaddlePaddle/PaddleOCR";
    license = lib.licenses.asl20;
    description = "Multilingual OCR toolkits based on PaddlePaddle";
    longDescription = ''
      PaddleOCR aims to create multilingual, awesome, leading, and practical OCR
      tools that help users train better models and apply them into practice.
    '';
    changelog = "https://github.com/PaddlePaddle/PaddleOCR/releases/tag/${src.tag}";
    maintainers = with lib.maintainers; [ happysalada ];
    platforms = [
      "x86_64-linux"
      "x86_64-darwin"
      "aarch64-darwin"
    ];
  };
}