at master 11 kB view raw
1commit cfc05af26b571e9ca09e9c709c0fb8934e9e46dd 2Author: Guillaume Girol <symphorien+git@xlumurb.eu> 3Date: Sat Aug 20 17:48:01 2022 +0200 4 5 Fix finding tesseract 6 7diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py 8index 1edec8c..434a336 100644 9--- a/src/pyocr/libtesseract/tesseract_raw.py 10+++ b/src/pyocr/libtesseract/tesseract_raw.py 11@@ -2,7 +2,6 @@ import ctypes 12 import locale 13 import logging 14 import os 15-import sys 16 17 from ..error import TesseractError 18 19@@ -10,51 +9,16 @@ from ..error import TesseractError 20 logger = logging.getLogger(__name__) 21 22 TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None) 23-libnames = [] 24+if TESSDATA_PREFIX is None: 25+ TESSDATA_PREFIX = '@tesseract@/share/tessdata' 26+ os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX 27+ 28+ 29 # 70 is the minimum credible dpi for tesseract and force it to compute an 30 # estimate of the image dpi 31 DPI_DEFAULT = 70 32 33- 34-if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'): 35- # Pyinstaller integration 36- libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")] 37- libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")] 38- tessdata = os.path.join(sys._MEIPASS, "data") 39- if not os.path.exists(os.path.join(tessdata, "tessdata")): 40- logger.warning( 41- "Running from container, but no tessdata ({}) found !".format( 42- tessdata 43- ) 44- ) 45- else: 46- TESSDATA_PREFIX = os.path.join(tessdata, "tessdata") 47- 48- 49-if sys.platform[:3] == "win": # pragma: no cover 50- libnames += [ 51- # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on 52- # Windows ? 53- "../vs2010/DLL_Release/libtesseract302.dll", 54- # prefer the most recent first 55- "libtesseract305.dll", 56- "libtesseract304.dll", 57- "libtesseract303.dll", 58- "libtesseract302.dll", 59- "libtesseract400.dll", # Tesseract 4 is still in alpha stage 60- "libtesseract.dll", 61- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-4.dll", 62- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-3.dll", 63- ] 64-else: 65- libnames += [ 66- "libtesseract.so.5", 67- "libtesseract.so.4", 68- "libtesseract.so.3", 69- "libtesseract.5.dylib", 70- "libtesseract.4.dylib", 71- ] 72- 73+libnames = [ "@tesseractLibraryLocation@" ] 74 75 g_libtesseract = None 76 77@@ -367,12 +331,12 @@ def init(lang=None): 78 try: 79 if lang: 80 lang = lang.encode("utf-8") 81- prefix = None 82- if TESSDATA_PREFIX: # pragma: no cover 83- prefix = TESSDATA_PREFIX.encode("utf-8") 84+ 85+ prefix = TESSDATA_PREFIX 86+ 87 g_libtesseract.TessBaseAPIInit3( 88 ctypes.c_void_p(handle), 89- ctypes.c_char_p(prefix), 90+ ctypes.c_char_p(prefix.encode('utf-8')), 91 ctypes.c_char_p(lang) 92 ) 93 g_libtesseract.TessBaseAPISetVariable( 94diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py 95index 0fe0d20..c1fdd27 100644 96--- a/src/pyocr/tesseract.py 97+++ b/src/pyocr/tesseract.py 98@@ -28,8 +28,7 @@ from .builders import DigitBuilder # backward compatibility 99 from .error import TesseractError # backward compatibility 100 from .util import digits_only 101 102-# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY 103-TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract' 104+TESSERACT_CMD = '@tesseract@/bin/tesseract' 105 106 TESSDATA_EXTENSION = ".traineddata" 107 108diff --git a/tests/test_libtesseract.py b/tests/test_libtesseract.py 109index cc31a50..890c02c 100644 110--- a/tests/test_libtesseract.py 111+++ b/tests/test_libtesseract.py 112@@ -167,7 +167,8 @@ class TestLibTesseractRaw(BaseTest): 113 args = libtess.TessBaseAPIInit3.call_args[0] 114 self.assertEqual(len(args), 3) 115 self.assertEqual(args[0].value, self.handle) 116- self.assertEqual(args[1].value, None) 117+ # we hardcode tesseract data, so we don't get None 118+ #self.assertEqual(args[1].value, None) 119 self.assertEqual(args[2].value, lang.encode() if lang else None) 120 121 self.assertEqual( 122@@ -203,7 +204,8 @@ class TestLibTesseractRaw(BaseTest): 123 args = libtess.TessBaseAPIInit3.call_args[0] 124 self.assertEqual(len(args), 3) 125 self.assertEqual(args[0].value, self.handle) 126- self.assertEqual(args[1].value, None) 127+ # we hardcode tesseract data, so we don't get None 128+ #self.assertEqual(args[1].value, None) 129 self.assertEqual(args[2].value, lang.encode() if lang else None) 130 131 self.assertEqual( 132diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py 133index 823818f..2ee5fb4 100644 134--- a/tests/test_tesseract.py 135+++ b/tests/test_tesseract.py 136@@ -37,7 +37,7 @@ class TestTesseract(BaseTest): 137 def test_available(self, which): 138 which.return_value = True 139 self.assertTrue(tesseract.is_available()) 140- which.assert_called_once_with("tesseract") 141+ which.assert_called_once_with("@tesseract@/bin/tesseract") 142 143 @patch("subprocess.Popen") 144 def test_version_error(self, popen): 145@@ -163,7 +163,7 @@ class TestTesseract(BaseTest): 146 for lang in ("eng", "fra", "jpn", "osd"): 147 self.assertIn(lang, langs) 148 popen.assert_called_once_with( 149- ["tesseract", "--list-langs"], 150+ ["@tesseract@/bin/tesseract", "--list-langs"], 151 startupinfo=None, creationflags=0, 152 stdout=subprocess.PIPE, stderr=subprocess.STDOUT 153 ) 154@@ -178,7 +178,7 @@ class TestTesseract(BaseTest): 155 self.assertEqual(te.exception.status, 1) 156 self.assertEqual("unable to get languages", te.exception.message) 157 popen.assert_called_once_with( 158- ["tesseract", "--list-langs"], 159+ ["@tesseract@/bin/tesseract", "--list-langs"], 160 startupinfo=None, creationflags=0, 161 stdout=subprocess.PIPE, stderr=subprocess.STDOUT 162 ) 163@@ -255,7 +255,7 @@ class TestTesseract(BaseTest): 164 self.assertEqual(status, 0) 165 self.assertEqual(error, message) 166 popen.assert_called_once_with( 167- ["tesseract", "input.bmp", "output"], 168+ ["@tesseract@/bin/tesseract", "input.bmp", "output"], 169 cwd=tmpdir, 170 startupinfo=None, 171 creationflags=0, 172@@ -278,7 +278,7 @@ class TestTesseract(BaseTest): 173 self.assertEqual(status, 0) 174 self.assertEqual(error, message) 175 popen.assert_called_with( 176- ["tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"], 177+ ["@tesseract@/bin/tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"], 178 cwd=tmpdir, 179 startupinfo=None, 180 creationflags=0, 181@@ -309,7 +309,7 @@ class TestTesseract(BaseTest): 182 self.assertEqual(result["angle"], 90) 183 self.assertEqual(result["confidence"], 9.30) 184 popen.assert_called_once_with( 185- ["tesseract", "input.bmp", "stdout", "--psm", "0"], 186+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], 187 stdin=subprocess.PIPE, 188 shell=False, 189 startupinfo=None, 190@@ -345,7 +345,7 @@ class TestTesseract(BaseTest): 191 self.assertEqual(result["angle"], 90) 192 self.assertEqual(result["confidence"], 9.30) 193 popen.assert_called_once_with( 194- ["tesseract", "input.bmp", "stdout", "--psm", "0"], 195+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], 196 stdin=subprocess.PIPE, 197 shell=False, 198 startupinfo=None, 199@@ -378,7 +378,7 @@ class TestTesseract(BaseTest): 200 self.assertEqual(result["angle"], 90) 201 self.assertEqual(result["confidence"], 9.30) 202 popen.assert_called_once_with( 203- ["tesseract", "input.bmp", "stdout", 204+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", 205 "--psm", "0", "-l", "osd"], 206 stdin=subprocess.PIPE, 207 shell=False, 208@@ -406,7 +406,7 @@ class TestTesseract(BaseTest): 209 with self.assertRaises(tesseract.TesseractError) as te: 210 tesseract.detect_orientation(self.image) 211 popen.assert_called_once_with( 212- ["tesseract", "input.bmp", "stdout", "--psm", "0"], 213+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], 214 stdin=subprocess.PIPE, 215 shell=False, 216 startupinfo=None, 217@@ -440,7 +440,7 @@ class TestTesseract(BaseTest): 218 with self.assertRaises(tesseract.TesseractError) as te: 219 tesseract.detect_orientation(self.image) 220 popen.assert_called_once_with( 221- ["tesseract", "input.bmp", "stdout", "--psm", "0"], 222+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], 223 stdin=subprocess.PIPE, 224 shell=False, 225 startupinfo=None, 226@@ -474,7 +474,7 @@ class TestTesseract(BaseTest): 227 self.assertEqual(result["angle"], 90) 228 self.assertEqual(result["confidence"], 9.30) 229 popen.assert_called_once_with( 230- ["tesseract", "input.bmp", "stdout", "-psm", "0"], 231+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], 232 stdin=subprocess.PIPE, 233 shell=False, 234 startupinfo=None, 235@@ -507,7 +507,7 @@ class TestTesseract(BaseTest): 236 self.assertEqual(result["angle"], 90) 237 self.assertEqual(result["confidence"], 9.30) 238 popen.assert_called_once_with( 239- ["tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"], 240+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"], 241 stdin=subprocess.PIPE, 242 shell=False, 243 startupinfo=None, 244@@ -534,7 +534,7 @@ class TestTesseract(BaseTest): 245 with self.assertRaises(tesseract.TesseractError) as te: 246 tesseract.detect_orientation(self.image) 247 popen.assert_called_once_with( 248- ["tesseract", "input.bmp", "stdout", "-psm", "0"], 249+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], 250 stdin=subprocess.PIPE, 251 shell=False, 252 startupinfo=None, 253@@ -568,7 +568,7 @@ class TestTesseract(BaseTest): 254 with self.assertRaises(tesseract.TesseractError) as te: 255 tesseract.detect_orientation(self.image) 256 popen.assert_called_once_with( 257- ["tesseract", "input.bmp", "stdout", "-psm", "0"], 258+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], 259 stdin=subprocess.PIPE, 260 shell=False, 261 startupinfo=None,