From 60e9e3589254dc410b548c873ba3d1774d78a718 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Sun, 13 Feb 2022 23:58:18 +0100 Subject: [PATCH] add basic tesseract cli support --- Ai/Ocr/Tesseract/TesseractOcr.php | 184 ++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 Ai/Ocr/Tesseract/TesseractOcr.php diff --git a/Ai/Ocr/Tesseract/TesseractOcr.php b/Ai/Ocr/Tesseract/TesseractOcr.php new file mode 100644 index 000000000..b1f6322e0 --- /dev/null +++ b/Ai/Ocr/Tesseract/TesseractOcr.php @@ -0,0 +1,184 @@ +path) . ' ' + . $cmd; + } else { + $cmd = \escapeshellarg(self::$bin) + . ' -C ' . \escapeshellarg($this->path) . ' ' + . $cmd; + } + + $pipes = []; + $desc = [ + 1 => ['pipe', 'w'], + 2 => ['pipe', 'w'], + ]; + + $resource = \proc_open($cmd, $desc, $pipes, $this->path, null); + + if ($resource === false) { + throw new \Exception(); + } + + $stdout = \stream_get_contents($pipes[1]); + $stderr = \stream_get_contents($pipes[2]); + + foreach ($pipes as $pipe) { + \fclose($pipe); + } + + $status = \proc_close($resource); + + if ($status == -1) { + throw new \Exception((string) $stderr); + } + + return $this->parseLines(\trim($stdout === false ? '' : $stdout)); + }
+
+    /**
+     * Split raw command output into cleaned, non-empty lines.
+     *
+     * Runs of whitespace inside a line are collapsed into a single space and
+     * surrounding whitespace is removed. Lines that are empty afterwards are dropped.
+     *
+     * @param string $lines Raw output of the executed command
+     *
+     * @return string[] Cleaned lines in their original order
+     *
+     * @since 1.0.0
+     */
+    private function parseLines(string $lines) : array
+    {
+        $lineArray = \preg_split('/\r\n|\n|\r/', $lines);
+
+        if ($lineArray === false) {
+            return [];
+        }
+
+        $parsed = [];
+
+        foreach ($lineArray as $line) {
+            // Collapse first, trim afterwards: trimming only spaces before the
+            // collapse let tab-only lines survive as a single space.
+            // preg_replace() yields null on a regex error; the cast turns that
+            // into an empty string so the line is skipped below.
+            $temp = \trim((string) \preg_replace('/\s+/', ' ', $line), ' ');
+
+            // Compare against '' explicitly: empty() would also discard a line
+            // that consists only of "0", which is valid output.
+            if ($temp !== '') {
+                $parsed[] = $temp;
+            }
+        }
+
+        return $parsed;
+    }
+
+    /**
+     * Parse image
+     *
+     * @param string $image     Image path
+     * @param array  $languages Languages to use
+     * @param int    $psm       Page segmentation mode (0 - 13)
+     *                             0 Orientation and script detection (OSD) only.
+     *                             1 Automatic page segmentation with OSD.
+     *                             2 Automatic page segmentation, but no OSD, or OCR.
+     *                             3 Fully automatic page segmentation, but no OSD. (Default)
+     *                             4 Assume a single column of text of variable sizes.
+     *                             5 Assume a single uniform block of vertically aligned text.
+     *                             6 Assume a single uniform block of text.
+     *                             7 Treat the image as a single text line.
+     *                             8 Treat the image as a single word.
+     *                             9 Treat the image as a single word in a circle.
+     *                             10 Treat the image as a single character.
+     *                             11 Sparse text. Find as much text as possible in no particular order.
+     *                             12 Sparse text with OSD.
+     *                             13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
+     * @param int    $oem       OCR engine modes
+     *                             0 Legacy engine only.
+     *                             1 Neural nets LSTM engine only.
+     *                             2 Legacy + LSTM engines.
+     *                             3 Default, based on what is available
+     *
+     * @return string Recognized text, or an empty string if tesseract produced no readable text file
+     *
+     * @throws \Exception If no temporary output file can be created or run() reports a failure
+     *
+     * @since 1.0.0
+     */
+    public function parseImage(string $image, array $languages = ['eng'], int $psm = 3, int $oem = 3) : string
+    {
+        $temp = \tempnam(\sys_get_temp_dir(), 'ocr_');
+
+        if ($temp === false) {
+            throw new \Exception('Unable to create a temporary output file.');
+        }
+
+        // Tesseract treats its second argument as an output *base* and appends
+        // the extension itself, so the recognized text ends up in "<base>.txt",
+        // not in the placeholder file created by tempnam().
+        $output = $temp . '.txt';
+
+        // run() passes the command string to proc_open(), i.e. through a shell,
+        // so every caller-supplied value is quoted individually.
+        $cmd = \escapeshellarg($image) . ' '
+            . \escapeshellarg($temp) . ' '
+            . '--psm ' . $psm . ' '
+            . '--oem ' . $oem;
+
+        // An empty list would produce a dangling "-l"; omit the option instead
+        // and let tesseract fall back to its default language.
+        if ($languages !== []) {
+            $cmd .= ' -l ' . \escapeshellarg(\implode('+', $languages));
+        }
+
+        try {
+            $this->run($cmd);
+
+            $parsed = \is_file($output) ? \file_get_contents($output) : false;
+        } finally {
+            // Clean up both the tempnam() placeholder and the real result file,
+            // even when run() throws.
+            foreach ([$temp, $output] as $file) {
+                if (\is_file($file)) {
+                    \unlink($file);
+                }
+            }
+        }
+
+        return $parsed === false ? '' : $parsed;
+    }
+}