mirror of
https://github.com/Karaka-Management/phpOMS.git
synced 2026-02-07 04:58:40 +00:00
187 lines
5.1 KiB
PHP
187 lines
5.1 KiB
PHP
<?php
|
|
/**
|
|
* Karaka
|
|
*
|
|
* PHP Version 8.0
|
|
*
|
|
* @package phpOMS\Ai\Ocr\Tesseract
|
|
* @copyright Dennis Eichhorn
|
|
* @license OMS License 1.0
|
|
* @version 1.0.0
|
|
* @link https://karaka.app
|
|
*/
|
|
declare(strict_types=1);
|
|
|
|
namespace phpOMS\Ai\Ocr\Tesseract;
|
|
|
|
use phpOMS\System\File\PathException;
|
|
|
|
/**
|
|
* Tesseract api
|
|
*
|
|
* @package phpOMS\Ai\Ocr\Tesseract
|
|
* @license OMS License 1.0
|
|
* @link https://karaka.app
|
|
* @since 1.0.0
|
|
*/
|
|
final class TesseractOcr
|
|
{
|
|
/**
|
|
* Tesseract path.
|
|
*
|
|
* @var string
|
|
* @since 1.0.0
|
|
*/
|
|
protected static string $bin = '/usr/bin/tesseract';
|
|
|
|
/**
|
|
* Set tesseract binary.
|
|
*
|
|
* @param string $path tesseract path
|
|
*
|
|
* @return void
|
|
*
|
|
* @throws PathException This exception is thrown if the binary path doesn't exist
|
|
*
|
|
* @since 1.0.0
|
|
*/
|
|
public static function setBin(string $path) : void
|
|
{
|
|
if (\realpath($path) === false) {
|
|
throw new PathException($path);
|
|
}
|
|
|
|
self::$bin = \realpath($path);
|
|
}
|
|
|
|
/**
|
|
* Run git command.
|
|
*
|
|
* @param string $cmd Command to run
|
|
*
|
|
* @return string[]
|
|
*
|
|
* @throws \Exception
|
|
*
|
|
* @since 1.0.0
|
|
*/
|
|
private function run(string $cmd) : array
|
|
{
|
|
if (\strtolower((string) \substr(\PHP_OS, 0, 3)) === 'win') {
|
|
$cmd = 'cd ' . \escapeshellarg(\dirname(self::$bin))
|
|
. ' && ' . \basename(self::$bin)
|
|
. ' '
|
|
. $cmd;
|
|
} else {
|
|
$cmd = \escapeshellarg(self::$bin)
|
|
. ' '
|
|
. $cmd;
|
|
}
|
|
|
|
$pipes = [];
|
|
$desc = [
|
|
1 => ['pipe', 'w'],
|
|
2 => ['pipe', 'w'],
|
|
];
|
|
|
|
$resource = \proc_open($cmd, $desc, $pipes, null, null);
|
|
|
|
if ($resource === false) {
|
|
throw new \Exception();
|
|
}
|
|
|
|
$stdout = \stream_get_contents($pipes[1]);
|
|
$stderr = \stream_get_contents($pipes[2]);
|
|
|
|
foreach ($pipes as $pipe) {
|
|
\fclose($pipe);
|
|
}
|
|
|
|
$status = \proc_close($resource);
|
|
|
|
if ($status == -1) {
|
|
throw new \Exception((string) $stderr);
|
|
}
|
|
|
|
return $this->parseLines(\trim($stdout === false ? '' : $stdout));
|
|
}
|
|
|
|
/**
|
|
* Parse lines.
|
|
*
|
|
* @param string $lines Result of git command
|
|
*
|
|
* @return string[]
|
|
*
|
|
* @since 1.0.0
|
|
*/
|
|
private function parseLines(string $lines) : array
|
|
{
|
|
$lineArray = \preg_split('/\r\n|\n|\r/', $lines);
|
|
$lines = [];
|
|
|
|
if ($lineArray === false) {
|
|
return $lines;
|
|
}
|
|
|
|
foreach ($lineArray as $line) {
|
|
$temp = \preg_replace('/\s+/', ' ', \trim($line, ' '));
|
|
|
|
if (!empty($temp)) {
|
|
$lines[] = $temp;
|
|
}
|
|
}
|
|
|
|
return $lines;
|
|
}
|
|
|
|
/**
|
|
* Prase image
|
|
*
|
|
* @param string $image Image path
|
|
* @param array $languages Languages to use
|
|
* @param int $psm Page segmentation mode (0 - 13)
|
|
* 0 Orientation and script detection (OSD) only.
|
|
* 1 Automatic page segmentation with OSD.
|
|
* 2 Automatic page segmentation, but no OSD, or OCR.
|
|
* 3 Fully automatic page segmentation, but no OSD. (Default)
|
|
* 4 Assume a single column of text of variable sizes.
|
|
* 5 Assume a single uniform block of vertically aligned text.
|
|
* 6 Assume a single uniform block of text.
|
|
* 7 Treat the image as a single text line.
|
|
* 8 Treat the image as a single word.
|
|
* 9 Treat the image as a single word in a circle.
|
|
* 10 Treat the image as a single character.
|
|
* 11 Sparse text. Find as much text as possible in no particular order.
|
|
* 12 Sparse text with OSD.
|
|
* 13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
|
|
* @param int $oem OCR engine modes
|
|
* 0 Legacy engine only.
|
|
* 1 Neural nets LSTM engine only.
|
|
* 2 Legacy + LSTM engines.
|
|
* 3 Default, based on what is available
|
|
*
|
|
* @return string
|
|
*
|
|
* @since 1.0.0
|
|
*/
|
|
public function parseImage(string $image, array $languages = ['eng'], int $psm = 3, int $oem = 3) : string
|
|
{
|
|
$this->run(
|
|
$image . ' '
|
|
. ($temp = \tempnam(\sys_get_temp_dir(), 'ocr_'))
|
|
. ' --psm ' . $psm
|
|
. ' --oem ' . $oem
|
|
. ' -l ' . \implode('+', $languages)
|
|
);
|
|
|
|
$parsed = \file_get_contents($temp . '.txt');
|
|
|
|
// @todo: auto flip image if x% of text are garbage words?
|
|
|
|
\unlink($temp);
|
|
|
|
return \trim($parsed);
|
|
}
|
|
}
|