improving external prog. calls

This commit is contained in:
Dennis Eichhorn 2022-11-25 19:26:04 +01:00
parent 9745efa7d5
commit e9618de67d

View File

@ -28,6 +28,22 @@ use phpOMS\Utils\StringUtils;
*/ */
class PdfParser class PdfParser
{ {
/**
 * Path to the pdftotext executable.
 *
 * Declared public and static so the location can be overridden from outside
 * the class. The class checks this path with is_file() before running the
 * program.
 *
 * @var string
 * @since 1.0.0
 */
public static $pdftotext = '/usr/bin/pdftotext';
/**
 * Path to the pdftoppm executable.
 *
 * Declared public and static so the location can be overridden from outside
 * the class. The class checks this path with is_file() before running the
 * program.
 *
 * @var string
 * @since 1.0.0
 */
public static $pdftoppm = '/usr/bin/pdftoppm';
/** /**
* Pdf to text * Pdf to text
* *
@ -37,7 +53,7 @@ class PdfParser
* *
* @since 1.0.0 * @since 1.0.0
*/ */
public static function pdf2text(string $path) : string public static function pdf2text(string $path, string $optimizer = '') : string
{ {
$text = ''; $text = '';
$tmpDir = \sys_get_temp_dir(); $tmpDir = \sys_get_temp_dir();
@ -47,11 +63,13 @@ class PdfParser
return ''; return '';
} }
SystemUtils::runProc( if (\is_file(self::$pdftotext)) {
'/usr/bin/pdftotext', '-layout ' SystemUtils::runProc(
. \escapeshellarg($path) . ' ' self::$pdftotext, '-layout '
. \escapeshellarg($out) . \escapeshellarg($path) . ' '
); . \escapeshellarg($out)
);
}
$text = \file_get_contents($out); $text = \file_get_contents($out);
\unlink($out); \unlink($out);
@ -66,12 +84,14 @@ class PdfParser
return ''; return '';
} }
SystemUtils::runProc( if (\is_file(self::$pdftoppm)) {
'/usr/bin/pdftoppm', SystemUtils::runProc(
'-jpeg -r 300 ' self::$pdftoppm,
. \escapeshellarg($path) . ' ' '-jpeg -r 300 '
. \escapeshellarg($out) . \escapeshellarg($path) . ' '
); . \escapeshellarg($out)
);
}
$files = \glob($out . '*'); $files = \glob($out . '*');
if ($files === false) { if ($files === false) {
@ -91,11 +111,13 @@ class PdfParser
Skew::autoRotate($file, $file, 10); Skew::autoRotate($file, $file, 10);
*/ */
SystemUtils::runProc( if (!empty($optimizer) && \is_file($optimizer)) {
__DIR__ . '/../../../cOMS/Tools/InvoicePreprocessing/App', SystemUtils::runProc(
\escapeshellarg($file) . ' ' $optimizer,
\escapeshellarg($file) . ' '
. \escapeshellarg($file) . \escapeshellarg($file)
); );
}
$ocr = new TesseractOcr(); $ocr = new TesseractOcr();
$text = $ocr->parseImage($file); $text = $ocr->parseImage($file);