From e9618de67d533997d2f6554472568dc00e579493 Mon Sep 17 00:00:00 2001 From: Dennis Eichhorn Date: Fri, 25 Nov 2022 19:26:04 +0100 Subject: [PATCH] improving external prog. calls --- Utils/Parser/Pdf/PdfParser.php | 54 ++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/Utils/Parser/Pdf/PdfParser.php b/Utils/Parser/Pdf/PdfParser.php index 753903770..11cf98d92 100755 --- a/Utils/Parser/Pdf/PdfParser.php +++ b/Utils/Parser/Pdf/PdfParser.php @@ -28,6 +28,22 @@ use phpOMS\Utils\StringUtils; */ class PdfParser { + /** + * PDFToText path. + * + * @var string + * @since 1.0.0 + */ + public static $pdftotext = '/usr/bin/pdftotext'; + + /** + * PDFToPPM path. + * + * @var string + * @since 1.0.0 + */ + public static $pdftoppm = '/usr/bin/pdftoppm'; + /** * Pdf to text * @@ -37,7 +53,7 @@ class PdfParser * * @since 1.0.0 */ - public static function pdf2text(string $path) : string + public static function pdf2text(string $path, string $optimizer = '') : string { $text = ''; $tmpDir = \sys_get_temp_dir(); @@ -47,11 +63,13 @@ class PdfParser return ''; } - SystemUtils::runProc( - '/usr/bin/pdftotext', '-layout ' - . \escapeshellarg($path) . ' ' - . \escapeshellarg($out) - ); + if (\is_file(self::$pdftotext)) { + SystemUtils::runProc( + self::$pdftotext, '-layout ' + . \escapeshellarg($path) . ' ' + . \escapeshellarg($out) + ); + } $text = \file_get_contents($out); \unlink($out); @@ -66,12 +84,14 @@ class PdfParser return ''; } - SystemUtils::runProc( - '/usr/bin/pdftoppm', - '-jpeg -r 300 ' - . \escapeshellarg($path) . ' ' - . \escapeshellarg($out) - ); + if (\is_file(self::$pdftoppm)) { + SystemUtils::runProc( + self::$pdftoppm, + '-jpeg -r 300 ' + . \escapeshellarg($path) . ' ' + . \escapeshellarg($out) + ); + } $files = \glob($out . '*'); if ($files === false) { @@ -91,11 +111,13 @@ class PdfParser Skew::autoRotate($file, $file, 10); */ - SystemUtils::runProc( - __DIR__ . 
'/../../../cOMS/Tools/InvoicePreprocessing/App', - \escapeshellarg($file) . ' ' + if (!empty($optimizer) && \is_file($optimizer)) { + SystemUtils::runProc( + $optimizer, + \escapeshellarg($file) . ' ' . \escapeshellarg($file) - ); + ); + } $ocr = new TesseractOcr(); $text = $ocr->parseImage($file);