diff --git a/Utils/Parser/Document/DocumentParser.php b/Utils/Parser/Document/DocumentParser.php
index 6fcf2d295..1a63ebe9f 100755
--- a/Utils/Parser/Document/DocumentParser.php
+++ b/Utils/Parser/Document/DocumentParser.php
@@ -58,6 +58,26 @@ final class DocumentParser
             $writer = new DocumentWriter($doc);
 
             return $writer->toPdfString();
+        } elseif ($output === 'txt') {
+            $writer = new HTML($doc);
+            $html = $writer->getContent();
+
+            $doc = new \DOMDocument();
+            $html = \preg_replace(
+                ['~~', '~~'],
+                ['', ''],
+                $html
+            );
+
+            // Parse the HTML generated above rather than the file at $path, which is not this markup
+            if ($html === null || $html === '' || !$doc->loadHTML($html)) {
+                return '';
+            }
+
+            $body = $doc->getElementsByTagName('body');
+            $node = $body->item(0);
+
+            return empty($node->textContent) ? '' : $node->textContent;
         }
 
         return '';
diff --git a/Utils/Parser/Document/DocumentWriter.php b/Utils/Parser/Document/DocumentWriter.php
index 61a29a1aa..b3af2ad5d 100755
--- a/Utils/Parser/Document/DocumentWriter.php
+++ b/Utils/Parser/Document/DocumentWriter.php
@@ -14,8 +14,6 @@ declare(strict_types=1);
 
 namespace phpOMS\Utils\Parser\Document;
 
-use PhpOffice\PhpWord\PhpWord;
-use PhpOffice\PhpWord\Settings;
 use PhpOffice\PhpWord\Writer\PDF\AbstractRenderer;
 use PhpOffice\PhpWord\Writer\WriterInterface;
 
diff --git a/Utils/Parser/Html/HtmlParser.php b/Utils/Parser/Html/HtmlParser.php
new file mode 100644
index 000000000..934ca6efb
--- /dev/null
+++ b/Utils/Parser/Html/HtmlParser.php
@@ -0,0 +1,84 @@
+~', '~~'],
+            ['', ''],
+            $html
+        );
+
+        $doc->loadHTMLFile($path);
+        $content = '';
+
+        if (empty($xpath)) {
+            $body = $doc->getElementsByTagName('body');
+            $node = $body->item(0);
+
+            $content = empty($node->textContent) ? '' : $node->textContent;
+        } else {
+            $xNode = new \DOMXpath($doc);
+            $elements = $xNode->query($xpath);
+
+            if ($elements === false) {
+                return $content;
+            }
+
+            foreach ($elements as $element) {
+                $nodes = $element->childNodes;
+
+                foreach ($nodes as $node) {
+                    $content .= $node->textContent . "\n";
+                }
+            }
+        }
+
+        return $content;
+    }
+}
diff --git a/Utils/Parser/Pdf/PdfParser.php b/Utils/Parser/Pdf/PdfParser.php
index 8b38d8941..6c20399af 100755
--- a/Utils/Parser/Pdf/PdfParser.php
+++ b/Utils/Parser/Pdf/PdfParser.php
@@ -54,6 +54,21 @@ final class PdfParser
     {
     }
 
+    /**
+     * Pdf to string
+     *
+     * @param string $path   Path
+     * @param string $output Output format (accepted but not used here; the result of pdf2text() is always returned)
+     *
+     * @return string
+     *
+     * @since 1.0.0
+     */
+    public static function parsePdf(string $path, string $output = 'html') : string
+    {
+        return self::pdf2text($path);
+    }
+
     /**
      * Pdf to text
      *
@@ -73,6 +88,8 @@ final class PdfParser
             return '';
         }
 
+        // Try to read pdf directly
+        // Important: not all PDFs are searchable, some behave like an image
         if (\is_file(self::$pdftotext)) {
             try {
                 SystemUtils::runProc(
@@ -98,6 +115,7 @@ final class PdfParser
             return $text;
         }
 
+        // Couldn't read text from pdf -> transform to image and run OCR on image
         $out = \tempnam($tmpDir, 'oms_pdf_');
         if ($out === false) {
             return '';
diff --git a/Utils/Parser/Presentation/PresentationParser.php b/Utils/Parser/Presentation/PresentationParser.php
index d68e53f21..6bc0ed368 100755
--- a/Utils/Parser/Presentation/PresentationParser.php
+++ b/Utils/Parser/Presentation/PresentationParser.php
@@ -55,6 +55,27 @@ final class PresentationParser
             $oTree = new PresentationWriter($presentation);
 
             return $oTree->renderHtml();
+        } elseif ($output === 'txt') {
+            $presentation = IOFactory::load($path);
+            $oTree = new PresentationWriter($presentation);
+            $html = $oTree->renderHtml();
+
+            $doc = new \DOMDocument();
+            $html = \preg_replace(
+                ['~~', '~~'],
+                ['', ''],
+                $html
+            );
+
+            // Parse the HTML generated above rather than the file at $path, which is not this markup
+            if ($html === null || $html === '' || !$doc->loadHTML($html)) {
+                return '';
+            }
+
+            $body = $doc->getElementsByTagName('body');
+            $node = $body->item(0);
+
+            return empty($node->textContent) ? '' : $node->textContent;
         }
 
         return '';
diff --git a/Utils/Parser/Spreadsheet/SpreadsheetParser.php b/Utils/Parser/Spreadsheet/SpreadsheetParser.php
index b5e07c63d..62ae0664d 100755
--- a/Utils/Parser/Spreadsheet/SpreadsheetParser.php
+++ b/Utils/Parser/Spreadsheet/SpreadsheetParser.php
@@ -77,6 +77,29 @@ final class SpreadsheetParser
             $writer = IOFactory::createWriter($spreadsheet, 'custom');
 
             return $writer->generateHtmlAll();
+        } elseif ($output === 'txt') {
+            IOFactory::registerWriter('custom', \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter::class);
+
+            /** @var \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter $writer */
+            $writer = IOFactory::createWriter($spreadsheet, 'custom');
+            $html = $writer->generateHtmlAll();
+
+            $doc = new \DOMDocument();
+            $html = \preg_replace(
+                ['~~', '~~'],
+                ['', ''],
+                $html
+            );
+
+            // Parse the HTML generated above rather than the file at $path, which is not this markup
+            if ($html === null || $html === '' || !$doc->loadHTML($html)) {
+                return '';
+            }
+
+            $body = $doc->getElementsByTagName('body');
+            $node = $body->item(0);
+
+            return empty($node->textContent) ? '' : $node->textContent;
         }
 
         return '';