improve parsers

2026-03-07 08:28:41 +00:00 · 2024-01-30 21:00:59 +00:00 · 2024-01-30 21:00:59 +00:00 · 84d73bf821
commit 84d73bf821
parent f14f65ffd3
6 changed files with 156 additions and 2 deletions
--- a/Utils/Parser/Document/DocumentParser.php
+++ b/Utils/Parser/Document/DocumentParser.php
@ -58,6 +58,23 @@ final class DocumentParser
            $writer = new DocumentWriter($doc);

            return $writer->toPdfString();
+        } elseif ($output === 'txt') {
+            // Render the document to html first, the plain text is taken from that markup
+            $writer = new HTML($doc);
+            $html   = $writer->getContent();
+
+            // "s" lets "." span line breaks, "i" also matches upper case tags
+            $html = \preg_replace(
+                ['~<style.*?</style>~is', '~<script.*?</script>~is'],
+                ['', ''],
+                $html
+            );
+
+            // preg_replace() returns null on failure and loadHTML() does not accept an empty string
+            if (empty($html)) {
+                return '';
+            }
+
+            // Parse the markup generated above (not the file at $path)
+            $dom = new \DOMDocument();
+            $dom->loadHTML($html);
+
+            $body = $dom->getElementsByTagName('body');
+            $node = $body->item(0);
+
+            return empty($node->textContent) ? '' : $node->textContent;
        }

        return '';
--- a/Utils/Parser/Document/DocumentWriter.php
+++ b/Utils/Parser/Document/DocumentWriter.php
@ -14,8 +14,6 @@ declare(strict_types=1);

 namespace phpOMS\Utils\Parser\Document;

-use PhpOffice\PhpWord\PhpWord;
-use PhpOffice\PhpWord\Settings;
 use PhpOffice\PhpWord\Writer\PDF\AbstractRenderer;
 use PhpOffice\PhpWord\Writer\WriterInterface;

--- a/Utils/Parser/Html/HtmlParser.php
+++ b/Utils/Parser/Html/HtmlParser.php
@ -0,0 +1,84 @@
+<?php
+/**
+ * Jingga
+ *
+ * PHP Version 8.1
+ *
+ * @package   phpOMS\Utils\Parser\Html
+ * @copyright Dennis Eichhorn
+ * @license   OMS License 2.0
+ * @version   1.0.0
+ * @link      https://jingga.app
+ */
+declare(strict_types=1);
+
+namespace phpOMS\Utils\Parser\Html;
+
+/**
+ * Html parser class.
+ *
+ * @package phpOMS\Utils\Parser\Html
+ * @license OMS License 2.0
+ * @link    https://jingga.app
+ * @since   1.0.0
+ */
+final class HtmlParser
+{
+    /**
+     * Constructor.
+     *
+     * @since 1.0.0
+     * @codeCoverageIgnore
+     */
+    private function __construct()
+    {
+    }
+
+    /**
+     * Extract the text content of a html file
+     *
+     * Style and script blocks are removed before the markup is parsed,
+     * so their content is not part of the returned text.
+     *
+     * @param string $path   Path of the html file
+     * @param string $output Output format (not used by this method, text is always returned)
+     * @param string $xpath  XPath expression selecting the elements whose child nodes are read.
+     *                       If empty, the text of the body element is returned.
+     *
+     * @return string Extracted text, '' if the file cannot be read or contains no text
+     *
+     * @since 1.0.0
+     */
+    public static function parseHtml(string $path, string $output = 'html', string $xpath = '') : string
+    {
+        $html = \file_get_contents($path);
+        if ($html === false) {
+            return '';
+        }
+
+        // "s" lets "." span line breaks, "i" also matches upper case tags
+        $html = \preg_replace(
+            ['~<style.*?</style>~is', '~<script.*?</script>~is'],
+            ['', ''],
+            $html
+        );
+
+        // preg_replace() returns null on failure and loadHTML() does not accept an empty string
+        if (empty($html)) {
+            return '';
+        }
+
+        // Parse the cleaned markup; loading the file again would bring styles/scripts back
+        $doc = new \DOMDocument();
+        $doc->loadHTML($html);
+
+        if (empty($xpath)) {
+            $body = $doc->getElementsByTagName('body');
+            $node = $body->item(0);
+
+            return empty($node->textContent) ? '' : $node->textContent;
+        }
+
+        $xNode    = new \DOMXPath($doc);
+        $elements = $xNode->query($xpath);
+
+        // query() returns false for a malformed expression
+        if ($elements === false) {
+            return '';
+        }
+
+        // One line per child node of every matched element
+        $content = '';
+        foreach ($elements as $element) {
+            foreach ($element->childNodes as $node) {
+                $content .= $node->textContent . "\n";
+            }
+        }
+
+        return $content;
+    }
+}
--- a/Utils/Parser/Pdf/PdfParser.php
+++ b/Utils/Parser/Pdf/PdfParser.php
@ -54,6 +54,20 @@ final class PdfParser
    {
    }

+    /**
+     * Pdf to string
+     *
+     * Thin wrapper which forwards the path to pdf2text().
+     *
+     * @param string $path   Path handed to pdf2text()
+     * @param string $output Output format (not used by this method, the pdf2text() result is returned as is)
+     *
+     * @return string
+     *
+     * @since 1.0.0
+     */
+    public static function parsePdf(string $path, string $output = 'html') : string
+    {
+        return self::pdf2text($path);
+    }
+
    /**
     * Pdf to text
     *
@ -73,6 +87,8 @@ final class PdfParser
            return '';
        }

+        // Try to read pdf directly
+        // Important: not all PDFs are searchable, some behave like an image
        if (\is_file(self::$pdftotext)) {
            try {
                SystemUtils::runProc(
@ -98,6 +114,7 @@ final class PdfParser
            return $text;
        }

+        // Couldn't read text from pdf -> transform to image and run OCR on image
        $out = \tempnam($tmpDir, 'oms_pdf_');
        if ($out === false) {
            return '';
--- a/Utils/Parser/Presentation/PresentationParser.php
+++ b/Utils/Parser/Presentation/PresentationParser.php
@ -55,6 +55,24 @@ final class PresentationParser
            $oTree = new PresentationWriter($presentation);

            return $oTree->renderHtml();
+        } elseif ($output === 'txt') {
+            // Render the presentation to html first, the plain text is taken from that markup
+            $presentation = IOFactory::load($path);
+            $oTree        = new PresentationWriter($presentation);
+            $html         = $oTree->renderHtml();
+
+            // "s" lets "." span line breaks, "i" also matches upper case tags
+            $html = \preg_replace(
+                ['~<style.*?</style>~is', '~<script.*?</script>~is'],
+                ['', ''],
+                $html
+            );
+
+            // preg_replace() returns null on failure and loadHTML() does not accept an empty string
+            if (empty($html)) {
+                return '';
+            }
+
+            // Parse the rendered markup; $path is the presentation file loaded above, not html
+            $doc = new \DOMDocument();
+            $doc->loadHTML($html);
+
+            $body = $doc->getElementsByTagName('body');
+            $node = $body->item(0);
+
+            return empty($node->textContent) ? '' : $node->textContent;
        }

        return '';
--- a/Utils/Parser/Spreadsheet/SpreadsheetParser.php
+++ b/Utils/Parser/Spreadsheet/SpreadsheetParser.php
@ -77,6 +77,26 @@ final class SpreadsheetParser
            $writer = IOFactory::createWriter($spreadsheet, 'custom');

            return $writer->generateHtmlAll();
+        } elseif ($output === 'txt') {
+            // Render the spreadsheet to html first, the plain text is taken from that markup
+            IOFactory::registerWriter('custom', \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter::class);
+
+            /** @var \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter $writer */
+            $writer = IOFactory::createWriter($spreadsheet, 'custom');
+            $html   = $writer->generateHtmlAll();
+
+            // "s" lets "." span line breaks, "i" also matches upper case tags
+            $html = \preg_replace(
+                ['~<style.*?</style>~is', '~<script.*?</script>~is'],
+                ['', ''],
+                $html
+            );
+
+            // preg_replace() returns null on failure and loadHTML() does not accept an empty string
+            if (empty($html)) {
+                return '';
+            }
+
+            // Parse the markup generated above (not the file at $path)
+            $doc = new \DOMDocument();
+            $doc->loadHTML($html);
+
+            $body = $doc->getElementsByTagName('body');
+            $node = $body->item(0);
+
+            return empty($node->textContent) ? '' : $node->textContent;
        }

        return '';