improve parsers

2026-03-07 08:28:41 +00:00 · 2024-01-30 21:00:59 +00:00 · 2024-01-30 21:00:59 +00:00 · 84d73bf821
commit 84d73bf821
parent f14f65ffd3
6 changed files with 156 additions and 2 deletions
--- a/Utils/Parser/Document/DocumentParser.php
+++ b/Utils/Parser/Document/DocumentParser.php
@ -58,6 +58,23 @@ final class DocumentParser
            $writer = new DocumentWriter($doc);
            return $writer->toPdfString();
        } elseif ($output === 'txt') {
            $writer = new HTML($doc);
            $html   = $writer->getContent();
            $doc  = new \DOMDocument();
            $html = \preg_replace(
                ['~<style.*?</style>~', '~<script.*?</script>~'],
                ['', ''],
                $html
            );
            $doc->loadHTMLFile($path);
            $body = $doc->getElementsByTagName('body');
            $node = $body->item(0);
            return empty($node->textContent) ? '' : $node->textContent;
        }
        return '';
--- a/Utils/Parser/Document/DocumentWriter.php
+++ b/Utils/Parser/Document/DocumentWriter.php
@ -14,8 +14,6 @@ declare(strict_types=1);
 namespace phpOMS\Utils\Parser\Document;
 use PhpOffice\PhpWord\PhpWord;
 use PhpOffice\PhpWord\Settings;
 use PhpOffice\PhpWord\Writer\PDF\AbstractRenderer;
 use PhpOffice\PhpWord\Writer\WriterInterface;
--- a/Utils/Parser/Html/HtmlParser.php
+++ b/Utils/Parser/Html/HtmlParser.php
@ -0,0 +1,84 @@
 <?php
 /**
 * Jingga
 *
 * PHP Version 8.1
 *
 * @package   phpOMS\Utils\Parser\Html
 * @copyright Dennis Eichhorn
 * @license   OMS License 2.0
 * @version   1.0.0
 * @link      https://jingga.app
 */
 declare(strict_types=1);
 namespace phpOMS\Utils\Parser\Html;
 /**
 * Html parser class.
 *
 * @package phpOMS\Utils\Parser\Html
 * @license OMS License 2.0
 * @link    https://jingga.app
 * @since   1.0.0
 */
 final class HtmlParser
 {
    /**
     * Constructor.
     *
     * Private because the class only exposes static helpers.
     *
     * @since 1.0.0
     * @codeCoverageIgnore
     */
    private function __construct()
    {
    }
    /**
     * Extract the text content of a html file.
     *
     * The file is read, style and script blocks are removed and the remaining
     * markup is parsed. Without a xpath the text of the body element is
     * returned, otherwise the text of every child node of every matched
     * element is returned, one child per line.
     *
     * @param string $path   Path (or stream url) of the html file
     * @param string $output Output format (not used by this implementation)
     * @param string $xpath  Optional xpath expression selecting the elements to extract
     *
     * @return string Extracted text, empty string if the file cannot be read or parsed
     *
     * @since 1.0.0
     */
    public static function parseHtml(string $path, string $output = 'html', string $xpath = '') : string
    {
        $raw = \file_get_contents($path);
        if ($raw === false) {
            return '';
        }

        // "s" lets "." span line breaks (style/script blocks are usually multi-line),
        // "i" also catches upper case tags. On a PCRE error fall back to the raw markup.
        $html = \preg_replace(
            ['~<style\b.*?</style>~is', '~<script\b.*?</script>~is'],
            ['', ''],
            $raw
        ) ?? $raw;

        // DOMDocument::loadHTML() throws a ValueError on an empty string
        if (\trim($html) === '') {
            return '';
        }

        $doc = new \DOMDocument();

        // Collect libxml parser complaints (e.g. unknown html5 tags) internally instead of
        // raising PHP warnings; the previous setting is restored afterwards.
        $previous = \libxml_use_internal_errors(true);
        $loaded   = $doc->loadHTML($html);
        \libxml_clear_errors();
        \libxml_use_internal_errors($previous);

        if ($loaded === false) {
            return '';
        }

        if ($xpath === '') {
            $body = $doc->getElementsByTagName('body')->item(0);

            // Null-safe instead of empty() so that a body containing only "0" is preserved
            return $body?->textContent ?? '';
        }

        $finder   = new \DOMXPath($doc);
        $elements = $finder->query($xpath);
        if ($elements === false) {
            // Malformed xpath expression
            return '';
        }

        $content = '';
        foreach ($elements as $element) {
            foreach ($element->childNodes as $child) {
                $content .= $child->textContent . "\n";
            }
        }

        return $content;
    }
 }
--- a/Utils/Parser/Pdf/PdfParser.php
+++ b/Utils/Parser/Pdf/PdfParser.php
@ -54,6 +54,20 @@ final class PdfParser
    {
    }
    /**
     * Pdf to string
     *
     * Forwards the path to self::pdf2text() and returns its result.
     *
     * @param string $path   Path of the pdf file
     * @param string $output Output format (not used by this method)
     *
     * @return string
     *
     * @since 1.0.0
     */
    public static function parsePdf(string $path, string $output = 'html') : string
    {
        $text = self::pdf2text($path);

        return $text;
    }
    /**
     * Pdf to text
     *
@ -73,6 +87,8 @@ final class PdfParser
            return '';
        }
        // Try to read pdf directly
        // Important: not all PDFs are searchable, some behave like an image
        if (\is_file(self::$pdftotext)) {
            try {
                SystemUtils::runProc(
@ -98,6 +114,7 @@ final class PdfParser
            return $text;
        }
        // Couldn't read text from pdf -> transform to image and run OCR on image
        $out = \tempnam($tmpDir, 'oms_pdf_');
        if ($out === false) {
            return '';
--- a/Utils/Parser/Presentation/PresentationParser.php
+++ b/Utils/Parser/Presentation/PresentationParser.php
@ -55,6 +55,24 @@ final class PresentationParser
            $oTree = new PresentationWriter($presentation);
            return $oTree->renderHtml();
        } elseif ($output === 'txt') {
            $presentation = IOFactory::load($path);
            $oTree        = new PresentationWriter($presentation);
            $html         = $oTree->renderHtml();
            $doc  = new \DOMDocument();
            $html = \preg_replace(
                ['~<style.*?</style>~', '~<script.*?</script>~'],
                ['', ''],
                $html
            );
            $doc->loadHTMLFile($path);
            $body = $doc->getElementsByTagName('body');
            $node = $body->item(0);
            return empty($node->textContent) ? '' : $node->textContent;
        }
        return '';
--- a/Utils/Parser/Spreadsheet/SpreadsheetParser.php
+++ b/Utils/Parser/Spreadsheet/SpreadsheetParser.php
@ -77,6 +77,26 @@ final class SpreadsheetParser
            $writer = IOFactory::createWriter($spreadsheet, 'custom');
            return $writer->generateHtmlAll();
        } elseif ($output === 'txt') {
            IOFactory::registerWriter('custom', \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter::class);
            /** @var \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter $writer */
            $writer = IOFactory::createWriter($spreadsheet, 'custom');
            $html   =  $writer->generateHtmlAll();
            $doc  = new \DOMDocument();
            $html = \preg_replace(
                ['~<style.*?</style>~', '~<script.*?</script>~'],
                ['', ''],
                $html
            );
            $doc->loadHTMLFile($path);
            $body = $doc->getElementsByTagName('body');
            $node = $body->item(0);
            return empty($node->textContent) ? '' : $node->textContent;
        }
        return '';