improve parsers

This commit is contained in:
Dennis Eichhorn 2024-01-30 21:00:59 +00:00
parent f14f65ffd3
commit 84d73bf821
6 changed files with 156 additions and 2 deletions

View File

@ -58,6 +58,23 @@ final class DocumentParser
$writer = new DocumentWriter($doc);
return $writer->toPdfString();
} elseif ($output === 'txt') {
$writer = new HTML($doc);
$html = $writer->getContent();
$doc = new \DOMDocument();
$html = \preg_replace(
['~<style.*?</style>~', '~<script.*?</script>~'],
['', ''],
$html
);
$doc->loadHTMLFile($path);
$body = $doc->getElementsByTagName('body');
$node = $body->item(0);
return empty($node->textContent) ? '' : $node->textContent;
}
return '';

View File

@ -14,8 +14,6 @@ declare(strict_types=1);
namespace phpOMS\Utils\Parser\Document;
use PhpOffice\PhpWord\PhpWord;
use PhpOffice\PhpWord\Settings;
use PhpOffice\PhpWord\Writer\PDF\AbstractRenderer;
use PhpOffice\PhpWord\Writer\WriterInterface;

View File

@ -0,0 +1,84 @@
<?php
/**
* Jingga
*
* PHP Version 8.1
*
* @package phpOMS\Utils\Parser\Html
* @copyright Dennis Eichhorn
* @license OMS License 2.0
* @version 1.0.0
* @link https://jingga.app
*/
declare(strict_types=1);
namespace phpOMS\Utils\Parser\Html;
/**
* Html parser class.
*
* @package phpOMS\Utils\Parser\Html
* @license OMS License 2.0
* @link https://jingga.app
* @since 1.0.0
*/
final class HtmlParser
{
    /**
     * Constructor.
     *
     * Private because the class only exposes static functionality.
     *
     * @since 1.0.0
     * @codeCoverageIgnore
     */
    private function __construct()
    {
    }

    /**
     * Html to string
     *
     * Reads the html file, removes all style and script blocks and returns the
     * remaining text content. Without an xpath the text of the body element is
     * returned. With an xpath the text of every child node of every matched
     * element is returned, each followed by a line break.
     *
     * @param string $path   Path of the html file
     * @param string $output Output format (currently not evaluated by this method)
     * @param string $xpath  Optional xpath expression selecting the elements to extract
     *
     * @return string Extracted text, empty string if the file cannot be read or parsed
     *
     * @since 1.0.0
     */
    public static function parseHtml(string $path, string $output = 'html', string $xpath = '') : string
    {
        $html = \file_get_contents($path);
        if ($html === false || $html === '') {
            return '';
        }

        // "s" lets the dot span line breaks, "i" also catches upper case tags.
        $clean = \preg_replace(
            ['~<style\b[^>]*>.*?</style>~is', '~<script\b[^>]*>.*?</script>~is'],
            ['', ''],
            $html
        );

        // preg_replace returns null on a PCRE error (e.g. backtrack limit),
        // in that case continue with the unmodified markup as best effort.
        if ($clean === null) {
            $clean = $html;
        }

        // DOMDocument::loadHTML rejects an empty source.
        if ($clean === '') {
            return '';
        }

        // Real world html is rarely valid, collect libxml errors internally
        // instead of emitting a warning per markup problem.
        $previous = \libxml_use_internal_errors(true);

        $doc = new \DOMDocument();

        // Load the cleaned markup, not the original file, otherwise the
        // style/script removal above has no effect.
        $loaded = $doc->loadHTML($clean);

        \libxml_clear_errors();
        \libxml_use_internal_errors($previous);

        if ($loaded === false) {
            return '';
        }

        if ($xpath === '') {
            // item(0) is null if the document has no body element
            $body = $doc->getElementsByTagName('body')->item(0);

            return $body?->textContent ?? '';
        }

        $elements = (new \DOMXPath($doc))->query($xpath);
        if ($elements === false) {
            // Invalid xpath expression
            return '';
        }

        $content = '';
        foreach ($elements as $element) {
            foreach ($element->childNodes as $node) {
                $content .= $node->textContent . "\n";
            }
        }

        return $content;
    }
}

View File

@ -54,6 +54,20 @@ final class PdfParser
{
}
/**
* Pdf to string
*
* Forwards the path to self::pdf2text() and returns its result unchanged.
*
* @param string $path   Path of the pdf file
* @param string $output Output format (accepted but not evaluated by this method)
*
* @return string Result of self::pdf2text()
*
* @since 1.0.0
*/
public static function parsePdf(string $path, string $output = 'html') : string
{
return self::pdf2text($path);
}
/**
* Pdf to text
*
@ -73,6 +87,8 @@ final class PdfParser
return '';
}
// Try to read pdf directly
// Important: not all PDFs are searchable, some behave like an image
if (\is_file(self::$pdftotext)) {
try {
SystemUtils::runProc(
@ -98,6 +114,7 @@ final class PdfParser
return $text;
}
// Couldn't read text from pdf -> transform to image and run OCR on image
$out = \tempnam($tmpDir, 'oms_pdf_');
if ($out === false) {
return '';

View File

@ -55,6 +55,24 @@ final class PresentationParser
$oTree = new PresentationWriter($presentation);
return $oTree->renderHtml();
} elseif ($output === 'txt') {
$presentation = IOFactory::load($path);
$oTree = new PresentationWriter($presentation);
$html = $oTree->renderHtml();
$doc = new \DOMDocument();
$html = \preg_replace(
['~<style.*?</style>~', '~<script.*?</script>~'],
['', ''],
$html
);
$doc->loadHTMLFile($path);
$body = $doc->getElementsByTagName('body');
$node = $body->item(0);
return empty($node->textContent) ? '' : $node->textContent;
}
return '';

View File

@ -77,6 +77,26 @@ final class SpreadsheetParser
$writer = IOFactory::createWriter($spreadsheet, 'custom');
return $writer->generateHtmlAll();
} elseif ($output === 'txt') {
IOFactory::registerWriter('custom', \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter::class);
/** @var \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter $writer */
$writer = IOFactory::createWriter($spreadsheet, 'custom');
$html = $writer->generateHtmlAll();
$doc = new \DOMDocument();
$html = \preg_replace(
['~<style.*?</style>~', '~<script.*?</script>~'],
['', ''],
$html
);
$doc->loadHTMLFile($path);
$body = $doc->getElementsByTagName('body');
$node = $body->item(0);
return empty($node->textContent) ? '' : $node->textContent;
}
return '';