mirror of
https://github.com/Karaka-Management/phpOMS.git
synced 2026-01-10 17:28:40 +00:00
improve parsers
This commit is contained in:
parent
f14f65ffd3
commit
84d73bf821
|
|
@ -58,6 +58,23 @@ final class DocumentParser
|
|||
$writer = new DocumentWriter($doc);
|
||||
|
||||
return $writer->toPdfString();
|
||||
} elseif ($output === 'txt') {
|
||||
$writer = new HTML($doc);
|
||||
$html = $writer->getContent();
|
||||
|
||||
$doc = new \DOMDocument();
|
||||
$html = \preg_replace(
|
||||
['~<style.*?</style>~', '~<script.*?</script>~'],
|
||||
['', ''],
|
||||
$html
|
||||
);
|
||||
|
||||
$doc->loadHTMLFile($path);
|
||||
|
||||
$body = $doc->getElementsByTagName('body');
|
||||
$node = $body->item(0);
|
||||
|
||||
return empty($node->textContent) ? '' : $node->textContent;
|
||||
}
|
||||
|
||||
return '';
|
||||
|
|
|
|||
|
|
@ -14,8 +14,6 @@ declare(strict_types=1);
|
|||
|
||||
namespace phpOMS\Utils\Parser\Document;
|
||||
|
||||
use PhpOffice\PhpWord\PhpWord;
|
||||
use PhpOffice\PhpWord\Settings;
|
||||
use PhpOffice\PhpWord\Writer\PDF\AbstractRenderer;
|
||||
use PhpOffice\PhpWord\Writer\WriterInterface;
|
||||
|
||||
|
|
|
|||
84
Utils/Parser/Html/HtmlParser.php
Normal file
84
Utils/Parser/Html/HtmlParser.php
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
<?php
|
||||
/**
|
||||
* Jingga
|
||||
*
|
||||
* PHP Version 8.1
|
||||
*
|
||||
* @package phpOMS\Utils\Parser\Html
|
||||
* @copyright Dennis Eichhorn
|
||||
* @license OMS License 2.0
|
||||
* @version 1.0.0
|
||||
* @link https://jingga.app
|
||||
*/
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace phpOMS\Utils\Parser\Html;
|
||||
|
||||
/**
|
||||
* Html parser class.
|
||||
*
|
||||
* @package phpOMS\Utils\Parser\Html
|
||||
* @license OMS License 2.0
|
||||
* @link https://jingga.app
|
||||
* @since 1.0.0
|
||||
*/
|
||||
final class HtmlParser
{
    /**
     * Constructor.
     *
     * The class only exposes static helpers and is not meant to be instantiated.
     *
     * @since 1.0.0
     * @codeCoverageIgnore
     */
    private function __construct()
    {
    }

    /**
     * Html to string
     *
     * Reads the file, removes `<style>` and `<script>` blocks and returns the
     * remaining text content. Without an XPath expression the text of the
     * `<body>` element is returned; with one, the text of every child node of
     * every matched element is returned, each followed by a line break.
     *
     * @param string $path   Path of the html file
     * @param string $output Output format (currently not evaluated by this method)
     * @param string $xpath  Optional XPath expression selecting the elements to extract
     *
     * @return string Extracted text, or an empty string if the file cannot be
     *                read, is empty, cannot be parsed, or the XPath is invalid
     *
     * @since 1.0.0
     */
    public static function parseHtml(string $path, string $output = 'html', string $xpath = '') : string
    {
        $html = \file_get_contents($path);
        if ($html === false) {
            return '';
        }

        // "s" lets "." span line breaks (style/script blocks are usually multi-line),
        // "i" also catches upper-case tags.
        $html = \preg_replace(
            ['~<style.*?</style>~is', '~<script.*?</script>~is'],
            ['', ''],
            $html
        );

        // preg_replace returns null on PCRE errors; loadHTML rejects an empty source.
        if ($html === null || $html === '') {
            return '';
        }

        $doc = new \DOMDocument();

        // Collect libxml parse errors internally instead of emitting a warning
        // for every unknown or malformed tag; the previous setting is restored afterwards.
        $previous = \libxml_use_internal_errors(true);

        // Load the cleaned markup. Loading $path again here would discard the
        // style/script removal done above.
        $loaded = $doc->loadHTML($html);

        \libxml_clear_errors();
        \libxml_use_internal_errors($previous);

        if ($loaded === false) {
            return '';
        }

        if ($xpath === '') {
            $node = $doc->getElementsByTagName('body')->item(0);

            // A missing body or missing text content both result in an empty string.
            return $node?->textContent ?? '';
        }

        $xNode    = new \DOMXPath($doc);
        $elements = $xNode->query($xpath);

        // query() returns false for a malformed expression
        if ($elements === false) {
            return '';
        }

        $content = '';
        foreach ($elements as $element) {
            foreach ($element->childNodes as $node) {
                $content .= $node->textContent . "\n";
            }
        }

        return $content;
    }
}
|
||||
|
|
@ -54,6 +54,20 @@ final class PdfParser
|
|||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* Pdf to string
|
||||
*
|
||||
* @param string $path Path
|
||||
* @param string $output Output format (not evaluated in this method body)
|
||||
*
|
||||
* @return string Result of self::pdf2text() for the given path
|
||||
*
|
||||
* @since 1.0.0
|
||||
*/
|
||||
public static function parsePdf(string $path, string $output = 'html') : string
|
||||
{
|
||||
// $output is not consulted here; the call is forwarded to pdf2text() with the path only.
return self::pdf2text($path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Pdf to text
|
||||
*
|
||||
|
|
@ -73,6 +87,8 @@ final class PdfParser
|
|||
return '';
|
||||
}
|
||||
|
||||
// Try to read pdf directly
|
||||
// Important: not all PDFs are searchable, some behave like an image
|
||||
if (\is_file(self::$pdftotext)) {
|
||||
try {
|
||||
SystemUtils::runProc(
|
||||
|
|
@ -98,6 +114,7 @@ final class PdfParser
|
|||
return $text;
|
||||
}
|
||||
|
||||
// Couldn't read text from pdf -> transform to image and run OCR on image
|
||||
$out = \tempnam($tmpDir, 'oms_pdf_');
|
||||
if ($out === false) {
|
||||
return '';
|
||||
|
|
|
|||
|
|
@ -55,6 +55,24 @@ final class PresentationParser
|
|||
$oTree = new PresentationWriter($presentation);
|
||||
|
||||
return $oTree->renderHtml();
|
||||
} elseif ($output === 'txt') {
|
||||
$presentation = IOFactory::load($path);
|
||||
$oTree = new PresentationWriter($presentation);
|
||||
$html = $oTree->renderHtml();
|
||||
|
||||
$doc = new \DOMDocument();
|
||||
$html = \preg_replace(
|
||||
['~<style.*?</style>~', '~<script.*?</script>~'],
|
||||
['', ''],
|
||||
$html
|
||||
);
|
||||
|
||||
$doc->loadHTMLFile($path);
|
||||
|
||||
$body = $doc->getElementsByTagName('body');
|
||||
$node = $body->item(0);
|
||||
|
||||
return empty($node->textContent) ? '' : $node->textContent;
|
||||
}
|
||||
|
||||
return '';
|
||||
|
|
|
|||
|
|
@ -77,6 +77,26 @@ final class SpreadsheetParser
|
|||
$writer = IOFactory::createWriter($spreadsheet, 'custom');
|
||||
|
||||
return $writer->generateHtmlAll();
|
||||
} elseif ($output === 'txt') {
|
||||
IOFactory::registerWriter('custom', \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter::class);
|
||||
|
||||
/** @var \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter $writer */
|
||||
$writer = IOFactory::createWriter($spreadsheet, 'custom');
|
||||
$html = $writer->generateHtmlAll();
|
||||
|
||||
$doc = new \DOMDocument();
|
||||
$html = \preg_replace(
|
||||
['~<style.*?</style>~', '~<script.*?</script>~'],
|
||||
['', ''],
|
||||
$html
|
||||
);
|
||||
|
||||
$doc->loadHTMLFile($path);
|
||||
|
||||
$body = $doc->getElementsByTagName('body');
|
||||
$node = $body->item(0);
|
||||
|
||||
return empty($node->textContent) ? '' : $node->textContent;
|
||||
}
|
||||
|
||||
return '';
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user