mirror of
https://github.com/Karaka-Management/phpOMS.git
synced 2026-01-11 01:38:41 +00:00
improve parsers
This commit is contained in:
parent
f14f65ffd3
commit
84d73bf821
|
|
@ -58,6 +58,23 @@ final class DocumentParser
|
||||||
$writer = new DocumentWriter($doc);
|
$writer = new DocumentWriter($doc);
|
||||||
|
|
||||||
return $writer->toPdfString();
|
return $writer->toPdfString();
|
||||||
|
} elseif ($output === 'txt') {
|
||||||
|
$writer = new HTML($doc);
|
||||||
|
$html = $writer->getContent();
|
||||||
|
|
||||||
|
$doc = new \DOMDocument();
|
||||||
|
$html = \preg_replace(
|
||||||
|
['~<style.*?</style>~', '~<script.*?</script>~'],
|
||||||
|
['', ''],
|
||||||
|
$html
|
||||||
|
);
|
||||||
|
|
||||||
|
$doc->loadHTMLFile($path);
|
||||||
|
|
||||||
|
$body = $doc->getElementsByTagName('body');
|
||||||
|
$node = $body->item(0);
|
||||||
|
|
||||||
|
return empty($node->textContent) ? '' : $node->textContent;
|
||||||
}
|
}
|
||||||
|
|
||||||
return '';
|
return '';
|
||||||
|
|
|
||||||
|
|
@ -14,8 +14,6 @@ declare(strict_types=1);
|
||||||
|
|
||||||
namespace phpOMS\Utils\Parser\Document;
|
namespace phpOMS\Utils\Parser\Document;
|
||||||
|
|
||||||
use PhpOffice\PhpWord\PhpWord;
|
|
||||||
use PhpOffice\PhpWord\Settings;
|
|
||||||
use PhpOffice\PhpWord\Writer\PDF\AbstractRenderer;
|
use PhpOffice\PhpWord\Writer\PDF\AbstractRenderer;
|
||||||
use PhpOffice\PhpWord\Writer\WriterInterface;
|
use PhpOffice\PhpWord\Writer\WriterInterface;
|
||||||
|
|
||||||
|
|
|
||||||
84
Utils/Parser/Html/HtmlParser.php
Normal file
84
Utils/Parser/Html/HtmlParser.php
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
<?php
|
||||||
|
/**
|
||||||
|
* Jingga
|
||||||
|
*
|
||||||
|
* PHP Version 8.1
|
||||||
|
*
|
||||||
|
* @package phpOMS\Utils\Parser\Html
|
||||||
|
* @copyright Dennis Eichhorn
|
||||||
|
* @license OMS License 2.0
|
||||||
|
* @version 1.0.0
|
||||||
|
* @link https://jingga.app
|
||||||
|
*/
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace phpOMS\Utils\Parser\Html;
|
||||||
|
|
||||||
|
/**
 * Html parser class.
 *
 * Extracts plain text from an HTML file, either from the whole body or from
 * the nodes selected by an XPath expression.
 *
 * @package phpOMS\Utils\Parser\Html
 * @license OMS License 2.0
 * @link    https://jingga.app
 * @since   1.0.0
 */
final class HtmlParser
{
    /**
     * Constructor.
     *
     * @since 1.0.0
     * @codeCoverageIgnore
     */
    private function __construct()
    {
    }

    /**
     * Html to string
     *
     * Reads the file, removes all style and script blocks and returns the
     * text content of the remaining document.
     *
     * @param string $path   Path of the html file
     * @param string $output Output format; not read by this method
     *                       (NOTE(review): presumably kept for signature parity with the other parsers - confirm)
     * @param string $xpath  Optional XPath expression. If empty the text of the body is returned,
     *                       otherwise the text of every child node of every matched node,
     *                       each followed by a line break
     *
     * @return string Extracted text, empty string if the file cannot be read, is empty or nothing matches
     *
     * @since 1.0.0
     */
    public static function parseHtml(string $path, string $output = 'html', string $xpath = '') : string
    {
        $raw = \file_get_contents($path);
        if ($raw === false) {
            // Unreadable file: passing false on to preg_replace would be a TypeError with strict types
            return '';
        }

        // s: let "." span line breaks (style/script blocks are usually multi-line)
        // i: tag names are case-insensitive in HTML
        $html = \preg_replace(
            ['~<style.*?</style>~is', '~<script.*?</script>~is'],
            ['', ''],
            $raw
        ) ?? $raw; // null signals a PCRE error -> best effort with the unstripped markup

        if (\trim($html) === '') {
            // DOMDocument::loadHTML() rejects an empty source
            return '';
        }

        // Parse the stripped markup; loading the file again would bring style/script content back
        $doc = new \DOMDocument();
        $doc->loadHTML($html);

        if ($xpath === '') {
            $body = $doc->getElementsByTagName('body')->item(0);

            return $body?->textContent ?? '';
        }

        $elements = (new \DOMXPath($doc))->query($xpath);
        if ($elements === false) {
            return '';
        }

        $content = '';
        foreach ($elements as $element) {
            foreach ($element->childNodes as $node) {
                $content .= $node->textContent . "\n";
            }
        }

        return $content;
    }
}
|
||||||
|
|
@ -54,6 +54,20 @@ final class PdfParser
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Pdf to string
 *
 * Forwards the path to self::pdf2text() and returns its result.
 *
 * @param string $path   Path of the pdf file
 * @param string $output Output format; not read by this method
 *
 * @return string
 *
 * @since 1.0.0
 */
public static function parsePdf(string $path, string $output = 'html') : string
{
    $text = self::pdf2text($path);

    return $text;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pdf to text
|
* Pdf to text
|
||||||
*
|
*
|
||||||
|
|
@ -73,6 +87,8 @@ final class PdfParser
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Try to read pdf directly
|
||||||
|
// Important: not all PDFs are searchable, some behave like an image
|
||||||
if (\is_file(self::$pdftotext)) {
|
if (\is_file(self::$pdftotext)) {
|
||||||
try {
|
try {
|
||||||
SystemUtils::runProc(
|
SystemUtils::runProc(
|
||||||
|
|
@ -98,6 +114,7 @@ final class PdfParser
|
||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Couldn't read text from pdf -> transform to image and run OCR on image
|
||||||
$out = \tempnam($tmpDir, 'oms_pdf_');
|
$out = \tempnam($tmpDir, 'oms_pdf_');
|
||||||
if ($out === false) {
|
if ($out === false) {
|
||||||
return '';
|
return '';
|
||||||
|
|
|
||||||
|
|
@ -55,6 +55,24 @@ final class PresentationParser
|
||||||
$oTree = new PresentationWriter($presentation);
|
$oTree = new PresentationWriter($presentation);
|
||||||
|
|
||||||
return $oTree->renderHtml();
|
return $oTree->renderHtml();
|
||||||
|
} elseif ($output === 'txt') {
|
||||||
|
$presentation = IOFactory::load($path);
|
||||||
|
$oTree = new PresentationWriter($presentation);
|
||||||
|
$html = $oTree->renderHtml();
|
||||||
|
|
||||||
|
$doc = new \DOMDocument();
|
||||||
|
$html = \preg_replace(
|
||||||
|
['~<style.*?</style>~', '~<script.*?</script>~'],
|
||||||
|
['', ''],
|
||||||
|
$html
|
||||||
|
);
|
||||||
|
|
||||||
|
$doc->loadHTMLFile($path);
|
||||||
|
|
||||||
|
$body = $doc->getElementsByTagName('body');
|
||||||
|
$node = $body->item(0);
|
||||||
|
|
||||||
|
return empty($node->textContent) ? '' : $node->textContent;
|
||||||
}
|
}
|
||||||
|
|
||||||
return '';
|
return '';
|
||||||
|
|
|
||||||
|
|
@ -77,6 +77,26 @@ final class SpreadsheetParser
|
||||||
$writer = IOFactory::createWriter($spreadsheet, 'custom');
|
$writer = IOFactory::createWriter($spreadsheet, 'custom');
|
||||||
|
|
||||||
return $writer->generateHtmlAll();
|
return $writer->generateHtmlAll();
|
||||||
|
} elseif ($output === 'txt') {
|
||||||
|
IOFactory::registerWriter('custom', \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter::class);
|
||||||
|
|
||||||
|
/** @var \phpOMS\Utils\Parser\Spreadsheet\SpreadsheetWriter $writer */
|
||||||
|
$writer = IOFactory::createWriter($spreadsheet, 'custom');
|
||||||
|
$html = $writer->generateHtmlAll();
|
||||||
|
|
||||||
|
$doc = new \DOMDocument();
|
||||||
|
$html = \preg_replace(
|
||||||
|
['~<style.*?</style>~', '~<script.*?</script>~'],
|
||||||
|
['', ''],
|
||||||
|
$html
|
||||||
|
);
|
||||||
|
|
||||||
|
$doc->loadHTMLFile($path);
|
||||||
|
|
||||||
|
$body = $doc->getElementsByTagName('body');
|
||||||
|
$node = $body->item(0);
|
||||||
|
|
||||||
|
return empty($node->textContent) ? '' : $node->textContent;
|
||||||
}
|
}
|
||||||
|
|
||||||
return '';
|
return '';
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user