-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into maintenance_doc_cache
- Loading branch information
Showing
7 changed files
with
226 additions
and
139 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
<?php | ||
|
||
/** | ||
* (c) Kitodo. Key to digital objects e.V. <[email protected]> | ||
* | ||
* This file is part of the Kitodo and TYPO3 projects. | ||
* | ||
* @license GNU General Public License version 3 or later. | ||
* For the full copyright and license information, please read the | ||
* LICENSE.txt file that was distributed with this source code. | ||
*/ | ||
|
||
namespace Kitodo\Dlf\Common; | ||
|
||
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration; | ||
use TYPO3\CMS\Core\Log\Logger; | ||
use TYPO3\CMS\Core\Log\LogManager; | ||
use TYPO3\CMS\Core\Utility\GeneralUtility; | ||
|
||
/**
 * Reads the full text file attached to a physical structure node and converts
 * it to text through the class configured for the detected XML format.
 */
class FullTextReader
{
    /**
     * @access private
     * @var Logger This holds the logger
     */
    private Logger $logger;

    /**
     * @access private
     * @var array Format configurations keyed by the upper-cased name of the XML root element;
     *            an entry may name its converter class under the 'class' key
     */
    private array $formats;

    /**
     * Constructor
     *
     * @param array $formats Format configurations keyed by upper-cased XML root element name
     */
    public function __construct(array $formats)
    {
        $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(static::class);
        $this->formats = $formats;
    }

    /**
     * This extracts the full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation.
     *
     * The configured full text file groups are tried in order; the first group for which the
     * node lists a file is used. The text format is taken from the root element of the loaded
     * XML and must have an entry in the formats passed to the constructor.
     *
     * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
     * of the Manifest / Range (IIIF)
     * @param array $fileLocations The locations of the XML files, keyed by file group
     * @param mixed $physicalStructureNode The physical structure node (METS) or the Manifest / Range (IIIF)
     *
     * @return string The text returned by the format class' getTextAsMiniOcr(), or an empty string
     * if the node is invalid, the file cannot be loaded or the format is not supported
     */
    public function getFromXml(string $id, array $fileLocations, $physicalStructureNode): string
    {
        if (empty($physicalStructureNode)) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return '';
        }

        $fileContent = '';
        $textFormat = '';
        // A foreach (instead of "while ($x = array_shift(...))") keeps a falsy group name
        // such as "0" from silently ending the search before later groups are tried.
        foreach ($this->getFullTextFileGroups() as $fileGrpFulltext) {
            if (empty($physicalStructureNode['files'][$fileGrpFulltext])) {
                continue;
            }
            // Get full text file. A missing location is handled like a failed download
            // rather than handing an undefined value to getUrl().
            $fileContent = isset($fileLocations[$fileGrpFulltext])
                ? GeneralUtility::getUrl($fileLocations[$fileGrpFulltext])
                : false;
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return '';
            }
            $textFormat = $this->getTextFormat($fileContent);
            // Only the first file group that provides a file is used.
            break;
        }

        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (empty($fileContent) || empty($this->formats[$textFormat])) {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
            return '';
        }

        // A supported format without a converter class yields no text.
        if (empty($this->formats[$textFormat]['class'])) {
            return '';
        }

        return $this->getRawTextFromClass($fileContent, $textFormat);
    }

    /**
     * Get raw text from class for given format.
     *
     * The class configured for the format is instantiated and, if it implements
     * FulltextInterface, asked to convert the parsed XML via getTextAsMiniOcr().
     *
     * @access private
     *
     * @param string $fileContent The content of the XML file
     * @param string $textFormat The key of the format in the configured formats
     *
     * @return string The converted text, or an empty string if the class is missing or invalid
     * or the XML cannot be parsed
     */
    private function getRawTextFromClass(string $fileContent, string $textFormat): string
    {
        $class = $this->formats[$textFormat]['class'];

        if (!class_exists($class)) {
            $this->logger->warning('Class "' . $class . '" does not exist for "' . $textFormat . '" text format');
            return '';
        }

        $obj = GeneralUtility::makeInstance($class);
        if (!($obj instanceof FulltextInterface)) {
            $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
            return '';
        }

        // Load XML from file content; the helper signals a parse failure with false.
        $ocrTextXml = Helper::getXmlFileAsString($fileContent);
        if ($ocrTextXml === false) {
            $this->logger->warning('Couldn\'t parse full text XML for text format "' . $textFormat . '"');
            return '';
        }

        return $obj->getTextAsMiniOcr($ocrTextXml);
    }

    /**
     * Get full text file groups from extension configuration.
     *
     * Reads the comma-separated 'fileGrpFulltext' entry of the 'files' configuration of the
     * 'dlf' extension. Empty list entries are dropped and a missing setting yields an empty list.
     *
     * @return array The trimmed file group names in configured order
     */
    private function getFullTextFileGroups(): array
    {
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('dlf', 'files');
        return GeneralUtility::trimExplode(',', (string) ($extConf['fileGrpFulltext'] ?? ''), true);
    }

    /**
     * Get format of the OCR full text
     *
     * @access private
     *
     * @param string $fileContent The content of the XML file
     *
     * @return string The upper-cased name of the XML root element, or an empty string
     * if the content cannot be parsed
     */
    private function getTextFormat(string $fileContent): string
    {
        $xml = Helper::getXmlFileAsString($fileContent);

        if ($xml === false) {
            return '';
        }

        // Get the root element's name as text format.
        return strtoupper($xml->getName());
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.