Skip to content

Commit

Permalink
Merge branch 'main' into maintenance_doc_cache
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastian-meyer authored Dec 18, 2024
2 parents 03b6255 + 2ac3a96 commit eae6acc
Show file tree
Hide file tree
Showing 7 changed files with 226 additions and 139 deletions.
135 changes: 22 additions & 113 deletions Classes/Common/AbstractDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -545,9 +545,6 @@ public static function &getInstance(string $location, array $settings = [], bool

// Try to load a file from the url
if (GeneralUtility::isValidUrl($location)) {
// Load extension configuration
$extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

$content = Helper::getUrl($location);
if ($content !== false) {
$xml = Helper::getXmlFileAsString($content);
Expand All @@ -560,10 +557,7 @@ public static function &getInstance(string $location, array $settings = [], bool
// Try to load file as IIIF resource instead.
$contentAsJsonArray = json_decode($content, true);
if ($contentAsJsonArray !== null) {
IiifHelper::setUrlReader(IiifUrlReader::getInstance());
IiifHelper::setMaxThumbnailHeight($extConf['iiif']['thumbnailHeight']);
IiifHelper::setMaxThumbnailWidth($extConf['iiif']['thumbnailWidth']);
$iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
$iiif = self::loadIiifResource($contentAsJsonArray);
if ($iiif instanceof IiifResourceInterface) {
$documentFormat = 'IIIF';
}
Expand Down Expand Up @@ -619,112 +613,6 @@ public function getPhysicalPage(string $logicalPage): int
return 1;
}

/**
* This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
* XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
* to be given in the Canvas' / Manifest's "seeAlso" property.
*
* @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
* of the Manifest / Range (IIIF)
*
* @return string The OCR full text
*/
protected function getFullTextFromXml(string $id): string
{
    // Make sure text formats and the physical structure are available
    // before anything is looked up.
    $this->loadFormats();
    $this->magicGetPhysicalStructure();

    // Full text file groups, in the order given in the extension configuration.
    $filesConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'files');
    $candidateGroups = GeneralUtility::trimExplode(',', $filesConf['fileGrpFulltext']);

    // Without a known structure node there is nothing to read.
    if (empty($this->physicalStructureInfo[$id])) {
        $this->logger->warning('Invalid structure node @ID "' . $id . '"');
        return '';
    }

    $detectedFormat = "";
    $content = null;
    // The first file group that has a file for this node decides; later groups are ignored.
    while ($group = array_shift($candidateGroups)) {
        if (empty($this->physicalStructureInfo[$id]['files'][$group])) {
            continue;
        }
        // Get full text file.
        $content = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$group]));
        if ($content === false) {
            $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
            return '';
        }
        $detectedFormat = $this->getTextFormat($content);
        break;
    }

    // Is this text format supported?
    if (empty($content) || empty($this->formats[$detectedFormat])) {
        $this->logger->warning('Unsupported text format "' . $detectedFormat . '" in physical node with @ID "' . $id . '"');
        return '';
    }

    // A known format without a converter class yields no text.
    if (empty($this->formats[$detectedFormat]['class'])) {
        return '';
    }

    return $this->getRawTextFromClass($id, $content, $detectedFormat);
}

/**
* Get raw text from class for given format.
*
* @access private
*
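* @param string $id The "@ID" of the structure node; used as key when storing the raw text
* @param string $fileContent The content of the XML full text file
* @param string $textFormat The text format; used as key into the loaded formats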
*
* @return string
*/
private function getRawTextFromClass($id, $fileContent, $textFormat): string
{
    // The converter class is taken from the loaded format definition.
    $class = $this->formats[$textFormat]['class'];

    if (!class_exists($class)) {
        // Message fixed: balanced quotes and correct grammar.
        $this->logger->warning('Class "' . $class . '" does not exist for "' . $textFormat . '" text format');
        return '';
    }

    $obj = GeneralUtility::makeInstance($class);
    if (!$obj instanceof FulltextInterface) {
        // Message fixed: name the method that is actually called below.
        $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
        return '';
    }

    // Load XML from file content and convert it.
    $ocrTextXml = Helper::getXmlFileAsString($fileContent);
    $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);

    // Keep the converted text available under the node's @ID.
    $this->rawTextArray[$id] = $textMiniOcr;

    return $textMiniOcr;
}

/**
* Get format of the OCR full text
*
* @access private
*
* @param string $fileContent content of the XML file
*
* @return string The format of the OCR full text
*/
private function getTextFormat(string $fileContent): string
{
    $parsedXml = Helper::getXmlFileAsString($fileContent);

    // The upper-cased name of the root element serves as the text format;
    // content that cannot be loaded as XML has no format.
    return $parsedXml === false ? '' : strtoupper($parsedXml->getName());
}

/**
* This determines a title for the given document
*
Expand Down Expand Up @@ -911,6 +799,27 @@ protected function loadFormats(): void
}
}

/**
* Configure the IIIF helper from the extension configuration and load the given IIIF resource.
*
* @access protected
*
* @static
*
* @param string|array $resource IIIF resource. Can be an IRI, the JSON document as string
* or a dictionary in form of a PHP associative array
*
* @return NULL|\Ubl\Iiif\Presentation\Common\Model\AbstractIiifEntity An instance of the IIIF resource
*/
protected static function loadIiifResource($resource): mixed
{
    // Only the "iiif" section of the extension configuration is needed here.
    $iiifSettings = GeneralUtility::makeInstance(ExtensionConfiguration::class)
        ->get(self::$extKey, 'iiif');

    // Set up the shared IIIF helper before the resource is loaded.
    IiifHelper::setUrlReader(IiifUrlReader::getInstance());
    IiifHelper::setMaxThumbnailHeight($iiifSettings['thumbnailHeight']);
    IiifHelper::setMaxThumbnailWidth($iiifSettings['thumbnailWidth']);

    return IiifHelper::loadIiifResource($resource);
}

/**
* Register all available namespaces for a \SimpleXMLElement object
*
Expand Down
157 changes: 157 additions & 0 deletions Classes/Common/FullTextReader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
<?php

/**
* (c) Kitodo. Key to digital objects e.V. <[email protected]>
*
* This file is part of the Kitodo and TYPO3 projects.
*
* @license GNU General Public License version 3 or later.
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*/

namespace Kitodo\Dlf\Common;

use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
use TYPO3\CMS\Core\Log\Logger;
use TYPO3\CMS\Core\Log\LogManager;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Reads the OCR full text of a structure node from an XML full text file and
 * converts it with the class configured for the detected text format.
 */
class FullTextReader
{
    /**
     * @access private
     * @var Logger This holds the logger
     */
    private Logger $logger;

    /**
     * @access private
     * @var array This holds all formats, looked up by the upper-cased root element name of the XML file
     */
    private array $formats;

    /**
     * Constructor
     *
     * @param array $formats The available formats; an entry's "class" value names the converter class
     */
    public function __construct(array $formats)
    {
        $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(static::class);
        $this->formats = $formats;
    }

    /**
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
     * to be given in the Canvas' / Manifest's "seeAlso" property.
     *
     * The first configured full text file group that has a file for the node is used. An empty
     * string is returned (and a warning logged) if the node is empty, the file cannot be loaded
     * or the text format is not supported.
     *
     * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
     * of the Manifest / Range (IIIF)
     * @param array $fileLocations The locations of the XML files, keyed by file group
     * @param mixed $physicalStructureNode The physical structure node (METS) or the Manifest / Range (IIIF)
     *
     * @return string The OCR full text
     */
    public function getFromXml(string $id, array $fileLocations, $physicalStructureNode): string
    {
        if (empty($physicalStructureNode)) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return '';
        }

        $fileContent = '';
        // Iterate with foreach: a "while (array_shift())" loop would stop at the
        // first falsy entry and never try the file groups configured after it.
        foreach ($this->getFullTextFileGroups() as $fileGrpFulltext) {
            if (empty($physicalStructureNode['files'][$fileGrpFulltext])) {
                continue;
            }
            // Get full text file. A location missing from $fileLocations is
            // treated like a file that could not be loaded.
            $fileLocation = (string) ($fileLocations[$fileGrpFulltext] ?? '');
            $fileContent = $fileLocation !== '' ? GeneralUtility::getUrl($fileLocation) : false;
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return '';
            }
            break;
        }

        // Parse the file once; the same document is used for format detection and conversion.
        $ocrTextXml = empty($fileContent) ? false : Helper::getXmlFileAsString($fileContent);
        $textFormat = $ocrTextXml !== false ? $this->getTextFormat($ocrTextXml) : '';

        // Is this text format supported?
        if ($ocrTextXml === false || empty($this->formats[$textFormat])) {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
            return '';
        }

        // A known format without a converter class yields no text.
        if (empty($this->formats[$textFormat]['class'])) {
            return '';
        }

        return $this->getRawTextFromClass($ocrTextXml, $textFormat);
    }

    /**
     * Get raw text from class for given format.
     *
     * @access private
     *
     * @param \SimpleXMLElement $ocrTextXml The parsed XML full text file
     * @param string $textFormat The text format whose converter class is used
     *
     * @return string The converted text, or an empty string if the class is missing or invalid
     */
    private function getRawTextFromClass(\SimpleXMLElement $ocrTextXml, string $textFormat): string
    {
        $class = $this->formats[$textFormat]['class'];

        if (!class_exists($class)) {
            $this->logger->warning('Class "' . $class . '" does not exist for "' . $textFormat . '" text format');
            return '';
        }

        $obj = GeneralUtility::makeInstance($class);
        if (!$obj instanceof FulltextInterface) {
            $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
            return '';
        }

        return $obj->getTextAsMiniOcr($ocrTextXml);
    }

    /**
     * Get full text file groups from extension configuration.
     *
     * @return array The configured file groups in configuration order, without empty entries
     */
    private function getFullTextFileGroups(): array
    {
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('dlf', 'files');
        // Tolerate a missing setting and drop empty entries such as the one in "A,,B".
        return GeneralUtility::trimExplode(',', (string) ($extConf['fileGrpFulltext'] ?? ''), true);
    }

    /**
     * Get format of the OCR full text
     *
     * @access private
     *
     * @param \SimpleXMLElement $xml The parsed XML full text file
     *
     * @return string The format of the OCR full text
     */
    private function getTextFormat(\SimpleXMLElement $xml): string
    {
        // Get the root element's name as text format.
        return strtoupper($xml->getName());
    }
}
26 changes: 12 additions & 14 deletions Classes/Common/IiifManifest.php
Original file line number Diff line number Diff line change
Expand Up @@ -755,17 +755,23 @@ public function getFullText(string $id): string
// ... and extension configuration.
$extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
$fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['files']['fileGrpFulltext']);
if (!empty($this->physicalStructureInfo[$id])) {

$physicalStructureNode = $this->physicalStructureInfo[$id];
if (!empty($physicalStructureNode)) {
while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
$rawText = parent::getFullTextFromXml($id);
if (!empty($physicalStructureNode['files'][$fileGrpFulltext])) {
$rawText = GeneralUtility::makeInstance(FullTextReader::class, $this->formats)->getFromXml(
$id,
[$fileGrpFulltext => $this->getFileLocation($physicalStructureNode['files'][$fileGrpFulltext])],
$physicalStructureNode
);
break;
}
}
if ($extConf['iiif']['indexAnnotations'] == 1) {
$iiifResource = $this->iiif->getContainedResourceById($id);
// Get annotation containers
$annotationContainerIds = $this->physicalStructureInfo[$id]['annotationContainers'];
$annotationContainerIds = $physicalStructureNode['annotationContainers'];
if (!empty($annotationContainerIds)) {
$annotationTexts = $this->getAnnotationTexts($annotationContainerIds, $iiifResource->getId());
$rawText .= implode(' ', $annotationTexts);
Expand Down Expand Up @@ -807,11 +813,7 @@ protected function loadLocation(string $location): bool
{
$fileResource = GeneralUtility::getUrl($location);
if ($fileResource !== false) {
$conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'iiif');
IiifHelper::setUrlReader(IiifUrlReader::getInstance());
IiifHelper::setMaxThumbnailHeight($conf['thumbnailHeight']);
IiifHelper::setMaxThumbnailWidth($conf['thumbnailWidth']);
$resource = IiifHelper::loadIiifResource($fileResource);
$resource = self::loadIiifResource($fileResource);
if ($resource instanceof ManifestInterface) {
$this->iiif = $resource;
return true;
Expand Down Expand Up @@ -998,11 +1000,7 @@ private function setFileUseFulltext(string $iiifId, $iiif): void
*/
public function __wakeup(): void
{
$conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'iiif');
IiifHelper::setUrlReader(IiifUrlReader::getInstance());
IiifHelper::setMaxThumbnailHeight($conf['thumbnailHeight']);
IiifHelper::setMaxThumbnailWidth($conf['thumbnailWidth']);
$resource = IiifHelper::loadIiifResource($this->asJson);
$resource = self::loadIiifResource($this->asJson);
if ($resource instanceof ManifestInterface) {
$this->asJson = '';
$this->iiif = $resource;
Expand Down
Loading

0 comments on commit eae6acc

Please sign in to comment.