From 3d09497433e2f141641c7e25bac34d587839f7cf Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Mon, 16 Dec 2024 22:31:21 +0100 Subject: [PATCH 1/3] [MAINTENANCE] Extract load of IIIF resource to protected function (#1403) Co-authored-by: Sebastian Meyer --- Classes/Common/AbstractDocument.php | 29 ++++++++++++++++++++++------- Classes/Common/IiifManifest.php | 12 ++---------- Classes/Common/MetsDocument.php | 6 +----- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/Classes/Common/AbstractDocument.php b/Classes/Common/AbstractDocument.php index c30315b36..b5e1f3861 100644 --- a/Classes/Common/AbstractDocument.php +++ b/Classes/Common/AbstractDocument.php @@ -545,9 +545,6 @@ public static function &getInstance(string $location, array $settings = [], bool // Try to load a file from the url if (GeneralUtility::isValidUrl($location)) { - // Load extension configuration - $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey); - $content = Helper::getUrl($location); if ($content !== false) { $xml = Helper::getXmlFileAsString($content); @@ -560,10 +557,7 @@ public static function &getInstance(string $location, array $settings = [], bool // Try to load file as IIIF resource instead. $contentAsJsonArray = json_decode($content, true); if ($contentAsJsonArray !== null) { - IiifHelper::setUrlReader(IiifUrlReader::getInstance()); - IiifHelper::setMaxThumbnailHeight($extConf['iiif']['thumbnailHeight']); - IiifHelper::setMaxThumbnailWidth($extConf['iiif']['thumbnailWidth']); - $iiif = IiifHelper::loadIiifResource($contentAsJsonArray); + $iiif = self::loadIiifResource($contentAsJsonArray); if ($iiif instanceof IiifResourceInterface) { $documentFormat = 'IIIF'; } @@ -911,6 +905,27 @@ protected function loadFormats(): void } } + /** + * Load IIIF resource from resource. + * + * @access protected + * + * @static + * + * @param string|array $resource IIIF resource. 
Can be an IRI, the JSON document as string + * or a dictionary in form of a PHP associative array + * + * @return NULL|\Ubl\Iiif\Presentation\Common\Model\AbstractIiifEntity An instance of the IIIF resource + */ + protected static function loadIiifResource($resource): mixed + { + $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'iiif'); + IiifHelper::setUrlReader(IiifUrlReader::getInstance()); + IiifHelper::setMaxThumbnailHeight($extConf['thumbnailHeight']); + IiifHelper::setMaxThumbnailWidth($extConf['thumbnailWidth']); + return IiifHelper::loadIiifResource($resource); + } + /** * Register all available namespaces for a \SimpleXMLElement object * diff --git a/Classes/Common/IiifManifest.php b/Classes/Common/IiifManifest.php index 073bf5a53..61c7c7e5d 100644 --- a/Classes/Common/IiifManifest.php +++ b/Classes/Common/IiifManifest.php @@ -807,11 +807,7 @@ protected function loadLocation(string $location): bool { $fileResource = GeneralUtility::getUrl($location); if ($fileResource !== false) { - $conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'iiif'); - IiifHelper::setUrlReader(IiifUrlReader::getInstance()); - IiifHelper::setMaxThumbnailHeight($conf['thumbnailHeight']); - IiifHelper::setMaxThumbnailWidth($conf['thumbnailWidth']); - $resource = IiifHelper::loadIiifResource($fileResource); + $resource = self::loadIiifResource($fileResource); if ($resource instanceof ManifestInterface) { $this->iiif = $resource; return true; @@ -998,11 +994,7 @@ private function setFileUseFulltext(string $iiifId, $iiif): void */ public function __wakeup(): void { - $conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'iiif'); - IiifHelper::setUrlReader(IiifUrlReader::getInstance()); - IiifHelper::setMaxThumbnailHeight($conf['thumbnailHeight']); - IiifHelper::setMaxThumbnailWidth($conf['thumbnailWidth']); - $resource = IiifHelper::loadIiifResource($this->asJson); + 
$resource = self::loadIiifResource($this->asJson); if ($resource instanceof ManifestInterface) { $this->asJson = ''; $this->iiif = $resource; diff --git a/Classes/Common/MetsDocument.php b/Classes/Common/MetsDocument.php index b87feab4f..e9cd5ef08 100644 --- a/Classes/Common/MetsDocument.php +++ b/Classes/Common/MetsDocument.php @@ -231,11 +231,7 @@ public function getDownloadLocation(string $id): string $file = $this->getFileInfo($id); if ($file['mimeType'] === 'application/vnd.kitodo.iiif') { $file['location'] = (strrpos($file['location'], 'info.json') === strlen($file['location']) - 9) ? $file['location'] : (strrpos($file['location'], '/') === strlen($file['location']) ? $file['location'] . 'info.json' : $file['location'] . '/info.json'); - $conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'iiif'); - IiifHelper::setUrlReader(IiifUrlReader::getInstance()); - IiifHelper::setMaxThumbnailHeight($conf['thumbnailHeight']); - IiifHelper::setMaxThumbnailWidth($conf['thumbnailWidth']); - $service = IiifHelper::loadIiifResource($file['location']); + $service = self::loadIiifResource($file['location']); if ($service instanceof AbstractImageService) { return $service->getImageUrl(); } From bd529ae8cc6981a8c93e3286dff657ba66702218 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Mon, 16 Dec 2024 22:37:29 +0100 Subject: [PATCH 2/3] [FEATURE] Implement full text reader (#1402) Co-authored-by: Sebastian Meyer --- Classes/Common/AbstractDocument.php | 106 ------------------- Classes/Common/FullTextReader.php | 157 ++++++++++++++++++++++++++++ Classes/Common/IiifManifest.php | 14 ++- Classes/Common/MetsDocument.php | 23 +++- 4 files changed, 187 insertions(+), 113 deletions(-) create mode 100644 Classes/Common/FullTextReader.php diff --git a/Classes/Common/AbstractDocument.php b/Classes/Common/AbstractDocument.php index b5e1f3861..8166ce0a8 100644 --- a/Classes/Common/AbstractDocument.php +++ b/Classes/Common/AbstractDocument.php @@ 
-613,112 +613,6 @@ public function getPhysicalPage(string $logicalPage): int return 1; } - /** - * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an - * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have - * to be given in the Canvas' / Manifest's "seeAlso" property. - * - * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property - * of the Manifest / Range (IIIF) - * - * @return string The OCR full text - */ - protected function getFullTextFromXml(string $id): string - { - $fullText = ''; - // Load available text formats, ... - $this->loadFormats(); - // ... physical structure ... - $this->magicGetPhysicalStructure(); - // ... and extension configuration. - $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'files'); - $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']); - $textFormat = ""; - if (!empty($this->physicalStructureInfo[$id])) { - while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { - if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { - // Get full text file. - $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); - if ($fileContent !== false) { - $textFormat = $this->getTextFormat($fileContent); - } else { - $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"'); - return $fullText; - } - break; - } - } - } else { - $this->logger->warning('Invalid structure node @ID "' . $id . '"'); - return $fullText; - } - // Is this text format supported? 
- // This part actually differs from previous version of indexed OCR - if (!empty($fileContent) && !empty($this->formats[$textFormat])) { - $textMiniOcr = ''; - if (!empty($this->formats[$textFormat]['class'])) { - $textMiniOcr = $this->getRawTextFromClass($id, $fileContent, $textFormat); - } - $fullText = $textMiniOcr; - } else { - $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"'); - } - return $fullText; - } - - /** - * Get raw text from class for given format. - * - * @access private - * - * @param $id - * @param $fileContent - * @param $textFormat - * - * @return string - */ - private function getRawTextFromClass($id, $fileContent, $textFormat): string - { - $textMiniOcr = ''; - $class = $this->formats[$textFormat]['class']; - // Get the raw text from class. - if (class_exists($class)) { - $obj = GeneralUtility::makeInstance($class); - if ($obj instanceof FulltextInterface) { - // Load XML from file. - $ocrTextXml = Helper::getXmlFileAsString($fileContent); - $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml); - $this->rawTextArray[$id] = $textMiniOcr; - } else { - $this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"'); - } - } else { - $this->logger->warning('Class "' . $class . ' does not exists for "' . $textFormat . ' text format"'); - } - return $textMiniOcr; - } - - /** - * Get format of the OCR full text - * - * @access private - * - * @param string $fileContent content of the XML file - * - * @return string The format of the OCR full text - */ - private function getTextFormat(string $fileContent): string - { - $xml = Helper::getXmlFileAsString($fileContent); - - if ($xml !== false) { - // Get the root element's name as text format. 
- return strtoupper($xml->getName()); - } else { - return ''; - } - } - /** * This determines a title for the given document * diff --git a/Classes/Common/FullTextReader.php b/Classes/Common/FullTextReader.php new file mode 100644 index 000000000..02a6c7661 --- /dev/null +++ b/Classes/Common/FullTextReader.php @@ -0,0 +1,157 @@ + + * + * This file is part of the Kitodo and TYPO3 projects. + * + * @license GNU General Public License version 3 or later. + * For the full copyright and license information, please read the + * LICENSE.txt file that was distributed with this source code. + */ + +namespace Kitodo\Dlf\Common; + +use TYPO3\CMS\Core\Configuration\ExtensionConfiguration; +use TYPO3\CMS\Core\Log\Logger; +use TYPO3\CMS\Core\Log\LogManager; +use TYPO3\CMS\Core\Utility\GeneralUtility; + +class FullTextReader +{ + /** + * @access private + * @var Logger This holds the logger + */ + private Logger $logger; + + /** + * @access private + * @var array This holds all formats + */ + private array $formats; + + /** + * Constructor + * + * @param array $formats + */ + public function __construct(array $formats) + { + $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($this)); + $this->formats = $formats; + } + + /** + * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an + * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have + * to be given in the Canvas' / Manifest's "seeAlso" property. 
+ * + * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property + * of the Manifest / Range (IIIF) + * @param array $fileLocations The locations of the XML files + * @param mixed $physicalStructureNode The physical structure node (METS) or the Manifest / Range (IIIF) + * + * @return string The OCR full text + */ + public function getFromXml(string $id, array $fileLocations, $physicalStructureNode): string + { + $fullText = ''; + + $fileGrpsFulltext = $this->getFullTextFileGroups(); + $textFormat = ""; + if (!empty($physicalStructureNode)) { + while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { + if (!empty($physicalStructureNode['files'][$fileGrpFulltext])) { + // Get full text file. + $fileContent = GeneralUtility::getUrl($fileLocations[$fileGrpFulltext]); + if ($fileContent !== false) { + $textFormat = $this->getTextFormat($fileContent); + } else { + $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"'); + return $fullText; + } + break; + } + } + } else { + $this->logger->warning('Invalid structure node @ID "' . $id . '"'); + return $fullText; + } + // Is this text format supported? + // This part actually differs from previous version of indexed OCR + if (!empty($fileContent) && !empty($this->formats[$textFormat])) { + $textMiniOcr = ''; + if (!empty($this->formats[$textFormat]['class'])) { + $textMiniOcr = $this->getRawTextFromClass($fileContent, $textFormat); + } + $fullText = $textMiniOcr; + } else { + $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"'); + } + + return $fullText; + } + + /** + * Get raw text from class for given format. 
+ * + * @access private + * + * @param string $fileContent The content of the XML file + * @param string $textFormat + * + * @return string + */ + private function getRawTextFromClass(string $fileContent, string $textFormat): string + { + $textMiniOcr = ''; + $class = $this->formats[$textFormat]['class']; + // Get the raw text from class. + if (class_exists($class)) { + $obj = GeneralUtility::makeInstance($class); + if ($obj instanceof FulltextInterface) { + // Load XML from file. + $ocrTextXml = Helper::getXmlFileAsString($fileContent); + $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml); + } else { + $this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"'); + } + } else { + $this->logger->warning('Class "' . $class . ' does not exists for "' . $textFormat . ' text format"'); + } + return $textMiniOcr; + } + + /** + * Get full text file groups from extension configuration. + * + * @return array + */ + private function getFullTextFileGroups(): array + { + $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('dlf', 'files'); + return GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']); + } + + /** + * Get format of the OCR full text + * + * @access private + * + * @param string $fileContent The content of the XML file + * + * @return string The format of the OCR full text + */ + private function getTextFormat(string $fileContent): string + { + $xml = Helper::getXmlFileAsString($fileContent); + + if ($xml !== false) { + // Get the root element's name as text format. + return strtoupper($xml->getName()); + } else { + return ''; + } + } +} diff --git a/Classes/Common/IiifManifest.php b/Classes/Common/IiifManifest.php index 61c7c7e5d..b5924f85a 100644 --- a/Classes/Common/IiifManifest.php +++ b/Classes/Common/IiifManifest.php @@ -755,17 +755,23 @@ public function getFullText(string $id): string // ... and extension configuration. 
$extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey); $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['files']['fileGrpFulltext']); - if (!empty($this->physicalStructureInfo[$id])) { + + $physicalStructureNode = $this->physicalStructureInfo[$id]; + if (!empty($physicalStructureNode)) { while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { - if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { - $rawText = parent::getFullTextFromXml($id); + if (!empty($physicalStructureNode['files'][$fileGrpFulltext])) { + $rawText = GeneralUtility::makeInstance(FullTextReader::class, $this->formats)->getFromXml( + $id, + [$fileGrpFulltext => $this->getFileLocation($physicalStructureNode['files'][$fileGrpFulltext])], + $physicalStructureNode + ); break; } } if ($extConf['iiif']['indexAnnotations'] == 1) { $iiifResource = $this->iiif->getContainedResourceById($id); // Get annotation containers - $annotationContainerIds = $this->physicalStructureInfo[$id]['annotationContainers']; + $annotationContainerIds = $physicalStructureNode['annotationContainers']; if (!empty($annotationContainerIds)) { $annotationTexts = $this->getAnnotationTexts($annotationContainerIds, $iiifResource->getId()); $rawText .= implode(' ', $annotationTexts); diff --git a/Classes/Common/MetsDocument.php b/Classes/Common/MetsDocument.php index e9cd5ef08..4f13c606c 100644 --- a/Classes/Common/MetsDocument.php +++ b/Classes/Common/MetsDocument.php @@ -1116,12 +1116,29 @@ function ($element) { public function getFullText(string $id): string { $fullText = ''; - - // Load fileGrps and check for full text files. + // Load available text formats, ... + $this->loadFormats(); + // ... physical structure ... + $this->magicGetPhysicalStructure(); + // ... fileGrps and check for full text files. 
$this->magicGetFileGrps(); + if ($this->hasFulltext) { - $fullText = $this->getFullTextFromXml($id); + $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'files'); + $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']); + + $physicalStructureNode = $this->physicalStructureInfo[$id]; + + $fileLocations = []; + if (!empty($physicalStructureNode)) { + while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { + $fileLocations[$fileGrpFulltext] = $this->getFileLocation($physicalStructureNode['files'][$fileGrpFulltext]); + } + } + + $fullText = GeneralUtility::makeInstance(FullTextReader::class, $this->formats)->getFromXml($id, $fileLocations, $physicalStructureNode); } + return $fullText; } From 2ac3a969a91ffcedbfad54a85f8b38cf12e8bbe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Sch=C3=B6lzel?= <142507449+fschoelzel@users.noreply.github.com> Date: Mon, 16 Dec 2024 22:42:30 +0100 Subject: [PATCH 3/3] [FEATURE] Implement IntlDateFormatter for SolrCoreStatus timestamps (#1394) Co-authored-by: Sebastian Meyer --- .../Hooks/Form/FieldInformation/SolrCoreStatus.php | 14 ++++++++++++-- Resources/Private/Language/de.locallang_be.xlf | 2 +- Resources/Private/Language/locallang_be.xlf | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Classes/Hooks/Form/FieldInformation/SolrCoreStatus.php b/Classes/Hooks/Form/FieldInformation/SolrCoreStatus.php index ddb911beb..f45f4e82e 100644 --- a/Classes/Hooks/Form/FieldInformation/SolrCoreStatus.php +++ b/Classes/Hooks/Form/FieldInformation/SolrCoreStatus.php @@ -12,6 +12,7 @@ namespace Kitodo\Dlf\Hooks\Form\FieldInformation; +use IntlDateFormatter; use Kitodo\Dlf\Common\Helper; use Kitodo\Dlf\Common\Solr\Solr; use TYPO3\CMS\Backend\Form\AbstractNode; @@ -37,6 +38,14 @@ class SolrCoreStatus extends AbstractNode public function render(): array { $result = $this->initializeResultArray(); + + // Get date formatter + $dateFormatter = new 
IntlDateFormatter( + Helper::getLanguageService()->lang, // locale + IntlDateFormatter::MEDIUM, // dateType + IntlDateFormatter::MEDIUM // timeType + ); + // Show only when editing existing records. if ($this->data['command'] !== 'new') { $core = $this->data['databaseRow']['index_name']; @@ -57,8 +66,9 @@ public function render(): array $dateTimeTo = new \DateTime("@$uptimeInSeconds"); $uptime = $dateTimeFrom->diff($dateTimeTo)->format('%a ' . Helper::getLanguageService()->getLL('flash.days') . ', %H:%I:%S'); $numDocuments = $response->getNumberOfDocuments(); - $startTime = $response->getStartTime() ? strftime('%c', $response->getStartTime()->getTimestamp()) : 'N/A'; - $lastModified = $response->getLastModified() ? strftime('%c', $response->getLastModified()->getTimestamp()) : 'N/A'; + $startTime = $response->getStartTime() ? $dateFormatter->format($response->getStartTime()) : 'N/A'; + $lastModified = $response->getLastModified() ? $dateFormatter->format($response->getLastModified()) : 'N/A'; + // Create flash message. Helper::addMessage( sprintf(Helper::getLanguageService()->getLL('flash.coreStatus'), $startTime, $uptime, $lastModified, $numDocuments), diff --git a/Resources/Private/Language/de.locallang_be.xlf b/Resources/Private/Language/de.locallang_be.xlf index 779d0d3f4..ba91b5a6d 100644 --- a/Resources/Private/Language/de.locallang_be.xlf +++ b/Resources/Private/Language/de.locallang_be.xlf @@ -579,7 +579,7 @@ - Uptime: %s
Last Modified: %ss
Number of Documents: %u]]> + Uptime: %s
Last Modified: %s
Number of Documents: %u]]> Laufzeit: %s
Letzte Änderung: %s
Anzahl Dokumente: %u]]>
diff --git a/Resources/Private/Language/locallang_be.xlf b/Resources/Private/Language/locallang_be.xlf index 7b215dcf1..8c90a17cf 100644 --- a/Resources/Private/Language/locallang_be.xlf +++ b/Resources/Private/Language/locallang_be.xlf @@ -450,7 +450,7 @@ - Uptime: %s
Last Modified: %ss
Number of Documents: %u]]> + Uptime: %s
Last Modified: %s
Number of Documents: %u]]>