Skip to content

Commit

Permalink
Merge branch 'main' into 3dviewer-selection
Browse files Browse the repository at this point in the history
  • Loading branch information
markusweigelt authored Jan 13, 2025
2 parents d67d750 + 4b3a130 commit 2923761
Show file tree
Hide file tree
Showing 16 changed files with 742 additions and 599 deletions.
4 changes: 2 additions & 2 deletions Classes/Command/ReindexCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
namespace Kitodo\Dlf\Command;

use Kitodo\Dlf\Common\AbstractDocument;
use Kitodo\Dlf\Command\BaseCommand;
use Kitodo\Dlf\Common\DocumentCacheManager;
use Kitodo\Dlf\Common\Indexer;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
Expand Down Expand Up @@ -224,7 +224,7 @@ protected function execute(InputInterface $input, OutputInterface $output): int
Indexer::add($document, $this->documentRepository, $input->getOption('softCommit'));
}
// Clear document cache to prevent memory exhaustion.
AbstractDocument::clearDocumentCache();
GeneralUtility::makeInstance(DocumentCacheManager::class)->flush();
}

// Clear state of persistence manager to prevent memory exhaustion.
Expand Down
237 changes: 70 additions & 167 deletions Classes/Common/AbstractDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

namespace Kitodo\Dlf\Common;

use TYPO3\CMS\Core\Cache\CacheManager;
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
use TYPO3\CMS\Core\Database\ConnectionPool;
use TYPO3\CMS\Core\Log\Logger;
Expand Down Expand Up @@ -275,6 +274,14 @@ abstract class AbstractDocument
*/
protected string $toplevelId = '';

/**
* Holds the configured useGrps as array.
*
* @var array
* @access protected
*/
protected array $useGroups = [];

/**
* @access protected
* @var \SimpleXMLElement This holds the whole XML file as \SimpleXMLElement object
Expand Down Expand Up @@ -445,11 +452,9 @@ abstract protected function magicGetSmLinks(): array;
*
* @abstract
*
* @param bool $forceReload Force reloading the thumbnail instead of returning the cached value
*
* @return string The document's thumbnail location
*/
abstract protected function magicGetThumbnail(bool $forceReload = false): string;
abstract protected function magicGetThumbnail(): string;

/**
* This returns the ID of the toplevel logical structure node
Expand Down Expand Up @@ -536,7 +541,7 @@ public static function &getInstance(string $location, array $settings = [], bool
$iiif = null;

if (!$forceReload) {
$instance = self::getDocumentCache($location);
$instance = GeneralUtility::makeInstance(DocumentCacheManager::class)->get($location);
if ($instance !== false) {
return $instance;
}
Expand All @@ -546,9 +551,6 @@ public static function &getInstance(string $location, array $settings = [], bool

// Try to load a file from the url
if (GeneralUtility::isValidUrl($location)) {
// Load extension configuration
$extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

$content = Helper::getUrl($location);
if ($content !== false) {
$xml = Helper::getXmlFileAsString($content);
Expand All @@ -561,10 +563,7 @@ public static function &getInstance(string $location, array $settings = [], bool
// Try to load file as IIIF resource instead.
$contentAsJsonArray = json_decode($content, true);
if ($contentAsJsonArray !== null) {
IiifHelper::setUrlReader(IiifUrlReader::getInstance());
IiifHelper::setMaxThumbnailHeight($extConf['iiif']['thumbnailHeight']);
IiifHelper::setMaxThumbnailWidth($extConf['iiif']['thumbnailWidth']);
$iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
$iiif = self::loadIiifResource($contentAsJsonArray);
if ($iiif instanceof IiifResourceInterface) {
$documentFormat = 'IIIF';
}
Expand All @@ -584,27 +583,12 @@ public static function &getInstance(string $location, array $settings = [], bool
}

if ($instance !== null) {
self::setDocumentCache($location, $instance);
GeneralUtility::makeInstance(DocumentCacheManager::class)->set($location, $instance);
}

return $instance;
}

/**
 * Flush every entry of the document cache.
 *
 * Empties the "tx_dlf_doc" cache obtained from the TYPO3 cache manager.
 *
 * @access public
 *
 * @static
 *
 * @return void
 */
public static function clearDocumentCache(): void
{
    GeneralUtility::makeInstance(CacheManager::class)
        ->getCache('tx_dlf_doc')
        ->flush();
}

/**
* This returns the first corresponding physical page number of a given logical page label
*
Expand Down Expand Up @@ -636,109 +620,17 @@ public function getPhysicalPage(string $logicalPage): int
}

/**
 * Extract the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
 * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
 * to be given in the Canvas' / Manifest's "seeAlso" property.
 *
 * The first configured full text file group that has a file for the given node is used.
 * An empty string is returned (and a warning logged) when the node is unknown, the file
 * cannot be loaded or its text format is not supported.
 *
 * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
 * of the Manifest / Range (IIIF)
 *
 * @return string The OCR full text
 */
protected function getFullTextFromXml(string $id): string
{
    // Text formats, physical structure and extension configuration are loaded up front.
    $this->loadFormats();
    $this->magicGetPhysicalStructure();
    $filesConfiguration = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'files');
    $candidateGroups = GeneralUtility::trimExplode(',', $filesConfiguration['fileGrpFulltext']);

    if (empty($this->physicalStructureInfo[$id])) {
        $this->logger->warning('Invalid structure node @ID "' . $id . '"');
        return '';
    }

    $fileContent = null;
    $textFormat = '';
    // Walk the configured file groups in order and stop at the first one providing a file.
    while ($candidateGroup = array_shift($candidateGroups)) {
        if (empty($this->physicalStructureInfo[$id]['files'][$candidateGroup])) {
            continue;
        }
        // Get full text file.
        $fileContent = GeneralUtility::getUrl(
            $this->getFileLocation($this->physicalStructureInfo[$id]['files'][$candidateGroup])
        );
        if ($fileContent === false) {
            $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
            return '';
        }
        $textFormat = $this->getTextFormat($fileContent);
        break;
    }

    // Is this text format supported?
    if (empty($fileContent) || empty($this->formats[$textFormat])) {
        $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
        return '';
    }

    // A known format without a handler class yields no text (and no warning).
    if (empty($this->formats[$textFormat]['class'])) {
        return '';
    }

    return $this->getRawTextFromClass($id, $fileContent, $textFormat);
}

/**
 * Get the MiniOCR text for a full text file using the class registered for its format.
 *
 * The class name is read from $this->formats[$textFormat]['class']. If the class exists and
 * implements FulltextInterface, the file content is parsed as XML, converted with
 * getTextAsMiniOcr() and the result is stored in $this->rawTextArray[$id]. Otherwise a
 * warning is logged and an empty string is returned.
 *
 * @access private
 *
 * @param string $id The identifier of the physical structure node, used as key in $this->rawTextArray
 * @param string $fileContent The content of the full text file
 * @param string $textFormat The text format, used as key in $this->formats
 *
 * @return string The text in MiniOCR format or an empty string
 */
private function getRawTextFromClass($id, $fileContent, $textFormat): string
{
    $class = $this->formats[$textFormat]['class'];

    if (!class_exists($class)) {
        $this->logger->warning('Class "' . $class . '" does not exist for text format "' . $textFormat . '"');
        return '';
    }

    $obj = GeneralUtility::makeInstance($class);
    if (!($obj instanceof FulltextInterface)) {
        // The log message names the method that is actually invoked below.
        $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
        return '';
    }

    // Load XML from file and convert it.
    $ocrTextXml = Helper::getXmlFileAsString($fileContent);
    $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
    // Remember the converted text for this node.
    $this->rawTextArray[$id] = $textMiniOcr;

    return $textMiniOcr;
}

/**
* Get format of the OCR full text
* Get the configuration for given use group.
*
* @access private
* @access protected
*
* @param string $fileContent content of the XML file
* @param string $use
*
* @return string The format of the OCR full text
* @return array|string
*/
private function getTextFormat(string $fileContent): string
protected function getUseGroup(string $use)
{
$xml = Helper::getXmlFileAsString($fileContent);

if ($xml !== false) {
// Get the root element's name as text format.
return strtoupper($xml->getName());
} else {
return '';
}
return array_key_exists($use, $this->getUseGroups()) ? $this->useGroups[$use] : [];
}

/**
Expand Down Expand Up @@ -867,6 +759,37 @@ public function getStructureDepth(string $logId)
return $this->getTreeDepth($this->magicGetTableOfContents(), 1, $logId);
}

/**
 * Get the configured use groups, keyed by their configuration key.
 *
 * The result is built once from the "files" extension configuration and kept in
 * $this->useGroups; configuration keys with an empty value are left out.
 *
 * @access protected
 *
 * @return array
 */
protected function getUseGroups(): array
{
    // Already populated: nothing to read again.
    if (!empty($this->useGroups)) {
        return $this->useGroups;
    }

    // Get configured USE attributes.
    $filesConfiguration = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'files');

    $supportedKeys = [
        'fileGrpImages',
        'fileGrpThumbs',
        'fileGrpDownload',
        'fileGrpFulltext',
        'fileGrpAudio',
        'fileGrpScore'
    ];

    foreach ($supportedKeys as $configKey) {
        if (empty($filesConfiguration[$configKey])) {
            continue;
        }
        // Each setting is a comma-separated list of file group names.
        $this->useGroups[$configKey] = GeneralUtility::trimExplode(',', $filesConfiguration[$configKey]);
    }

    return $this->useGroups;
}

/**
* Load XML file / IIIF resource from URL
*
Expand Down Expand Up @@ -927,6 +850,27 @@ protected function loadFormats(): void
}
}

/**
 * Configure the IIIF helper and load an IIIF resource.
 *
 * Sets the URL reader and the maximum thumbnail dimensions (taken from the "iiif"
 * extension configuration) on IiifHelper before delegating to IiifHelper::loadIiifResource().
 *
 * @access protected
 *
 * @static
 *
 * @param string|array $resource IIIF resource. Can be an IRI, the JSON document as string
 * or a dictionary in form of a PHP associative array
 *
 * @return NULL|\Ubl\Iiif\Presentation\Common\Model\AbstractIiifEntity An instance of the IIIF resource
 */
protected static function loadIiifResource($resource): mixed
{
    $iiifSettings = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'iiif');

    $urlReader = IiifUrlReader::getInstance();
    IiifHelper::setUrlReader($urlReader);

    $maxHeight = $iiifSettings['thumbnailHeight'];
    IiifHelper::setMaxThumbnailHeight($maxHeight);

    $maxWidth = $iiifSettings['thumbnailWidth'];
    IiifHelper::setMaxThumbnailWidth($maxWidth);

    return IiifHelper::loadIiifResource($resource);
}

/**
* Register all available namespaces for a \SimpleXMLElement object
*
Expand Down Expand Up @@ -1260,45 +1204,4 @@ public function __set(string $var, $value): void
$this->$method($value);
}
}

/**
 * Look up a document instance in the "tx_dlf_doc" cache.
 *
 * The cache identifier is the MD5 hash of the document location.
 *
 * @access private
 *
 * @static
 *
 * @param string $location
 *
 * @return AbstractDocument|false
 */
private static function getDocumentCache(string $location)
{
    $documentCache = GeneralUtility::makeInstance(CacheManager::class)->getCache('tx_dlf_doc');

    return $documentCache->get(hash('md5', $location));
}

/**
 * Store a document instance in the "tx_dlf_doc" cache.
 *
 * The cache identifier is the MD5 hash of the document location.
 *
 * @access private
 *
 * @static
 *
 * @param string $location
 * @param AbstractDocument $currentDocument
 *
 * @return void
 */
private static function setDocumentCache(string $location, AbstractDocument $currentDocument): void
{
    $entryIdentifier = hash('md5', $location);

    // Save value in cache
    GeneralUtility::makeInstance(CacheManager::class)
        ->getCache('tx_dlf_doc')
        ->set($entryIdentifier, $currentDocument);
}
}
Loading

0 comments on commit 2923761

Please sign in to comment.