diff --git a/Classes/Command/BaseCommand.php b/Classes/Command/BaseCommand.php index 76e1a51d44..c6bd8d4435 100644 --- a/Classes/Command/BaseCommand.php +++ b/Classes/Command/BaseCommand.php @@ -22,6 +22,7 @@ use Kitodo\Dlf\Domain\Model\Collection; use Kitodo\Dlf\Domain\Model\Document; use Kitodo\Dlf\Domain\Model\Library; +use Kitodo\Dlf\Validation\DocumentValidator; use Symfony\Component\Console\Command\Command; use TYPO3\CMS\Core\Configuration\ExtensionConfiguration; use TYPO3\CMS\Core\Utility\GeneralUtility; @@ -213,71 +214,76 @@ protected function saveToDatabase(Document $document): bool $doc->cPid = $this->storagePid; $metadata = $doc->getToplevelMetadata($this->storagePid); + $validator = new DocumentValidator($metadata, explode(',', $this->extConf['requiredMetadataFields'])); + + if ($validator->hasAllMandatoryMetadataFields()) { + // set title data + $document->setTitle($metadata['title'][0] ? : ''); + $document->setTitleSorting($metadata['title_sorting'][0] ? : ''); + $document->setPlace(implode('; ', $metadata['place'])); + $document->setYear(implode('; ', $metadata['year'])); + + // Remove appended "valueURI" from authors' names for storing in database. + foreach ($metadata['author'] as $i => $author) { + $splitName = explode(pack('C', 31), $author); + $metadata['author'][$i] = $splitName[0]; + } + $document->setAuthor(implode('; ', $metadata['author'])); + $document->setThumbnail($doc->thumbnail ? : ''); + $document->setMetsLabel($metadata['mets_label'][0] ? : ''); + $document->setMetsOrderlabel($metadata['mets_orderlabel'][0] ? : ''); - // set title data - $document->setTitle($metadata['title'][0] ? : ''); - $document->setTitleSorting($metadata['title_sorting'][0] ? : ''); - $document->setPlace(implode('; ', $metadata['place'])); - $document->setYear(implode('; ', $metadata['year'])); + $structure = $this->structureRepository->findOneByIndexName($metadata['type'][0]); + $document->setStructure($structure); - // Remove appended "valueURI" from authors' names for storing in database. - foreach ($metadata['author'] as $i => $author) { - $splitName = explode(chr(31), $author); - $metadata['author'][$i] = $splitName[0]; - } - $document->setAuthor($this->getAuthors($metadata['author'])); - $document->setThumbnail($doc->thumbnail ? : ''); - $document->setMetsLabel($metadata['mets_label'][0] ? : ''); - $document->setMetsOrderlabel($metadata['mets_orderlabel'][0] ? : ''); + if (is_array($metadata['collection'])) { + $this->addCollections($document, $metadata['collection']); + } - $structure = $this->structureRepository->findOneByIndexName($metadata['type'][0]); - $document->setStructure($structure); + // set identifiers + $document->setProdId($metadata['prod_id'][0] ? : ''); + $document->setOpacId($metadata['opac_id'][0] ? : ''); + $document->setUnionId($metadata['union_id'][0] ? : ''); + + $document->setRecordId($metadata['record_id'][0]); + $document->setUrn($metadata['urn'][0] ? : ''); + $document->setPurl($metadata['purl'][0] ? : ''); + $document->setDocumentFormat($metadata['document_format'][0] ? : ''); + + // set access + $document->setLicense($metadata['license'][0] ? : ''); + $document->setTerms($metadata['terms'][0] ? : ''); + $document->setRestrictions($metadata['restrictions'][0] ? : ''); + $document->setOutOfPrint($metadata['out_of_print'][0] ? : ''); + $document->setRightsInfo($metadata['rights_info'][0] ? : ''); + $document->setStatus(0); + + $this->setOwner($metadata['owner'][0]); + $document->setOwner($this->owner); + + // set volume data + $document->setVolume($metadata['volume'][0] ? : ''); + $document->setVolumeSorting($metadata['volume_sorting'][0] ? : $metadata['mets_order'][0] ? : ''); + + // Get UID of parent document. + if ($document->getDocumentFormat() === 'METS') { + $document->setPartof($this->getParentDocumentUidForSaving($document)); + } - if (is_array($metadata['collection'])) { - $this->addCollections($document, $metadata['collection']); - } + if ($document->getUid() === null) { + // new document + $this->documentRepository->add($document); + } else { + // update of existing document + $this->documentRepository->update($document); + } - // set identifiers - $document->setProdId($metadata['prod_id'][0] ? : ''); - $document->setOpacId($metadata['opac_id'][0] ? : ''); - $document->setUnionId($metadata['union_id'][0] ? : ''); - - $document->setRecordId($metadata['record_id'][0] ? : ''); // (?) $doc->recordId - $document->setUrn($metadata['urn'][0] ? : ''); - $document->setPurl($metadata['purl'][0] ? : ''); - $document->setDocumentFormat($metadata['document_format'][0] ? : ''); - - // set access - $document->setLicense($metadata['license'][0] ? : ''); - $document->setTerms($metadata['terms'][0] ? : ''); - $document->setRestrictions($metadata['restrictions'][0] ? : ''); - $document->setOutOfPrint($metadata['out_of_print'][0] ? : ''); - $document->setRightsInfo($metadata['rights_info'][0] ? : ''); - $document->setStatus(0); - - $this->setOwner($metadata['owner'][0]); - $document->setOwner($this->owner); - - // set volume data - $document->setVolume($metadata['volume'][0] ? : ''); - $document->setVolumeSorting($metadata['volume_sorting'][0] ? : $metadata['mets_order'][0] ? : ''); - - // Get UID of parent document. - if ($document->getDocumentFormat() === 'METS') { - $document->setPartof($this->getParentDocumentUidForSaving($document)); - } + $this->persistenceManager->persistAll(); - if ($document->getUid() === null) { - // new document - $this->documentRepository->add($document); - } else { - // update of existing document - $this->documentRepository->update($document); + return true; } - $this->persistenceManager->persistAll(); - - return true; + return false; } /** diff --git a/Classes/Command/IndexCommand.php b/Classes/Command/IndexCommand.php index c8dbd4ce65..746c3b2589 100644 --- a/Classes/Command/IndexCommand.php +++ b/Classes/Command/IndexCommand.php @@ -180,20 +180,29 @@ protected function execute(InputInterface $input, OutputInterface $output): int if ($dryRun) { $io->section('DRY RUN: Would index ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . ' and Solr core ' . $solrCoreUid . '.'); + $io->success('All done!'); + return BaseCommand::SUCCESS; } else { + $document->setCurrentDocument($doc); + if ($io->isVerbose()) { - $io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . ' and Solr core ' . $solrCoreUid . '.'); + $io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . '.'); } - $document->setCurrentDocument($doc); - // save to database - $this->saveToDatabase($document); - // add to index - Indexer::add($document, $this->documentRepository); - } + $isSaved = $this->saveToDatabase($document); - $io->success('All done!'); + if ($isSaved) { + if ($io->isVerbose()) { + $io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on Solr core ' . $solrCoreUid . '.'); + } + Indexer::add($document, $this->documentRepository); - return BaseCommand::SUCCESS; + $io->success('All done!'); + return BaseCommand::SUCCESS; + } + + $io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on PID ' . $this->storagePid . ' . There are missing mandatory fields (document format or record identifier) in this document.'); + return BaseCommand::FAILURE; + } } /** diff --git a/Classes/Validation/DocumentValidator.php b/Classes/Validation/DocumentValidator.php new file mode 100644 index 0000000000..82d3f0f378 --- /dev/null +++ b/Classes/Validation/DocumentValidator.php @@ -0,0 +1,141 @@ + + * + * This file is part of the Kitodo and TYPO3 projects. + * + * @license GNU General Public License version 3 or later. + * For the full copyright and license information, please read the + * LICENSE.txt file that was distributed with this source code. + */ + +namespace Kitodo\Dlf\Validation; + +use TYPO3\CMS\Core\Log\Logger; +use TYPO3\CMS\Core\Log\LogManager; +use TYPO3\CMS\Core\Utility\GeneralUtility; + +/** + * Class for document validation. Currently used for validating metadata + * fields but in the future should be extended also for other fields. + * + * @package TYPO3 + * @subpackage dlf + * + * @access public + */ +class DocumentValidator +{ + /** + * @access protected + * @var Logger This holds the logger + */ + protected Logger $logger; + + /** + * @access private + * @var array + */ + private array $metadata; + + /** + * @access private + * @var array + */ + private array $requiredMetadataFields; + + /** + * @access private + * @var ?\SimpleXMLElement + */ + private ?\SimpleXMLElement $xml; + + /** + * Constructs DocumentValidator instance. + * + * @access public + * + * @param array $metadata + * @param array $requiredMetadataFields + * + * @return void + */ + public function __construct(array $metadata = [], array $requiredMetadataFields = [], ?\SimpleXMLElement $xml = null) + { + $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(static::class); + $this->metadata = $metadata; + $this->requiredMetadataFields = $requiredMetadataFields; + $this->xml = $xml; + } + + /** + * Check if metadata array contains all mandatory fields before save. + * + * @access public + * + * @return bool + */ + public function hasAllMandatoryMetadataFields(): bool + { + foreach ($this->requiredMetadataFields as $requiredMetadataField) { + if (empty($this->metadata[$requiredMetadataField][0])) { + $this->logger->error('Missing required metadata field "' . $requiredMetadataField . '".'); + return false; + } + } + return true; + } + + /** + * Check if xml contains at least one logical structure with given type. + * + * @access public + * + * @param string $type e.g. documentary, newspaper or object + * + * @return bool + */ + public function hasCorrectLogicalStructure(string $type): bool + { + $expectedNodes = $this->xml->xpath('./mets:structMap[@TYPE="LOGICAL"]/mets:div[@TYPE="' . $type . '"]'); + if ($expectedNodes) { + return true; + } + + $existingNodes = $this->xml->xpath('./mets:structMap[@TYPE="LOGICAL"]/mets:div'); + if ($existingNodes) { + $this->logger->error('Document contains logical structure but @TYPE="' . $type . '" is missing.'); + return false; + } + + $this->logger->error('Document does not contain logical structure.'); + return false; + } + + /** + * Check if xml contains at least one physical structure with type 'physSequence'. + * + * @access public + * + * @return bool + */ + public function hasCorrectPhysicalStructure(): bool + { + $physSequenceNodes = $this->xml->xpath('./mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]'); + if ($physSequenceNodes) { + return true; + } + + $physicalStructureNodes = $this->xml->xpath('./mets:structMap[@TYPE="PHYSICAL"]/mets:div'); + if ($physicalStructureNodes) { + $this->logger->error('Document contains physical structure but @TYPE="physSequence" is missing.'); + return false; + } + + $this->logger->error('Document does not contain physical structure.'); + return false; + } +} diff --git a/Resources/Private/Language/de.locallang_labels.xlf b/Resources/Private/Language/de.locallang_labels.xlf index 0d3adafdad..7b11028234 100644 --- a/Resources/Private/Language/de.locallang_labels.xlf +++ b/Resources/Private/Language/de.locallang_labels.xlf @@ -665,6 +665,10 @@ Verwende externe APIs zum Abrufen von Metadaten?: (Standard ist "FALSE") Use external APIs for getting metadata?: (default is "FALSE") + + Für die Indizierung von Dokumenten erforderliche Metadatenfelder + Metadata fields required for indexing documents + Seiten fileGrps: Komma-getrennte Liste der @USE Attributwerte der Seitenansichten nach aufsteigender Größe sortiert (Standard ist "DEFAULT,MAX") Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX") diff --git a/Resources/Private/Language/locallang_labels.xlf b/Resources/Private/Language/locallang_labels.xlf index 89938425c9..802dc413d7 100644 --- a/Resources/Private/Language/locallang_labels.xlf +++ b/Resources/Private/Language/locallang_labels.xlf @@ -500,6 +500,9 @@ Use external APIs for getting metadata?: (default is "FALSE") + + Metadata fields required for indexing documents + Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX") diff --git a/ext_conf_template.txt b/ext_conf_template.txt index 7f96ebe0e0..8fc469c425 100644 --- a/ext_conf_template.txt +++ b/ext_conf_template.txt @@ -14,6 +14,8 @@ publishNewCollections = 1 unhideOnIndex = 0 # cat=Basic; type=boolean; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.useExternalApisForMetadata useExternalApisForMetadata = 0 +# cat=Document; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.requiredMetadataFields +requiredMetadataFields = document_format,record_id # cat=Files; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.fileGrpImages fileGrpImages = DEFAULT,MAX # cat=Files; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.fileGrpThumbs