From 833dac713c1dae3f7c4a3d931abdef866f670213 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 10 Jun 2024 23:49:42 +0200 Subject: [PATCH 001/249] deprecate Processor.process() --- src/ocrd/__init__.py | 2 +- src/ocrd/processor/base.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/ocrd/__init__.py b/src/ocrd/__init__.py index 62b6ffbc0..9aa507b2c 100644 --- a/src/ocrd/__init__.py +++ b/src/ocrd/__init__.py @@ -14,7 +14,7 @@ """ -from ocrd.processor.base import run_processor, run_cli, Processor +from ocrd.processor.base import run_processor, run_cli, Processor, ResourceNotFoundError from ocrd_models import OcrdMets, OcrdExif, OcrdFile, OcrdAgent from ocrd.resolver import Resolver from ocrd_validators import * diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 931d945d4..6b10d61b0 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -18,8 +18,9 @@ import sys import tarfile import io -from ocrd.workspace import Workspace +from deprecated import deprecated +from ocrd.workspace import Workspace from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -175,6 +176,9 @@ def __init__( if not report.is_valid: raise Exception("Invalid parameters %s" % report.errors) self.parameter = parameter + # workaround for deprecated#72 (deprecation does not work for subclasses): + setattr(self, 'process', + deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) def show_help(self, subcommand=None): print(generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)) @@ -188,6 +192,7 @@ def verify(self): """ return True + @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') def process(self) -> None: """ Process the :py:attr:`workspace` From 3f4c7f99a70bcbb881c4eed43315eacf8117fbdc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 11 Jun 2024 00:36:29 +0200 Subject: [PATCH 002/249] fix #274: no default -I / -O --- src/ocrd/decorators/__init__.py | 2 ++ src/ocrd/decorators/ocrd_cli_options.py | 7 ++----- src/ocrd/processor/base.py | 7 ++----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 811587a10..cbeadc8d7 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -71,6 +71,8 @@ def ocrd_cli_wrap_processor( initLogging() LOG = getLogger('ocrd.cli_wrap_processor') + assert kwargs['input_file_grp'] is not None + assert kwargs['output_file_grp'] is not None # LOG.info('kwargs=%s' % kwargs) if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index f32955838..e640a2003 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -29,11 +29,8 @@ def cli(mets_url): option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME), option('-w', '--working-dir', help="Working Directory"), option('-U', '--mets-server-url', help="METS server URL. 
Starts with http:// then TCP, otherwise unix socket path"), - # TODO OCR-D/core#274 - # option('-I', '--input-file-grp', required=True), - # option('-O', '--output-file-grp', required=True), - option('-I', '--input-file-grp', default='INPUT'), - option('-O', '--output-file-grp', default='OUTPUT'), + option('-I', '--input-file-grp', default=None), + option('-O', '--output-file-grp', default=None), option('-g', '--page-id'), option('--overwrite', is_flag=True, default=False), option('--profile', is_flag=True, default=False), diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 6b10d61b0..b0cb1e26a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -67,11 +67,8 @@ def __init__( workspace : Workspace, ocrd_tool=None, parameter=None, - # TODO OCR-D/core#274 - # input_file_grp=None, - # output_file_grp=None, - input_file_grp="INPUT", - output_file_grp="OUTPUT", + input_file_grp=None, + output_file_grp=None, page_id=None, resolve_resource=None, show_resource=None, From d2b5df3a0ad0293b258149dad242cb56964206c0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 18 Jun 2024 11:19:20 +0200 Subject: [PATCH 003/249] workspace.download: fix typo in exception --- src/ocrd/workspace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 41ea8e900..8ce42a070 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -208,7 +208,7 @@ def download_file(self, f, _recursion_count=0): self.baseurl, f.local_filename) url = '%s/%s' % (self.baseurl, f.local_filename) else: - raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file," + raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file, " "and no 'url' to download and no 'baseurl' set on workspace - nothing we can do.") file_path = Path(f.local_filename) self.resolver.download_to_directory(self.directory, url, subdir=file_path.parent, basename=file_path.name) @@ -219,7 +219,7 @@ def download_file(self, f, _recursion_count=0): f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) return f # If neither f.local_filename nor f.url is set, fail - raise ValueError("OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") + raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False): """ From 9827c4d18d42f36a94c65621442be29a98e7254e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 15:00:33 +0200 Subject: [PATCH 004/249] Processor: factor-out show_resource(), delegate to resolve_resource() --- src/ocrd/processor/base.py | 41 +++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index b0cb1e26a..263f81d63 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -129,27 +129,22 @@ def __init__( for res in self.list_all_resources(): print(res) return - if resolve_resource or show_resource: - initLogging() + if resolve_resource: try: - res_fname = self.resolve_resource(resolve_resource or show_resource) + res = self.resolve_resource(resolve_resource) + print(res) + except ResourceNotFoundError as e: + log = getLogger('ocrd.processor.base') + log.critical(e.message) + sys.exit(1) + return + if show_resource: + 
try: + self.show_resource(show_resource) except ResourceNotFoundError as e: log = getLogger('ocrd.processor.base') log.critical(e.message) sys.exit(1) - if resolve_resource: - print(res_fname) - return - fpath = Path(res_fname) - if fpath.is_dir(): - with pushd_popd(fpath): - fileobj = io.BytesIO() - with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball: - tarball.add('.') - fileobj.seek(0) - copyfileobj(fileobj, sys.stdout.buffer) - else: - sys.stdout.buffer.write(fpath.read_bytes()) return if show_help: self.show_help(subcommand=subcommand) @@ -235,6 +230,7 @@ def resolve_resource(self, val): Args: val (string): resource value to resolve """ + initLogging() executable = self.ocrd_tool['executable'] log = getLogger('ocrd.processor.base') if exists(val): @@ -252,6 +248,19 @@ def resolve_resource(self, val): return ret[0] raise ResourceNotFoundError(val, executable) + def show_resource(self, val): + res_fname = self.resolve_resource(val) + fpath = Path(res_fname) + if fpath.is_dir(): + with pushd_popd(fpath): + fileobj = io.BytesIO() + with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball: + tarball.add('.') + fileobj.seek(0) + copyfileobj(fileobj, sys.stdout.buffer) + else: + sys.stdout.buffer.write(fpath.read_bytes()) + def list_all_resources(self): """ List all resources found in the filesystem and matching content-type by filename suffix From 38fd4aafdcafee803fce03a12aa4810cf4a2fba6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 15:00:59 +0200 Subject: [PATCH 005/249] Processor: add setup(), run once in get_processor() --- src/ocrd/processor/base.py | 11 ++++++++++- src/ocrd/processor/builtin/dummy_processor.py | 5 +++++ src/ocrd/processor/helpers.py | 4 +++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 263f81d63..5338f729c 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -184,6 +184,16 @@ def verify(self): """ return True + def setup(self) -> None: + """ + Prepare the processor for actual data processing, + prior to changing to the workspace directory but + after parsing parameters. + + (Override this to load models into memory etc.) 
+ """ + pass + @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') def process(self) -> None: """ @@ -197,7 +207,6 @@ def process(self) -> None: """ raise NotImplementedError() - def add_metadata(self, pcgts): """ Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 774332a73..9223118c9 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -83,6 +83,11 @@ def __init__(self, *args, **kwargs): kwargs['version'] = '0.0.3' super(DummyProcessor, self).__init__(*args, **kwargs) + def setup(self): + super().setup() + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + @click.command() @ocrd_cli_options def cli(*args, **kwargs): diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index f5b601063..9b74671ca 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -398,11 +398,13 @@ def get_processor( cached_processor.input_file_grp = input_file_grp cached_processor.output_file_grp = output_file_grp return cached_processor - return processor_class( + processor = processor_class( workspace=workspace, page_id=page_id, input_file_grp=input_file_grp, output_file_grp=output_file_grp, parameter=parameter ) + processor.setup() + return processor raise ValueError("Processor class is not known") From 580988ad5c6422bbb7eaaa68c783b4ac156d30ee Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 13:50:36 +0200 Subject: [PATCH 006/249] ocrd_cli_wrap_processor: fix workspace arg (not a kwarg) --- src/ocrd/decorators/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index cbeadc8d7..3d0795702 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -48,11 +48,11 @@ def ocrd_cli_wrap_processor( **kwargs ): if not sys.argv[1:]: - processorClass(workspace=None, show_help=True) + processorClass(None, show_help=True) sys.exit(1) if dump_json or dump_module_dir or help or version or show_resource or list_resources: processorClass( - workspace=None, + None, dump_json=dump_json, dump_module_dir=dump_module_dir, show_help=help, From 224dfc5098e9912b9c2bf87f851a52e79b51250b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:00:32 +0200 Subject: [PATCH 007/249] =?UTF-8?q?Processor:=20refactor=20processing=20AP?= =?UTF-8?q?I=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add method `process_workspace(workspace)` as a replacement for passing `workspace` in the constructor and then calling `process` (implemented by subclasses): implement in the superclass - loop over input files - delegate processing to new method `process_page_file()` if possible - otherwise fall back to old `process()` outside of loop - download input files when needed if `self.download` - add method `process_page_file()` as single-page processing procedure on OcrdFiles: implement in the superclass for the most frequent/default use-case of - (multi-) image/PAGE input files - (single) PAGE output files - delegate to new method `process_page_pcgts()` if available - add PAGE processing metadata - set PAGE PcGtsId - handle `make_file_id` and `workspace.add_file` - add method 
`process_page_pcgts()` as single-page processing function on OcrdPage: to be implemented only by subclasses - constructor: add kwarg `download_files` controlling `self.download` (see above) --- src/ocrd/processor/base.py | 117 ++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 8 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5338f729c..78bc47c47 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -15,6 +15,7 @@ import os from os import getcwd from pathlib import Path +from typing import Optional import sys import tarfile import io @@ -32,9 +33,11 @@ list_all_resources, get_processor_resource_types, resource_filename, + make_file_id, ) from ocrd_validators import ParameterValidator -from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType +from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml +from ocrd_modelfactory import page_from_file # XXX imports must remain for backwards-compatibility from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import @@ -64,12 +67,15 @@ class Processor(): def __init__( self, - workspace : Workspace, + # FIXME: deprecate in favor of process_workspace(workspace) + workspace : Optional[Workspace], ocrd_tool=None, parameter=None, input_file_grp=None, output_file_grp=None, page_id=None, + download_files=True, + # FIXME: deprecate all the following in favor of respective methods resolve_resource=None, show_resource=None, list_resources=False, @@ -99,6 +105,7 @@ def __init__( output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ (or empty for all pages). + download_files (boolean): Whether input files will be downloaded prior to processing. resolve_resource (string): If not ``None``, then instead of processing, resolve \ given resource by name and print its full path to stdout. show_resource (string): If not ``None``, then instead of processing, resolve \ @@ -154,15 +161,17 @@ def __init__( self.show_version() return self.workspace = workspace - # FIXME HACK would be better to use pushd_popd(self.workspace.directory) - # but there is no way to do that in process here since it's an - # overridden method. chdir is almost always an anti-pattern. if self.workspace: + # FIXME deprecate setting this and calling process() over using process_workspace() + # which uses pushd_popd(self.workspace.directory) + # (because there is no way to do that in process() since it's an + # overridden method. chdir is almost always an anti-pattern.) self.old_pwd = getcwd() os.chdir(self.workspace.directory) self.input_file_grp = input_file_grp self.output_file_grp = output_file_grp self.page_id = None if page_id == [] or page_id is None else page_id + self.download = download_files parameterValidator = ParameterValidator(ocrd_tool) report = parameterValidator.validate(parameter) if not report.is_valid: @@ -197,17 +206,109 @@ def setup(self) -> None: @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') def process(self) -> None: """ - Process the :py:attr:`workspace` + Process all files of the :py:attr:`workspace` from the given :py:attr:`input_file_grp` to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` + for the given :py:attr:`page_id` (or all pages) under the given :py:attr:`parameter`. 
(This contains the main functionality and needs to be overridden by subclasses.) """ raise NotImplementedError() - def add_metadata(self, pcgts): + def process_workspace(self, workspace: Workspace) -> None: + """ + Process all files of the given ``workspace``, + from the given :py:attr:`input_file_grp` + to the given :py:attr:`output_file_grp` + for the given :py:attr:`page_id` (or all pages) + under the given :py:attr:`parameter`. + + (This will iterate over pages and files, calling + :py:meth:`process_page`, handling exceptions.) + """ + # assert self.input_file_grp is not None + # assert self.output_file_grp is not None + # input_file_grps = self.input_file_grp.split(',') + # for input_file_grp in input_file_grps: + # assert input_file_grp in workspace.mets.file_groups + log = getLogger('ocrd.processor.base') + with pushd_popd(workspace.directory): + self.workspace = workspace + try: + # FIXME: add page parallelization by running multiprocessing.Pool (#322) + for input_file_tuple in self.zip_input_files(on_error='abort'): + # FIXME: add error handling by catching exceptions in various ways (#579) + # for example: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + input_files = [None] * len(input_file_tuple) + for i, input_file in enumerate(input_file_tuple): + if i == 0: + log.info("processing page %s", input_file.pageId) + elif input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except ValueError as e: + log.error(repr(e)) + log.warning("skipping file %s for page %s", input_file, input_file.pageId) + self.process_page_file(*input_files) + except NotImplementedError: + # fall back to deprecated method + self.process() + + def process_page_file(self, *input_files) -> None: + """ + Process the given ``input_files`` of the :py:attr:`workspace`, + representing one physical page (passed as one opened + :py:class:`~ocrd_models.OcrdFile` per input fileGrp) + under the given :py:attr:`parameter`, and make sure the + results get added accordingly. + + (This uses process_page_pcgts, but can be overridden by subclasses + to handle cases like multiple fileGrps, non-PAGE input etc.) + """ + log = getLogger('ocrd.processor.base') + input_pcgts = [None] * len(input_files) + for i, input_file in enumerate(input_files): + # FIXME: what about non-PAGE input like image or JSON ??? + log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) + try: + input_pcgts[i] = page_from_file(input_file) + except ValueError as e: + log.info("non-PAGE input for page %s: %s", input_file.pageId, e) + output_pcgts = self.process_page_pcgts(*input_pcgts) + output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_pcgts.set_pcGtsId(output_file_id) + self.add_metadata(output_pcgts) + # FIXME: what about save_image_file in process_page ??? + # FIXME: what about non-PAGE output like JSON ??? 
+ self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_files[0].pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(output_pcgts)) + + def process_page_pcgts(self, *input_pcgts) -> OcrdPage: + """ + Process the given ``input_pcgts`` of the :py:attr:`workspace`, + representing one physical page (passed as one parsed + :py:class:`~ocrd_models.OcrdPage` per input fileGrp) + under the given :py:attr:`parameter`, and return the + resulting :py:class:`~ocrd_models.OcrdPage`. + + (This contains the main functionality and must be overridden by subclasses.) + """ + raise NotImplementedError() + + def add_metadata(self, pcgts: OcrdPage) -> None: """ Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. From 9714aaba47f74d5023255b20f0d9136eaf6cc12e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:56:28 +0200 Subject: [PATCH 008/249] DummyProcessor: re-implement via new process_page_* - implement `process_page_pcgts` with behaviour for `copy_files=False` - override superclass `process_page_file` with behaviour for `copy_files=True` - remove old `process` implementation --- src/ocrd/processor/builtin/dummy_processor.py | 75 ++++++++----------- 1 file changed, 32 insertions(+), 43 deletions(-) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 9223118c9..d16e18271 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -24,59 +24,48 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process(self) -> None: + def process_page_pcgts(self, *input_pcgts): + # nothing to do here + return input_pcgts[0] + + def process_page_file(self, *input_files): LOG = getLogger('ocrd.dummy') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - copy_files = self.parameter['copy_files'] - for input_file in self.input_files: - input_file = self.workspace.download_file(input_file) + input_file = input_files[0] + if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE: + # we need to mimic the actual copying in addition to the PAGE boilerplate file_id = make_file_id(input_file, self.output_file_grp) ext = MIME_TO_EXT.get(input_file.mimetype, '') local_filename = join(self.output_file_grp, file_id + ext) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - if input_file.mimetype == MIMETYPE_PAGE: - LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) - # Source file is PAGE-XML: Write out in-memory PcGtsType - self.workspace.add_file( + LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) + with open(input_file.local_filename, 'rb') as f: + content = f.read() + output_file = self.workspace.add_file( file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, - content=to_xml(pcgts).encode('utf-8')) - else: - # Source file is not PAGE-XML: Copy byte-by-byte unless copy_files is False - if not copy_files: - LOG.info("Not copying %s because it is not a 
PAGE-XML file and copy_files was false" % input_file.local_filename) - else: - LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) - with open(input_file.local_filename, 'rb') as f: - content = f.read() - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=input_file.mimetype, - local_filename=local_filename, - content=content) - if input_file.mimetype.startswith('image/'): - # write out the PAGE-XML representation for this image - page_file_id = file_id + '_PAGE' - pcgts.set_pcGtsId(page_file_id) - pcgts.get_Page().set_imageFilename(local_filename if copy_files else input_file.local_filename) - page_filename = join(self.output_file_grp, file_id + '.xml') - LOG.info("Add PAGE-XML %s generated for %s at %s", page_file_id, file_id, page_filename) - self.workspace.add_file( - file_id=page_file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=page_filename, - content=to_xml(pcgts).encode('utf-8')) + content=content) + file_id = file_id + '_PAGE' + pcgts = page_from_file(output_file) + pcgts = self.process_page_pcgts(pcgts) + pcgts.set_pcGtsId(file_id) + self.add_metadata(pcgts) + LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file) + self.workspace.add_file(file_id=file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=join(self.output_file_grp, file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(pcgts)) + else: + if self.parameter['copy_files']: + LOG.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) + else: + LOG.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename) + # we can rely on base implementation verbatim + super().process_page_file(input_file) def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dummy'] From e5d4736fd73f1e6a765141a7679a710de6009c7f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:18:06 +0200 Subject: [PATCH 009/249] =?UTF-8?q?run=5Fprocessor:=20adapt=20to=20process?= =?UTF-8?q?=E2=86=92process=5Fworkspace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/ocrd/processor/helpers.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 9b74671ca..b4b798706 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -83,7 +83,6 @@ def run_processor( log = getLogger('ocrd.processor.helpers.run_processor') log.debug("Running processor %s", processorClass) - old_cwd = getcwd() processor = get_processor( processor_class=processorClass, parameter=parameter, @@ -93,8 +92,6 @@ def run_processor( output_file_grp=output_file_grp, instance_caching=instance_caching ) - processor.workspace = workspace - chdir(processor.workspace.directory) ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) @@ -107,7 +104,7 @@ def run_processor( backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage try: - mem_usage = memory_usage(proc=processor.process, + mem_usage = memory_usage(proc=processor.process_workspace(workspace), # only run process once max_iterations=1, interval=.1, timeout=None, timestamps=True, @@ -118,8 +115,6 @@ def run_processor( except 
Exception as err: log.exception("Failure in processor '%s'" % ocrd_tool['executable']) raise err - finally: - chdir(old_cwd) mem_usage_values = [mem for mem, _ in mem_usage] mem_output = 'memory consumption: ' mem_output += sparkline(mem_usage_values) @@ -127,12 +122,10 @@ def run_processor( logProfile.info(mem_output) else: try: - processor.process() + processor.process_workspace(workspace) except Exception as err: log.exception("Failure in processor '%s'" % ocrd_tool['executable']) raise err - finally: - chdir(old_cwd) t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu From 809a01b452069f6524c894f1cc0360e8dc5a1edf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:21:12 +0200 Subject: [PATCH 010/249] test DummyProcessor: adapt to new `download` default by setting `download_files=False` in tests (because they are not actually in the filesystem) --- tests/data/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 93a2ea49a..113305e2b 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -21,6 +21,7 @@ class DummyProcessor(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = DUMMY_TOOL kwargs['version'] = '0.0.1' + kwargs['download_files'] = False super(DummyProcessor, self).__init__(*args, **kwargs) def process(self): @@ -37,6 +38,7 @@ def __init__(self, *args, **kwargs): 'i-am-required': {'required': True} } } + kwargs['download_files'] = False super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs) class DummyProcessorWithOutput(Processor): @@ -44,6 +46,7 @@ class DummyProcessorWithOutput(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = DUMMY_TOOL kwargs['version'] = '0.0.1' + kwargs['download_files'] = False super().__init__(*args, **kwargs) def process(self): From dfe7f8ef223e8ebcb6baae35efce702b3166bd64 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:25:33 +0200 Subject: [PATCH 011/249] test DummyProcessor: override process_workspace() by delegating to process() directly --- tests/data/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 113305e2b..d1edd2296 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -27,6 +27,10 @@ def __init__(self, *args, **kwargs): def process(self): print(json.dumps(self.parameter)) + # override to prevent iterating over empty files + def process_workspace(self, workspace): + self.process() + class DummyProcessorWithRequiredParameters(Processor): def process(self): pass def __init__(self, *args, **kwargs): From 1550668518923203646de04bd8ffce8ec143a2ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:27:17 +0200 Subject: [PATCH 012/249] test builtin ocrd-dummy: adapt to consistent filename --- tests/processor/test_ocrd_dummy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/processor/test_ocrd_dummy.py b/tests/processor/test_ocrd_dummy.py index 41b585c6b..b85379e47 100644 --- a/tests/processor/test_ocrd_dummy.py +++ b/tests/processor/test_ocrd_dummy.py @@ -33,7 +33,7 @@ def test_copies_ok(self): output_files = workspace.mets.find_all_files(fileGrp='OUTPUT') output_files.sort(key=lambda x: x.url) assert output_files[0].local_filename == 'OUTPUT/OUTPUT_PHYS_0001.tif' - assert output_files[1].local_filename == 'OUTPUT/OUTPUT_PHYS_0001.xml' + assert output_files[1].local_filename == 'OUTPUT/OUTPUT_PHYS_0001_PAGE.xml' 
self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID) assert page_from_file(output_files[1]).get_Page().imageFilename == str(output_files[0].local_filename) self.assertEqual(len(output_files), 6) From 75809b1949dfc5385a7c5156bbae2aace0b77c94 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:31:58 +0200 Subject: [PATCH 013/249] test processor: adapt to `input_file_grp` required --- tests/processor/test_processor.py | 6 ++++-- tests/test_logging.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 784f68fc3..740846e89 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -112,7 +112,7 @@ def test_params(self): def test_run_agent(self): no_agents_before = len(self.workspace.mets.agents) - run_processor(DummyProcessor, workspace=self.workspace) + run_processor(DummyProcessor, workspace=self.workspace, input_file_grp="OCR-D-IMG") self.assertEqual(len(self.workspace.mets.agents), no_agents_before + 1, 'one more agent') # print(self.workspace.mets.agents[no_agents_before]) @@ -153,7 +153,9 @@ def test_run_output_overwrite(self): def test_run_cli(self): with TemporaryDirectory() as tempdir: - run_processor(DummyProcessor, workspace=self.workspace) + run_processor(DummyProcessor, workspace=self.workspace, + input_file_grp='OCR-D-IMG', + output_file_grp='OUTPUT') run_cli( 'echo', mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'), diff --git a/tests/test_logging.py b/tests/test_logging.py index 2e4e0861b..c2b6913b1 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -139,7 +139,7 @@ def testProcessorProfiling(self): getLogger('ocrd.process.profile').setLevel('DEBUG') getLogger('ocrd.process.profile').addHandler(ch) - run_processor(DummyProcessor, resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) + run_processor(DummyProcessor, input_file_grp='OCR-D-IMG', resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) log_contents = log_capture_string.getvalue() log_capture_string.close() From c429da5deeddc7400e5de83fc897c70700cdfd4a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:32:45 +0200 Subject: [PATCH 014/249] test processor: adapt to `self.workspace` only during run_processor --- tests/processor/test_processor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 740846e89..d65c5b3d4 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -54,12 +54,13 @@ def test_with_mets_url_input_files(self): input_file_grp='OCR-D-SEG-PAGE', resolver=self.resolver, workspace=self.workspace) + processor.workspace = self.workspace assert len(processor.input_files) == 2 assert [f.mimetype for f in processor.input_files] == [MIMETYPE_PAGE, MIMETYPE_PAGE] def test_parameter(self): with TemporaryDirectory(): - jsonpath = Path('params.json').name + jsonpath = 'params.json' with open(jsonpath, 'w') as f: f.write('{"baz": "quux"}') with open(jsonpath, 'r') as f: @@ -70,7 +71,7 @@ def test_parameter(self): resolver=self.resolver, workspace=self.workspace ) - self.assertEqual(len(processor.input_files), 3) + self.assertEqual(processor.parameter['baz'], 'quux') def test_verify(self): proc = DummyProcessor(self.workspace) From 295cdb63797bed56e2ae724ba9a8911454dca832 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky 
Date: Wed, 26 Jun 2024 14:02:55 +0200 Subject: [PATCH 015/249] Workspace.save_image_file: add kwarg file_path for predetermined local_filename --- src/ocrd/workspace.py | 20 ++++++++++++-------- tests/test_workspace.py | 11 +++++++---- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 8ce42a070..5b7db48c5 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1047,12 +1047,13 @@ def image_from_segment(self, segment, parent_image, parent_coords, return segment_image, segment_coords # pylint: disable=redefined-builtin - def save_image_file(self, image, - file_id, - file_grp, - page_id=None, - mimetype='image/png', - force=False): + def save_image_file(self, image : Image, + file_id : str, + file_grp : str, + file_path : Optional[str] = None, + page_id : Optional[str] = None, + mimetype : str = 'image/png', + force : bool = False) -> str: """Store an image in the filesystem and reference it as new file in the METS. Args: @@ -1060,12 +1061,14 @@ def save_image_file(self, image, file_id (string): `@ID` of the METS `file` to use file_grp (string): `@USE` of the METS `fileGrp` to use Keyword Args: + file_path (string): `@href` of the METS `file/FLocat` to use. page_id (string): `@ID` in the METS physical `structMap` to use mimetype (string): MIME type of the image format to serialize as force (boolean): whether to replace any existing `file` with that `@ID` Serialize the image into the filesystem, and add a `file` for it in the METS. - Use a filename extension based on ``mimetype``. + Use ``file_grp`` as directory and ``file_id`` concatenated with extension + based on ``mimetype`` as file name, unless directly passing ``file_path``. Returns: The (absolute) path of the created file. 
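For illustration, the new keyword can be exercised like this (a sketch assuming an open `workspace` and a PIL image `img`; the file IDs, fileGrp and path are made up):

    # default: the path is derived as file_grp/file_id plus the extension for mimetype
    workspace.save_image_file(img, 'OCR-D-BIN_0001', 'OCR-D-BIN',
                              page_id='PHYS_0001', mimetype='image/png')
    # new: honour a predetermined path, e.g. one already referenced in the PAGE output
    workspace.save_image_file(img, 'OCR-D-BIN_0001_IMG', 'OCR-D-BIN',
                              page_id='PHYS_0001', mimetype='image/png',
                              file_path='OCR-D-BIN/PHYS_0001.IMG-BIN.png')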
@@ -1075,7 +1078,8 @@ def save_image_file(self, image, force = True image_bytes = io.BytesIO() image.save(image_bytes, format=MIME_TO_PIL[mimetype]) - file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) + if file_path is None: + file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) out = self.add_file( file_grp, file_id=file_id, diff --git a/tests/test_workspace.py b/tests/test_workspace.py index c8df9b444..0f325f5ba 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -417,7 +417,7 @@ def test_save_image_file_invalid_mimetype_raises_exception(plain_workspace): # act raise with pytest.raises(KeyError) as key_exc: - plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'ceci/nest/pas/une/mimetype') + plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='ceci/nest/pas/une/mimetype') assert "'ceci/nest/pas/une/mimetype'" == str(key_exc.value) @@ -428,13 +428,16 @@ def test_save_image_file(plain_workspace): img = Image.new('RGB', (1000, 1000)) # act - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg') + assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') assert exists(join(plain_workspace.directory, 'IMG', 'page1_img.jpg')) # should succeed - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg', force=True) + assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg', force=True) # should also succeed plain_workspace.overwrite_mode = True - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg') + assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + # check file_path kwarg + assert plain_workspace.save_image_file(img, 'page1_img2', 'IMG', page_id='page1', file_path='IMG/page1_img2.png') + assert exists(join(plain_workspace.directory, 'IMG', 'page1_img2.png')) @pytest.fixture(name='workspace_kant_aufklaerung') From e2cbcb94eb5130bd2be937fa4d5fca119331e123 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 14:04:01 +0200 Subject: [PATCH 016/249] Processor.process_page_pcgts: add kwargs and allow returning derived images --- src/ocrd/processor/base.py | 25 +++++++++++++++---- src/ocrd/processor/builtin/dummy_processor.py | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 78bc47c47..ddbf32b02 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -276,27 +276,37 @@ def process_page_file(self, *input_files) -> None: """ log = getLogger('ocrd.processor.base') input_pcgts = [None] * len(input_files) + page_id = input_files[0].pageId for i, input_file in enumerate(input_files): # FIXME: what about non-PAGE input like image or JSON ??? 
log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: input_pcgts[i] = page_from_file(input_file) except ValueError as e: - log.info("non-PAGE input for page %s: %s", input_file.pageId, e) - output_pcgts = self.process_page_pcgts(*input_pcgts) + log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) + if isinstance(output_pcgts, (list, tuple)): + output_images = output_pcgts[1:] + output_pcgts = output_pcgts[0] + for output_image_pil, output_image_id, output_image_path in output_images: + self.workspace.save_image_file( + output_image_pil, + output_image_id, + self.output_file_grp, + page_id=page_id, + file_path=output_image_path) output_pcgts.set_pcGtsId(output_file_id) self.add_metadata(output_pcgts) - # FIXME: what about save_image_file in process_page ??? # FIXME: what about non-PAGE output like JSON ??? self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, - page_id=input_files[0].pageId, + page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(output_pcgts)) - def process_page_pcgts(self, *input_pcgts) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts, output_file_id : str = None, page_id : str = None) -> OcrdPage: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed @@ -304,6 +314,11 @@ def process_page_pcgts(self, *input_pcgts) -> OcrdPage: under the given :py:attr:`parameter`, and return the resulting :py:class:`~ocrd_models.OcrdPage`. + Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage` + and one or more lists or tuples of :py:class:`PIL.Image` (image data), + :py:class:str (file ID) and :py:class:str (file path) of derived images + to be annotated along with the resulting PAGE file. + (This contains the main functionality and must be overridden by subclasses.) 
""" raise NotImplementedError() diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index d16e18271..9916d70ae 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -24,7 +24,7 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts): + def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None): # nothing to do here return input_pcgts[0] From 20a6a1cda0af286e7832595d6161bba13492bd4d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 23:44:37 +0200 Subject: [PATCH 017/249] Workspace.save_image_file: save DPI metadata, too --- src/ocrd/workspace.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 5b7db48c5..4a7eea432 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1076,8 +1076,11 @@ def save_image_file(self, image : Image, log = getLogger('ocrd.workspace.save_image_file') if self.overwrite_mode: force = True + saveargs = dict() + if 'dpi' in image.info: + saveargs['dpi'] = image.info['dpi'] image_bytes = io.BytesIO() - image.save(image_bytes, format=MIME_TO_PIL[mimetype]) + image.save(image_bytes, format=MIME_TO_PIL[mimetype], **saveargs) if file_path is None: file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) out = self.add_file( From 679ad85f6191e1529c4e739ddead15724be84134 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 23:46:31 +0200 Subject: [PATCH 018/249] Workspace.image_from_*: annotate 'DPI' in result dict and ensure it's used in meta-data of resulting image --- src/ocrd/workspace.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 4a7eea432..bd9e4c502 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -631,6 +631,7 @@ def image_from_page(self, page, page_id, i.e. after cropping to the page's border / bounding box (if any) and deskewing with the page's orientation angle (if any) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of the original image, - `"features"`: the `AlternativeImage` `@comments` for the image, i.e. 
names of all applied operations that lead up to this result, * an :py:class:`ocrd_models.ocrd_exif.OcrdExif` instance associated with @@ -672,6 +673,13 @@ def image_from_page(self, page, page_id, page_coords['angle'] = 0 # nothing applied yet (depends on filters) log.debug("page '%s' has %s orientation=%d skew=%.2f", page_id, "border," if border else "", orientation, skew) + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + dpi = int(dpi) + log.debug("page '%s' images will use %d DPI from image meta-data", page_id, dpi) + page_coords['DPI'] = dpi # initialize AlternativeImage@comments classes as empty: page_coords['features'] = '' @@ -790,6 +798,11 @@ def image_from_page(self, page, page_id, 'filter="%s" in page "%s"' % ( feature_filter, page_id)) page_image.format = 'PNG' # workaround for tesserocr#194 + # ensure DPI will be set in image meta-data again + if 'DPI' in page_coords: + dpi = page_coords['DPI'] + if 'dpi' not in page_image.info: + page_image.info['dpi'] = (dpi, dpi) return page_image, page_coords, page_image_info def image_from_segment(self, segment, parent_image, parent_coords, @@ -810,6 +823,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, converts from absolute coordinates to those relative to the image, i.e. after applying all operations (starting with the original image) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of the parent image, - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e. names of all operations that lead up to this result, and Keyword Args: @@ -875,6 +889,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, the segment's bounding box, and deskewing with the segment's orientation angle (if any) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of this image, - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e. names of all applied operations that lead up to this result. 
@@ -937,6 +952,8 @@ def image_from_segment(self, segment, parent_image, parent_coords, orientation = 0 skew = 0 segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters) + if 'DPI' in parent_coords: + segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: @@ -1044,6 +1061,11 @@ def image_from_segment(self, segment, parent_image, parent_coords, 'filter="%s" in segment "%s"' % ( feature_filter, segment.id)) segment_image.format = 'PNG' # workaround for tesserocr#194 + # ensure DPI will be set in image meta-data again + if 'DPI' in segment_coords: + dpi = segment_coords['DPI'] + if 'dpi' not in segment_image.info: + segment_image.info['dpi'] = (dpi, dpi) return segment_image, segment_coords # pylint: disable=redefined-builtin From 565a3d9806793cede166c8dd2d342a35e294e1db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 23:47:44 +0200 Subject: [PATCH 019/249] test_workspace: adapt to image_from_* DPI and add assertions --- tests/test_workspace.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 0f325f5ba..2fe5f450a 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -487,8 +487,10 @@ def test_image_from_page_basic(workspace_gutachten_data): pcgts = parseString(f.read().encode('utf8'), silence=True) # act + assert - _, info, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017', feature_selector='clipped', feature_filter='cropped') - assert info['features'] == 'binarized,clipped' + img, coords, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017', feature_selector='clipped', feature_filter='cropped') + assert coords['features'] == 'binarized,clipped' + assert isinstance(img.info.get('dpi', None), tuple) + assert img.info['dpi'][0] == coords['DPI'] _, info, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017') assert info['features'] == 'binarized,clipped' @@ -529,6 +531,7 @@ def test_deskewing(plain_workspace): skew = 4.625 image = Image.new('L', size) image = polygon_mask(image, poly) + image.info['dpi'] = (300, 300) #image.show(title='image') pixels = np.count_nonzero(np.array(image) > 0) name = 'foo0' @@ -539,9 +542,12 @@ def test_deskewing(plain_workspace): Coords=CoordsType(points=points_from_polygon(poly)), orientation=-skew) page.add_TextRegion(region) - page_image, page_coords, _ = plain_workspace.image_from_page(page, '') + page_image, page_coords, page_info = plain_workspace.image_from_page(page, '') #page_image.show(title='page_image') assert list(image.getdata()) == list(page_image.getdata()) + assert 'dpi' in page_image.info + assert round(page_image.info['dpi'][0]) == 300 + assert page_coords['DPI'] == 300 assert np.all(page_coords['transform'] == np.eye(3)) reg_image, reg_coords = plain_workspace.image_from_segment(region, page_image, page_coords, feature_filter='deskewed', fill=0) @@ -550,6 +556,7 @@ def test_deskewing(plain_workspace): assert reg_image.height == xywh['h'] == 335 assert reg_coords['transform'][0, 2] == -xywh['x'] assert reg_coords['transform'][1, 2] == -xywh['y'] + assert round(reg_image.info['dpi'][0]) == 300 # same fg after cropping to minimal bbox reg_pixels = np.count_nonzero(np.array(reg_image) > 0) assert pixels == reg_pixels @@ -561,6 +568,7 @@ def 
test_deskewing(plain_workspace): assert reg_coords['transform'][0, 1] != 0 assert reg_coords['transform'][1, 0] != 0 assert 'deskewed' in reg_coords['features'] + assert round(reg_image.info['dpi'][0]) == 300 # same fg after cropping to minimal bbox (roughly - due to aliasing) reg_pixels = np.count_nonzero(np.array(reg_image) > 0) assert np.abs(pixels - reg_pixels) / pixels < 0.005 @@ -582,6 +590,7 @@ def test_deskewing(plain_workspace): assert reg_image2.height == reg_image.height assert np.allclose(reg_coords2['transform'], reg_coords['transform']) assert reg_coords2['features'] == reg_coords['features'] + assert round(reg_image2.info['dpi'][0]) == 300 # same fg after cropping to minimal bbox (roughly - due to aliasing) reg_pixels2 = np.count_nonzero(np.array(reg_image) > 0) assert reg_pixels2 == reg_pixels From 46f81aa75e42c692742b0e98de248e9ee44bfbfd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 6 Jul 2024 18:57:39 +0200 Subject: [PATCH 020/249] autoload ocrd-tool.json and version from dist, executable name from entry point in stack --- src/ocrd/processor/base.py | 56 ++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index ddbf32b02..a572b26ce 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -17,6 +17,7 @@ from pathlib import Path from typing import Optional import sys +import inspect import tarfile import io from deprecated import deprecated @@ -33,7 +34,9 @@ list_all_resources, get_processor_resource_types, resource_filename, + resource_string, make_file_id, + deprecation_warning ) from ocrd_validators import ParameterValidator from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml @@ -65,6 +68,38 @@ class Processor(): a number of optional or mandatory parameters. """ + @property + def metadata(self): + """the ocrd-tool.json dict of the package""" + if hasattr(self, '_metadata'): + return self._metadata + self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json')) + return self._metadata + + @property + def version(self): + """the version of the package""" + if hasattr(self, '_version'): + return self._version + self._version = self.metadata['version'] + return self._version + + @property + def executable(self): + """the executable name of this processor tool""" + if hasattr(self, '_executable'): + return self._executable + self._executable = os.path.basename(inspect.stack()[-1].filename) + return self._executable + + @property + def ocrd_tool(self): + """the ocrd-tool.json dict of this processor tool""" + if hasattr(self, '_ocrd_tool'): + return self._ocrd_tool + self._ocrd_tool = self.metadata['tools'][self.executable] + return self._ocrd_tool + def __init__( self, # FIXME: deprecate in favor of process_workspace(workspace) @@ -97,8 +132,6 @@ def __init__( Can be ``None`` even for processing (esp. on multiple workspaces), \ but then needs to be set before running. Keyword Args: - ocrd_tool (string): JSON of the ocrd-tool description for that processor. \ - Can be ``None`` for processing, but needs to be set before running. parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. 
@@ -123,11 +156,17 @@ def __init__( dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \ on stdout. """ - self.ocrd_tool = ocrd_tool - if parameter is None: - parameter = {} + if ocrd_tool is not None: + deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " + "use or override metadata/executable/ocrd-tool properties instead") + self._ocrd_tool = ocrd_tool + self._executable = ocrd_tool['executable'] + if version is not None: + deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - " + "use or override metadata/version properties instead") + self._version = version if dump_json: - print(json.dumps(ocrd_tool, indent=True)) + print(json.dumps(self.ocrd_tool, indent=True)) return if dump_module_dir: print(self.moduledir) @@ -156,7 +195,6 @@ def __init__( if show_help: self.show_help(subcommand=subcommand) return - self.version = version if show_version: self.show_version() return @@ -172,7 +210,9 @@ def __init__( self.output_file_grp = output_file_grp self.page_id = None if page_id == [] or page_id is None else page_id self.download = download_files - parameterValidator = ParameterValidator(ocrd_tool) + if parameter is None: + parameter = {} + parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(parameter) if not report.is_valid: raise Exception("Invalid parameters %s" % report.errors) From 4dd83aaa25f3f13660700a258ab5abfa1887c2cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 6 Jul 2024 19:00:06 +0200 Subject: [PATCH 021/249] adapt to new Processor init (override metadata/version/executable name) --- src/ocrd/cli/bashlib.py | 16 ++++++--- src/ocrd/cli/ocrd_tool.py | 36 ++++++++++++++----- src/ocrd/processor/builtin/dummy_processor.py | 15 +++++--- 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 1def4638c..8b79d82fb 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -108,11 +108,17 @@ def bashlib_input_files(**kwargs): raise FileNotFoundError(msg) resolver = Resolver() workspace = resolver.workspace_from_url(mets, working_dir) - processor = Processor(workspace, - ocrd_tool=None, - page_id=kwargs['page_id'], - input_file_grp=kwargs['input_file_grp'], - output_file_grp=kwargs['output_file_grp']) + class BashlibProcessor(Processor): + @property + def ocrd_tool(self): + return {} + @property + def executable(self): + return '' + processor = BashlibProcessor(workspace, + page_id=kwargs['page_id'], + input_file_grp=kwargs['input_file_grp'], + output_file_grp=kwargs['output_file_grp']) for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): # ensure all input files exist locally (without persisting them in the METS) # - this mimics the default behaviour of all Pythonic processors diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index 2a7fa99ec..b9807b0d7 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -100,10 +100,15 @@ def ocrd_tool_tool_description(ctx): def ocrd_tool_tool_list_resources(ctx): class BashProcessor(Processor): @property + def metadata(self): + return ctx.json + @property + def executable(self): + return ctx.tool_name + @property def moduledir(self): return os.path.dirname(ctx.filename) - BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], - list_resources=True) + BashProcessor(None, list_resources=True) @ocrd_tool_tool.command('resolve-resource', help="Get 
a tool's file resource full path name") @click.argument('res_name') @@ -111,10 +116,15 @@ def moduledir(self): def ocrd_tool_tool_resolve_resource(ctx, res_name): class BashProcessor(Processor): @property + def metadata(self): + return ctx.json + @property + def executable(self): + return ctx.tool_name + @property def moduledir(self): return os.path.dirname(ctx.filename) - BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], - resolve_resource=res_name) + BashProcessor(None, resolve_resource=res_name) @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource") @click.argument('res_name') @@ -122,24 +132,34 @@ def moduledir(self): def ocrd_tool_tool_show_resource(ctx, res_name): class BashProcessor(Processor): @property + def metadata(self): + return ctx.json + @property + def executable(self): + return ctx.tool_name + @property def moduledir(self): return os.path.dirname(ctx.filename) - BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], - show_resource=res_name) + BashProcessor(None, show_resource=res_name) @ocrd_tool_tool.command('help', help="Generate help for processors") @click.argument('subcommand', required=False) @pass_ocrd_tool def ocrd_tool_tool_params_help(ctx, subcommand): class BashProcessor(Processor): + @property + def metadata(self): + return ctx.json + @property + def executable(self): + return ctx.tool_name # set docstrings to empty __doc__ = None # HACK: override the module-level docstring, too getmodule(OcrdToolCtx).__doc__ = None def process(self): return super() - BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], - show_help=True, subcommand=subcommand) + BashProcessor(None, show_help=True, subcommand=subcommand) # ---------------------------------------------------------------------- # ocrd ocrd-tool tool categories diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 9916d70ae..424c05772 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -67,10 +67,17 @@ def process_page_file(self, *input_files): # we can rely on base implementation verbatim super().process_page_file(input_file) - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dummy'] - kwargs['version'] = '0.0.3' - super(DummyProcessor, self).__init__(*args, **kwargs) + @property + def metadata(self): + return OCRD_TOOL + + @property + def executable(self): + return 'ocrd-dummy' + + @property + def version(self): + return '0.0.3' def setup(self): super().setup() From 4cafbcc88f7ce1a1e89da3fd327746932632b687 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 6 Jul 2024 19:00:36 +0200 Subject: [PATCH 022/249] tests: adapt to new Processor init (override metadata/version/executable name) --- tests/data/__init__.py | 52 ++++++++++++++++++++++++------- tests/processor/test_processor.py | 21 ++++++++++--- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index d1edd2296..ff403ebef 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -17,12 +17,21 @@ } class DummyProcessor(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = DUMMY_TOOL - kwargs['version'] = '0.0.1' kwargs['download_files'] = False - super(DummyProcessor, self).__init__(*args, 
**kwargs) + super().__init__(*args, **kwargs) def process(self): print(json.dumps(self.parameter)) @@ -32,24 +41,43 @@ def process_workspace(self, workspace): self.process() class DummyProcessorWithRequiredParameters(Processor): - def process(self): pass - def __init__(self, *args, **kwargs): - kwargs['version'] = '0.0.1' - kwargs['ocrd_tool'] = { + @property + def ocrd_tool(self): + return { 'executable': 'ocrd-test', 'steps': ['recognition/post-correction'], 'parameters': { 'i-am-required': {'required': True} } } + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): kwargs['download_files'] = False - super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) + + def process(self): pass class DummyProcessorWithOutput(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = DUMMY_TOOL - kwargs['version'] = '0.0.1' kwargs['download_files'] = False super().__init__(*args, **kwargs) @@ -67,6 +95,8 @@ def process(self): content='CONTENT') class IncompleteProcessor(Processor): - pass + @property + def ocrd_tool(self): + return {} diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index d65c5b3d4..e0ebfbb1d 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -108,7 +108,11 @@ def test_params_preset_resolve(self): overwrite=True) def test_params(self): - proc = Processor(workspace=self.workspace) + class ParamTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} + proc = ParamTestProcessor(self.workspace) self.assertEqual(proc.parameter, {}) def test_run_agent(self): @@ -176,7 +180,10 @@ def test_run_cli(self): ) def test_zip_input_files(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') @@ -196,7 +203,10 @@ class ZipTestProcessor(Processor): pass assert ('foobar3', 'foobar4') in tuples def test_zip_input_files_multi_mixed(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') @@ -234,7 +244,10 @@ class ZipTestProcessor(Processor): pass tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} self.capture_out_err() with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) From 9c9a4c92258f76a146dc7d96fb262063649e2457 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 29 Jul 2024 13:18:36 +0200 Subject: [PATCH 023/249] generate_processor_help: include process_workspace docstring, too --- src/ocrd/processor/helpers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index b4b798706..d94bec124 100644 --- 
a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -230,6 +230,8 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n' if processor_instance.__doc__: doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n' + if processor_instance.process_workspace.__doc__: + doc_help += '\n' + inspect.cleandoc(processor_instance.process_workspace.__doc__) + '\n' if processor_instance.process.__doc__: doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n' if doc_help: From aa0bd68dc20e601e34f659c51b542308720b52c1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 8 Aug 2024 17:55:56 +0200 Subject: [PATCH 024/249] get_processor: also run setup if instance_caching --- src/ocrd/processor/helpers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index d94bec124..e1de22770 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -369,7 +369,9 @@ def get_cached_processor(parameter: dict, processor_class): """ if processor_class: dict_params = dict(parameter) if parameter else None - return processor_class(workspace=None, parameter=dict_params) + processor = processor_class(workspace=None, parameter=dict_params) + processor.setup() + return processor return None From 99d16281d63a3c641a26e01979b8a93841107bef Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 10:12:05 +0200 Subject: [PATCH 025/249] ocrd-tool CLI: pass class in context --- src/ocrd/cli/ocrd_tool.py | 70 +++++++++++++-------------------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index b9807b0d7..dacefab00 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -31,6 +31,25 @@ def __init__(self, filename): self.content = f.read() self.json = loads(self.content) + class BashProcessor(Processor): + @property + def metadata(inner_self): + return self.json + @property + def executable(inner_self): + return self.tool_name + @property + def moduledir(inner_self): + return os.path.dirname(self.filename) + # set docstrings to empty + __doc__ = None + # HACK: override the module-level docstring, too + getmodule(OcrdToolCtx).__doc__ = None + def process(inner_self): + return super() + + self.processor = BashProcessor + pass_ocrd_tool = click.make_pass_decorator(OcrdToolCtx) # ---------------------------------------------------------------------- @@ -98,68 +117,25 @@ def ocrd_tool_tool_description(ctx): @ocrd_tool_tool.command('list-resources', help="List tool's file resources") @pass_ocrd_tool def ocrd_tool_tool_list_resources(ctx): - class BashProcessor(Processor): - @property - def metadata(self): - return ctx.json - @property - def executable(self): - return ctx.tool_name - @property - def moduledir(self): - return os.path.dirname(ctx.filename) - BashProcessor(None, list_resources=True) + ctx.processor(None, list_resources=True) @ocrd_tool_tool.command('resolve-resource', help="Get a tool's file resource full path name") @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_resolve_resource(ctx, res_name): - class BashProcessor(Processor): - @property - def metadata(self): - return ctx.json - @property - def executable(self): - return ctx.tool_name - @property - def moduledir(self): - return os.path.dirname(ctx.filename) - 
BashProcessor(None, resolve_resource=res_name) + ctx.processor(None, resolve_resource=res_name) @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource") @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_show_resource(ctx, res_name): - class BashProcessor(Processor): - @property - def metadata(self): - return ctx.json - @property - def executable(self): - return ctx.tool_name - @property - def moduledir(self): - return os.path.dirname(ctx.filename) - BashProcessor(None, show_resource=res_name) + ctx.processor(None, show_resource=res_name) @ocrd_tool_tool.command('help', help="Generate help for processors") @click.argument('subcommand', required=False) @pass_ocrd_tool def ocrd_tool_tool_params_help(ctx, subcommand): - class BashProcessor(Processor): - @property - def metadata(self): - return ctx.json - @property - def executable(self): - return ctx.tool_name - # set docstrings to empty - __doc__ = None - # HACK: override the module-level docstring, too - getmodule(OcrdToolCtx).__doc__ = None - def process(self): - return super() - BashProcessor(None, show_help=True, subcommand=subcommand) + ctx.processor(None, show_help=True, subcommand=subcommand) # ---------------------------------------------------------------------- # ocrd ocrd-tool tool categories From 12231b8ee8e581c071d2ffb7d10a3b261a4369ed Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:26:59 +0200 Subject: [PATCH 026/249] use more specific exception if parameters are invalid Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index a572b26ce..5860a2861 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -215,9 +215,9 @@ def __init__( parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(parameter) if not report.is_valid: - raise Exception("Invalid parameters %s" % report.errors) + raise ValueError("Invalid parameters %s" % report.errors) self.parameter = parameter - # workaround for deprecated#72 (deprecation does not work for subclasses): + # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) From d112f8ffb4885287cc35761805621a9c4eb0592a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:31:14 +0200 Subject: [PATCH 027/249] run_processor w/ mem_usage: pass as args tuple Co-authored-by: Konstantin Baierer --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index e1de22770..d9edaaa25 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -104,7 +104,7 @@ def run_processor( backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage try: - mem_usage = memory_usage(proc=processor.process_workspace(workspace), + mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {})), # only run process once max_iterations=1, interval=.1, timeout=None, timestamps=True, From 319ceaa4e56c11c9e21f6b0e9c872d5a6a09e039 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 11:22:09 
+0200 Subject: [PATCH 028/249] Processor.process_workspace: add fileGrp assertions --- src/ocrd/processor/base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5860a2861..f1ecd8def 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -246,12 +246,12 @@ def setup(self) -> None: @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') def process(self) -> None: """ - Process all files of the :py:attr:`workspace` + Process all files of the :py:attr:`workspace` from the given :py:attr:`input_file_grp` to the given :py:attr:`output_file_grp` for the given :py:attr:`page_id` (or all pages) under the given :py:attr:`parameter`. - + (This contains the main functionality and needs to be overridden by subclasses.) """ raise NotImplementedError() @@ -267,11 +267,11 @@ def process_workspace(self, workspace: Workspace) -> None: (This will iterate over pages and files, calling :py:meth:`process_page`, handling exceptions.) """ - # assert self.input_file_grp is not None - # assert self.output_file_grp is not None - # input_file_grps = self.input_file_grp.split(',') - # for input_file_grp in input_file_grps: - # assert input_file_grp in workspace.mets.file_groups + assert self.input_file_grp is not None + assert self.output_file_grp is not None + input_file_grps = self.input_file_grp.split(',') + for input_file_grp in input_file_grps: + assert input_file_grp in workspace.mets.file_groups log = getLogger('ocrd.processor.base') with pushd_popd(workspace.directory): self.workspace = workspace From 80590a9b8ce0804b0bc73a47f9424967dc3d39b8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 12 Aug 2024 12:52:40 +0200 Subject: [PATCH 029/249] process_page_pcgts: add (variadic) type checks Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index f1ecd8def..170b1643a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -346,7 +346,7 @@ def process_page_file(self, *input_files) -> None: mimetype=MIMETYPE_PAGE, content=to_xml(output_pcgts)) - def process_page_pcgts(self, *input_pcgts, output_file_id : str = None, page_id : str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed From 68ae8ff382adf4bf6662f29829ac2d96989e628d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:34:39 +0200 Subject: [PATCH 030/249] run_processor: fix typo --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index d9edaaa25..92846a6f0 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -104,7 +104,7 @@ def run_processor( backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage try: - mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {})), + mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}), # only run process once max_iterations=1, interval=.1, timeout=None, timestamps=True, 
From 2a18883d7883c8897eabb3242edceb282a4db673 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:36:30 +0200 Subject: [PATCH 031/249] Processor init: deprecate passing workspace --- src/ocrd/processor/base.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 170b1643a..3cf132278 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -129,8 +129,8 @@ def __init__( Args: workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \ - Can be ``None`` even for processing (esp. on multiple workspaces), \ - but then needs to be set before running. + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. Keyword Args: parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. @@ -200,10 +200,8 @@ def __init__( return self.workspace = workspace if self.workspace: - # FIXME deprecate setting this and calling process() over using process_workspace() - # which uses pushd_popd(self.workspace.directory) - # (because there is no way to do that in process() since it's an - # overridden method. chdir is almost always an anti-pattern.) + deprecation_warning("Passing a workspace argument other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") self.old_pwd = getcwd() os.chdir(self.workspace.directory) self.input_file_grp = input_file_grp From b9338b4c8418a0a572358fe265ada7e3cb7dcff1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:37:08 +0200 Subject: [PATCH 032/249] docs: fix relative VERSION path --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 3ab2e1826..f1f8f5e55 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ # import os # import sys # # sys.path.insert(0, os.path.abspath('..')) -with open('VERSION', encoding='utf-8') as f: +with open('../VERSION', encoding='utf-8') as f: VERSION = f.read() From 6ca6a4086b786fdd6bfbf60fb3b8ae5c21b398e3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:37:29 +0200 Subject: [PATCH 033/249] docs: do/not exclude tests/src --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index f1f8f5e55..917c5c62c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -72,7 +72,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path . -exclude_patterns = [u'build', 'Thumbs.db', '.DS_Store', 'src', 'venv'] +exclude_patterns = [u'build', 'Thumbs.db', '.DS_Store', 'tests', 'venv'] # The name of the Pygments (syntax highlighting) style to use. 
pygments_style = 'sphinx' From bc9ec057639df7e85b4748735f451d64ce1dd836 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:37:44 +0200 Subject: [PATCH 034/249] docs: add ocrd_network module --- docs/index.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 96a4e9836..67bba66fe 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,9 +7,10 @@ OCR-D/core ocrd ocrd_utils + ocrd_modelfactory ocrd_models ocrd_validators - ocrd_modelfactory + ocrd_network Indices and tables From 54f1d88e1a233e7b93db5356332aac21389004d1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:43:21 +0200 Subject: [PATCH 035/249] docs:regenerated rST --- docs/api/ocrd_network/ocrd_network.deployer.rst | 7 ------- .../ocrd_network.deployment_utils.rst | 7 ------- docs/api/ocrd_network/ocrd_network.logging.rst | 7 ------- .../ocrd_network/ocrd_network.logging_utils.rst | 7 +++++++ .../ocrd_network.rabbitmq_utils.helpers.rst | 7 +++++++ .../ocrd_network.rabbitmq_utils.rst | 1 + docs/api/ocrd_network/ocrd_network.rst | 7 +++---- .../ocrd_network.runtime_data.config_parser.rst | 7 +++++++ ..._network.runtime_data.connection_clients.rst | 7 +++++++ .../ocrd_network.runtime_data.deployer.rst | 7 +++++++ .../ocrd_network.runtime_data.hosts.rst | 7 +++++++ ...ocrd_network.runtime_data.network_agents.rst | 7 +++++++ ...rd_network.runtime_data.network_services.rst | 7 +++++++ .../ocrd_network/ocrd_network.runtime_data.rst | 17 +++++++++++++++-- .../ocrd_network.tcp_to_uds_mets_proxy.rst | 7 +++++++ 15 files changed, 82 insertions(+), 27 deletions(-) delete mode 100644 docs/api/ocrd_network/ocrd_network.deployer.rst delete mode 100644 docs/api/ocrd_network/ocrd_network.deployment_utils.rst delete mode 100644 docs/api/ocrd_network/ocrd_network.logging.rst create mode 100644 docs/api/ocrd_network/ocrd_network.logging_utils.rst create mode 100644 docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst create mode 100644 docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst diff --git a/docs/api/ocrd_network/ocrd_network.deployer.rst b/docs/api/ocrd_network/ocrd_network.deployer.rst deleted file mode 100644 index 205a331ba..000000000 --- a/docs/api/ocrd_network/ocrd_network.deployer.rst +++ /dev/null @@ -1,7 +0,0 @@ -ocrd\_network.deployer module -============================= - -.. automodule:: ocrd_network.deployer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.deployment_utils.rst b/docs/api/ocrd_network/ocrd_network.deployment_utils.rst deleted file mode 100644 index cc1f315ac..000000000 --- a/docs/api/ocrd_network/ocrd_network.deployment_utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -ocrd\_network.deployment\_utils module -====================================== - -.. 
automodule:: ocrd_network.deployment_utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.logging.rst b/docs/api/ocrd_network/ocrd_network.logging.rst deleted file mode 100644 index d2ac721d1..000000000 --- a/docs/api/ocrd_network/ocrd_network.logging.rst +++ /dev/null @@ -1,7 +0,0 @@ -ocrd\_network.logging module -============================ - -.. automodule:: ocrd_network.logging - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.logging_utils.rst b/docs/api/ocrd_network/ocrd_network.logging_utils.rst new file mode 100644 index 000000000..561ce0019 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.logging_utils.rst @@ -0,0 +1,7 @@ +ocrd\_network.logging\_utils module +=================================== + +.. automodule:: ocrd_network.logging_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst new file mode 100644 index 000000000..e13ff897a --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst @@ -0,0 +1,7 @@ +ocrd\_network.rabbitmq\_utils.helpers module +============================================ + +.. automodule:: ocrd_network.rabbitmq_utils.helpers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst index 36b581a33..63fd6f89a 100644 --- a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst +++ b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst @@ -15,5 +15,6 @@ Submodules ocrd_network.rabbitmq_utils.connector ocrd_network.rabbitmq_utils.constants ocrd_network.rabbitmq_utils.consumer + ocrd_network.rabbitmq_utils.helpers ocrd_network.rabbitmq_utils.ocrd_messages ocrd_network.rabbitmq_utils.publisher diff --git a/docs/api/ocrd_network/ocrd_network.rst b/docs/api/ocrd_network/ocrd_network.rst index ae12ae1f5..449770275 100644 --- a/docs/api/ocrd_network/ocrd_network.rst +++ b/docs/api/ocrd_network/ocrd_network.rst @@ -15,6 +15,7 @@ Subpackages ocrd_network.cli ocrd_network.models ocrd_network.rabbitmq_utils + ocrd_network.runtime_data Submodules ---------- @@ -25,15 +26,13 @@ Submodules ocrd_network.client ocrd_network.constants ocrd_network.database - ocrd_network.deployer - ocrd_network.deployment_utils - ocrd_network.logging + ocrd_network.logging_utils ocrd_network.param_validators ocrd_network.process_helpers ocrd_network.processing_server ocrd_network.processing_worker ocrd_network.processor_server - ocrd_network.runtime_data ocrd_network.server_cache ocrd_network.server_utils + ocrd_network.tcp_to_uds_mets_proxy ocrd_network.utils diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst new file mode 100644 index 000000000..e56ad31f8 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.config\_parser module +================================================= + +.. 
automodule:: ocrd_network.runtime_data.config_parser + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst new file mode 100644 index 000000000..2fd62e5ef --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.connection\_clients module +====================================================== + +.. automodule:: ocrd_network.runtime_data.connection_clients + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst new file mode 100644 index 000000000..62abe20db --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.deployer module +=========================================== + +.. automodule:: ocrd_network.runtime_data.deployer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst new file mode 100644 index 000000000..8f9001c38 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.hosts module +======================================== + +.. automodule:: ocrd_network.runtime_data.hosts + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst new file mode 100644 index 000000000..1a597caad --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.network\_agents module +================================================== + +.. automodule:: ocrd_network.runtime_data.network_agents + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst new file mode 100644 index 000000000..d72e67c9d --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.network\_services module +==================================================== + +.. automodule:: ocrd_network.runtime_data.network_services + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.rst index fefa00b49..cdf45f6b6 100644 --- a/docs/api/ocrd_network/ocrd_network.runtime_data.rst +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.rst @@ -1,7 +1,20 @@ -ocrd\_network.runtime\_data module -================================== +ocrd\_network.runtime\_data package +=================================== .. automodule:: ocrd_network.runtime_data :members: :undoc-members: :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + ocrd_network.runtime_data.config_parser + ocrd_network.runtime_data.connection_clients + ocrd_network.runtime_data.deployer + ocrd_network.runtime_data.hosts + ocrd_network.runtime_data.network_agents + ocrd_network.runtime_data.network_services diff --git a/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst b/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst new file mode 100644 index 000000000..fa6e607f9 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst @@ -0,0 +1,7 @@ +ocrd\_network.tcp\_to\_uds\_mets\_proxy module +============================================== + +.. automodule:: ocrd_network.tcp_to_uds_mets_proxy + :members: + :undoc-members: + :show-inheritance: From 67633f53f87725181ce14fbe5e97915b9d0faf2a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:12:49 +0200 Subject: [PATCH 036/249] test_mets_server: fix arg vs kwarg --- tests/test_mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index da0b95894..61752b6ed 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -52,10 +52,10 @@ def add_file_server(x): mets_server_url, i = x workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) workspace_server.add_file( + 'FOO', local_filename=f'local_filename{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{i}', - file_grp='FOO', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' ) From 751a1fe1bead708b6b184ca68ed361feef0f4d42 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:13:46 +0200 Subject: [PATCH 037/249] mets_server: ClientSideOcrdMets needs OcrdMets-like kwargs (without deprecation) --- src/ocrd/mets_server.py | 19 +++++++++---------- tests/test_mets_server.py | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index d7edec5ec..5131f3f05 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -247,11 +247,9 @@ def add_agent(self, *args, **kwargs): ).json() return OcrdAgentModel.create(**kwargs) - @deprecated_alias(ID="file_id") - @deprecated_alias(pageId="page_id") - @deprecated_alias(fileGrp="file_grp") def find_files(self, **kwargs): self.log.debug("find_files(%s)", kwargs) + # translate from native OcrdMets kwargs to OcrdMetsServer REST params if "pageId" in kwargs: kwargs["page_id"] = kwargs.pop("pageId") if "ID" in kwargs: @@ -277,14 +275,14 @@ def find_files(self, **kwargs): def find_all_files(self, *args, **kwargs): return list(self.find_files(*args, **kwargs)) - @deprecated_alias(pageId="page_id") - @deprecated_alias(ID="file_id") def add_file( - self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs + self, file_grp, content=None, ID=None, url=None, local_filename=None, mimetype=None, pageId=None, **kwargs ): data = OcrdFileModel.create( - file_id=file_id, file_grp=file_grp, page_id=page_id, mimetype=mimetype, url=url, - local_filename=local_filename + file_grp=file_grp, + # translate from native OcrdMets kwargs to OcrdMetsServer REST params + file_id=ID, page_id=pageId, + mimetype=mimetype, url=url, local_filename=local_filename ) if not self.multiplexing_mode: @@ -297,8 +295,9 @@ def add_file( raise RuntimeError(f"Add file failed: Msg: {r['error']}") return ClientSideOcrdFile( - None, ID=file_id, fileGrp=file_grp, url=url, pageId=page_id, mimetype=mimetype, - 
local_filename=local_filename
+            None, fileGrp=file_grp,
+            ID=ID, pageId=pageId,
+            url=url, mimetype=mimetype, local_filename=local_filename
         )

diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py
index 61752b6ed..b1350ed66 100644
--- a/tests/test_mets_server.py
+++ b/tests/test_mets_server.py
@@ -233,7 +233,7 @@ def test_reload(start_mets_server : Tuple[str, Workspace]):
     assert len(workspace_server.mets.find_all_files()) == 35, '35 files total'
     assert len(workspace_server_copy.mets.find_all_files()) == 35, '35 files total'

-    workspace_server_copy.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='mets.xml', pageId='foo')
+    workspace_server_copy.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='mets.xml', page_id='foo')

     assert len(workspace_server.mets.find_all_files()) == 35, '35 files total'
     assert len(workspace_server_copy.mets.find_all_files()) == 36, '36 files total'

From 86d956938068a2f5e9fdffc7d1fb81f9080b54ac Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Tue, 13 Aug 2024 23:27:39 +0200
Subject: =?UTF-8?q?Processor/CLI=20decorator:=20:fire:=20s?=
 =?UTF-8?q?eparate=20kwargs=20and=20constructor=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- `Processor.__init__`: remove non-processing kwargs
- `Processor.__init__`: deprecate processing kwargs (passing file groups etc.)
- `Processor`: define members for all non-processing calls
- `ocrd_cli_wrap_processor`: non-processing calls instead of init kwargs
- `run_processor` and `get_processor` and `cli.bashlib` and `cli.ocrd_tool`:
  always set processing attributes _after_ init
- `Processor.process_workspace`: delegate fileGrp checking to `verify` (still empty)
- `DummyProcessor.setup`: no more fileGrp assertions here (too early!)

(This is meant to ensure that existing processor implementations, i.e.
subclasses of `Processor`, do not call `setup` in the constructor anymore.
That way, v3.0 will stay backwards compatible in more respects, and thus
adopting it along the way will become easier.)
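For illustration (not part of this patch): under the new convention, a
processor is constructed without any processing context and configured
afterwards - roughly what `run_processor`/`get_processor` now do internally.
This is only a sketch; `MyProcessor` and the fileGrp names are hypothetical
placeholders:

    from ocrd import Processor, Resolver

    class MyProcessor(Processor):
        @property
        def ocrd_tool(self):
            return {}  # would normally be loaded from ocrd-tool.json
        @property
        def executable(self):
            return 'ocrd-my-processor'
        def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None):
            return input_pcgts[0]  # mere pass-through of the PAGE input

    workspace = Resolver().workspace_from_url('mets.xml')
    processor = MyProcessor(None, parameter={})  # no workspace kwarg anymore
    processor.setup()                            # explicit, after init
    processor.input_file_grp = 'OCR-D-IMG'       # processing attributes get
    processor.output_file_grp = 'OCR-D-COPY'     # assigned, not passed to init
    processor.page_id = None
    processor.process_workspace(workspace)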
--- src/ocrd/cli/bashlib.py | 10 +- src/ocrd/cli/ocrd_tool.py | 8 +- src/ocrd/decorators/__init__.py | 65 +++++---- src/ocrd/processor/base.py | 134 ++++++++---------- src/ocrd/processor/builtin/dummy_processor.py | 5 - src/ocrd/processor/helpers.py | 27 ++-- 6 files changed, 122 insertions(+), 127 deletions(-) diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 8b79d82fb..2c57bb412 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -115,10 +115,12 @@ def ocrd_tool(self): @property def executable(self): return '' - processor = BashlibProcessor(workspace, - page_id=kwargs['page_id'], - input_file_grp=kwargs['input_file_grp'], - output_file_grp=kwargs['output_file_grp']) + processor = BashlibProcessor(None) + # go half way of the normal run_processor / process_workspace call tree + processor.workspace = workspace + processor.page_id = kwargs['page_id'] + processor.input_file_grp = kwargs['input_file_grp'] + processor.output_file_grp = kwargs['output_file_grp'] for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): # ensure all input files exist locally (without persisting them in the METS) # - this mimics the default behaviour of all Pythonic processors diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index dacefab00..929fe47cc 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -117,25 +117,25 @@ def ocrd_tool_tool_description(ctx): @ocrd_tool_tool.command('list-resources', help="List tool's file resources") @pass_ocrd_tool def ocrd_tool_tool_list_resources(ctx): - ctx.processor(None, list_resources=True) + ctx.processor(None).list_resources() @ocrd_tool_tool.command('resolve-resource', help="Get a tool's file resource full path name") @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_resolve_resource(ctx, res_name): - ctx.processor(None, resolve_resource=res_name) + ctx.processor(None).resolve_resource(res_name) @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource") @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_show_resource(ctx, res_name): - ctx.processor(None, show_resource=res_name) + ctx.processor(None).show_resource(res_name) @ocrd_tool_tool.command('help', help="Generate help for processors") @click.argument('subcommand', required=False) @pass_ocrd_tool def ocrd_tool_tool_params_help(ctx, subcommand): - ctx.processor(None, show_help=True, subcommand=subcommand) + ctx.processor(None).show_help(subcommand=subcommand) # ---------------------------------------------------------------------- # ocrd ocrd-tool tool categories diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 3d0795702..d9d1fb69d 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + resolve_resource=None, show_resource=None, list_resources=False, # ocrd_network params start # @@ -47,20 +48,42 @@ def ocrd_cli_wrap_processor( # ocrd_network params end # **kwargs ): + # FIXME: remove workspace arg entirely + processor = processorClass(None) if not sys.argv[1:]: - processorClass(None, show_help=True) + processor.show_help(subcommand=subcommand) sys.exit(1) - if dump_json or dump_module_dir or help or version or show_resource or list_resources: - processorClass( - None, - dump_json=dump_json, - dump_module_dir=dump_module_dir, - show_help=help, - subcommand=subcommand, - show_version=version, - 
show_resource=show_resource, - list_resources=list_resources - ) + if help: + processor.show_help(subcommand=subcommand) + sys.exit() + if version: + processor.show_version() + sys.exit() + if dump_json: + processor.dump_json() + sys.exit() + if dump_module_dir: + processor.dump_module_dir() + sys.exit() + if resolve_resource: + try: + res = processor.resolve_resource(resolve_resource) + print(res) + sys.exit() + except ResourceNotFoundError as e: + log = getLogger('ocrd.processor.base') + log.critical(e.message) + sys.exit(1) + if show_resource: + try: + processor.show_resource(show_resource) + sys.exit() + except ResourceNotFoundError as e: + log = getLogger('ocrd.processor.base') + log.critical(e.message) + sys.exit(1) + if list_resources: + processor.list_resources() sys.exit() if subcommand: # Used for checking/starting network agents for the WebAPI architecture @@ -68,18 +91,13 @@ def ocrd_cli_wrap_processor( elif address or queue or database: raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") + # from here: single-run processing context initLogging() - - LOG = getLogger('ocrd.cli_wrap_processor') - assert kwargs['input_file_grp'] is not None - assert kwargs['output_file_grp'] is not None - # LOG.info('kwargs=%s' % kwargs) if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file - disposable = processorClass(workspace=None) def resolve(name): try: - return disposable.resolve_resource(name) + return processor.resolve_resource(name) except ResourceNotFoundError: return None kwargs['parameter'] = parse_json_string_or_file(*kwargs['parameter'], @@ -89,12 +107,11 @@ def resolve(name): # Merge parameter overrides and parameters if 'parameter_override' in kwargs: set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) - # TODO OCR-D/core#274 # Assert -I / -O - # if not kwargs['input_file_grp']: - # raise ValueError('-I/--input-file-grp is required') - # if not kwargs['output_file_grp']: - # raise ValueError('-O/--output-file-grp is required') + if not kwargs['input_file_grp']: + raise ValueError('-I/--input-file-grp is required') + if not kwargs['output_file_grp']: + raise ValueError('-O/--output-file-grp is required') resolver = Resolver() working_dir, mets, _, mets_server_url = \ resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 3cf132278..ff970b9a1 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -110,15 +110,6 @@ def __init__( output_file_grp=None, page_id=None, download_files=True, - # FIXME: deprecate all the following in favor of respective methods - resolve_resource=None, - show_resource=None, - list_resources=False, - show_help=False, - subcommand=None, - show_version=False, - dump_json=False, - dump_module_dir=False, version=None ): """ @@ -134,27 +125,17 @@ def __init__( Keyword Args: parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. - input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. - output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. + input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. 
+ output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ - (or empty for all pages). + (or empty for all pages). \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. download_files (boolean): Whether input files will be downloaded prior to processing. - resolve_resource (string): If not ``None``, then instead of processing, resolve \ - given resource by name and print its full path to stdout. - show_resource (string): If not ``None``, then instead of processing, resolve \ - given resource by name and print its contents to stdout. - list_resources (boolean): If true, then instead of processing, find all installed \ - resource files in the search paths and print their path names. - show_help (boolean): If true, then instead of processing, print a usage description \ - including the standard CLI and all of this processor's ocrd-tool parameters and \ - docstrings. - subcommand (string): 'worker' or 'server', only used here for the right --help output - show_version (boolean): If true, then instead of processing, print information on \ - this processor's version and OCR-D version. Exit afterwards. - dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \ - on stdout. - dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \ - on stdout. """ if ocrd_tool is not None: deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " @@ -165,48 +146,24 @@ def __init__( deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - " "use or override metadata/version properties instead") self._version = version - if dump_json: - print(json.dumps(self.ocrd_tool, indent=True)) - return - if dump_module_dir: - print(self.moduledir) - return - if list_resources: - for res in self.list_all_resources(): - print(res) - return - if resolve_resource: - try: - res = self.resolve_resource(resolve_resource) - print(res) - except ResourceNotFoundError as e: - log = getLogger('ocrd.processor.base') - log.critical(e.message) - sys.exit(1) - return - if show_resource: - try: - self.show_resource(show_resource) - except ResourceNotFoundError as e: - log = getLogger('ocrd.processor.base') - log.critical(e.message) - sys.exit(1) - return - if show_help: - self.show_help(subcommand=subcommand) - return - if show_version: - self.show_version() - return - self.workspace = workspace - if self.workspace: + if workspace is not None: deprecation_warning("Passing a workspace argument other than 'None' to Processor " "is deprecated - pass as argument to process_workspace instead") + self.workspace = workspace self.old_pwd = getcwd() os.chdir(self.workspace.directory) - self.input_file_grp = input_file_grp - self.output_file_grp = output_file_grp - self.page_id = None if page_id == [] or page_id is None else page_id + if input_file_grp is not None: + deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + self.input_file_grp = input_file_grp + if output_file_grp is not None: + deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + 
self.output_file_grp = output_file_grp + if page_id is not None: + deprecation_warning("Passing a page_id kwarg other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + self.page_id = page_id or None self.download = download_files if parameter is None: parameter = {} @@ -220,9 +177,16 @@ def __init__( deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) def show_help(self, subcommand=None): + """ + Print a usage description including the standard CLI and all of this processor's ocrd-tool + parameters and docstrings. + """ print(generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)) def show_version(self): + """ + Print information on this processor's version and OCR-D version. + """ print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION)) def verify(self): @@ -231,6 +195,28 @@ def verify(self): """ return True + def dump_json(self): + """ + Print :py:attr:`ocrd_tool` on stdout. + """ + print(json.dumps(self.ocrd_tool, indent=True)) + return + + def dump_module_dir(self): + """ + Print :py:attr:`moduledir` on stdout. + """ + print(self.moduledir) + return + + def list_resources(self): + """ + Find all installed resource files in the search paths and print their path names. + """ + for res in self.list_all_resources(): + print(res) + return + def setup(self) -> None: """ Prepare the processor for actual data processing, @@ -265,14 +251,10 @@ def process_workspace(self, workspace: Workspace) -> None: (This will iterate over pages and files, calling :py:meth:`process_page`, handling exceptions.) """ - assert self.input_file_grp is not None - assert self.output_file_grp is not None - input_file_grps = self.input_file_grp.split(',') - for input_file_grp in input_file_grps: - assert input_file_grp in workspace.mets.file_groups log = getLogger('ocrd.processor.base') with pushd_popd(workspace.directory): self.workspace = workspace + self.verify() try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort'): @@ -412,6 +394,14 @@ def resolve_resource(self, val): raise ResourceNotFoundError(val, executable) def show_resource(self, val): + """ + Resolve a resource name to a file path with the algorithm in + https://ocr-d.de/en/spec/ocrd_tool#file-parameters, + then print its contents to stdout. + + Args: + val (string): resource value to show + """ res_fname = self.resolve_resource(val) fpath = Path(res_fname) if fpath.is_dir(): @@ -477,7 +467,7 @@ def input_files(self): - Otherwise raise an error (complaining that only PAGE-XML warrants having multiple images for a single page) Algorithm _ - + Returns: A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects. 
""" diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 424c05772..b05ca9e6d 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -79,11 +79,6 @@ def executable(self): def version(self): return '0.0.3' - def setup(self): - super().setup() - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - @click.command() @ocrd_cli_options def cli(*args, **kwargs): diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 92846a6f0..dff14cfca 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -369,7 +369,7 @@ def get_cached_processor(parameter: dict, processor_class): """ if processor_class: dict_params = dict(parameter) if parameter else None - processor = processor_class(workspace=None, parameter=dict_params) + processor = processor_class(None, parameter=dict_params) processor.setup() return processor return None @@ -386,22 +386,13 @@ def get_processor( ): if processor_class: if instance_caching: - cached_processor = get_cached_processor( - parameter=parameter, - processor_class=processor_class - ) - cached_processor.workspace = workspace - cached_processor.page_id = page_id - cached_processor.input_file_grp = input_file_grp - cached_processor.output_file_grp = output_file_grp - return cached_processor - processor = processor_class( - workspace=workspace, - page_id=page_id, - input_file_grp=input_file_grp, - output_file_grp=output_file_grp, - parameter=parameter - ) - processor.setup() + processor = get_cached_processor(parameter, processor_class) + else: + processor = processor_class(None, parameter=parameter) + processor.setup() + processor.workspace = workspace + processor.page_id = page_id + processor.input_file_grp = input_file_grp + processor.output_file_grp = output_file_grp return processor raise ValueError("Processor class is not known") From 1f6f0c84f845a2928aaa17ce29eecd6d97fd50f1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:39:35 +0200 Subject: [PATCH 039/249] =?UTF-8?q?Processor=20/=20ocrd-tool.json:=20:fire?= =?UTF-8?q?:=20fileGrp=20cardinality=20checks=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `ocrd_tool.schema.yml`: - deprecate `input_file_grp` - deprecate `output_file_grp` - introduce+require `input_file_grp_cardinality`: number of min:max - introduce+require `output_file_grp_cardinality`: number of min:max - `ocrd_utils.assert_file_grp_cardinality`: deprecate - `Processor.verify`: check that - fileGrp attributes exist, - input fileGrp(s) exist in METS - input/output fileGrp(s) match the cardinality constraints, if specified in ocrd-tool.json: exact number, or minimum+maximum number (skipping negative or zero) (Processor implementors must now specify `input_file_grp_cardinality` and `output_file_grp_cardinality` in order to have a valid `ocrd-tool.json` again.) 
---
 src/ocrd/processor/base.py                 | 29 ++++++++++++++-
 .../processor/builtin/dummy/ocrd-tool.json |  6 ++-
 src/ocrd_utils/str.py                      |  2 +
 src/ocrd_validators/ocrd_tool.schema.yml   | 32 ++++++++++++++--
 4 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
index ff970b9a1..5cde4d9fe 100644
--- a/src/ocrd/processor/base.py
+++ b/src/ocrd/processor/base.py
@@ -191,8 +191,33 @@ def show_version(self):

     def verify(self):
         """
-        Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
-        """
+        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
+        """
+        assert self.input_file_grp is not None
+        assert self.output_file_grp is not None
+        input_file_grps = self.input_file_grp.split(',')
+        output_file_grps = self.output_file_grp.split(',')
+        def assert_file_grp_cardinality(grps, spec, msg):
+            if isinstance(spec, int) and spec > 0:
+                assert len(grps) == spec, msg % (len(grps), str(spec))
+            else:
+                minimum = spec[0]
+                maximum = spec[1]
+                if minimum > 0:
+                    assert len(grps) >= minimum, msg % (len(grps), str(spec))
+                if maximum > 0:
+                    assert len(grps) <= maximum, msg % (len(grps), str(spec))
+        # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here
+        # (but we already have ocrd-tool validation, and these first need to be adopted by implementors)
+        if 'input_file_grp_cardinality' in self.ocrd_tool:
+            assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
+                                        "Unexpected number of input file groups %d vs %s")
+        if 'output_file_grp_cardinality' in self.ocrd_tool:
+            assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
+                                        "Unexpected number of output file groups %d vs %s")
+        for input_file_grp in input_file_grps:
+            assert input_file_grp in self.workspace.mets.file_groups
+        # keep this for backwards compatibility:
         return True

     def dump_json(self):
diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json
index 30a6d99fd..ef4a4810f 100644
--- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json
+++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json
@@ -1,12 +1,14 @@
 {
+    "version": "1.0.0",
+    "git_url": "https://github.com/OCR-D/core",
     "tools": {
         "ocrd-dummy": {
             "executable": "ocrd-dummy",
             "description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group",
             "steps": ["preprocessing/optimization"],
             "categories": ["Image preprocessing"],
-            "input_file_grp": "DUMMY_INPUT",
-            "output_file_grp": "DUMMY_OUTPUT",
+            "input_file_grp_cardinality": 1,
+            "output_file_grp_cardinality": 1,
             "parameters": {
                 "copy_files": {
                     "type": "boolean",
diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py
index 51cce4bf2..38839064f 100644
--- a/src/ocrd_utils/str.py
+++ b/src/ocrd_utils/str.py
@@ -7,6 +7,7 @@ from typing import List, Union

 from .constants import REGEX_FILE_ID, SPARKLINE_CHARS
 from .deprecate import deprecation_warning
+from deprecated import deprecated
 from warnings import warn
 from numpy import array_split

@@ -26,6 +27,7 @@
 ]


+@deprecated(version='3.0', reason='specify input and output file_grp_cardinality in ocrd-tool.json instead')
 def assert_file_grp_cardinality(grps, n, msg=None):
     """
     Assert that a string of comma-separated fileGrps contains exactly ``n`` entries.
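As a standalone sketch (not part of this patch) of the check semantics that
supersede the helper above - an exact count for an integer spec, and
skippable minimum/maximum bounds for a pair:

    def check_cardinality(grps, spec):
        # mirrors the inner assert_file_grp_cardinality in Processor.verify
        if isinstance(spec, int) and spec > 0:
            assert len(grps) == spec          # exact number of fileGrps
        else:
            minimum, maximum = spec
            if minimum > 0:
                assert len(grps) >= minimum
            if maximum > 0:                   # non-positive bound: unlimited
                assert len(grps) <= maximum

    check_cardinality(['OCR-D-IMG'], 1)           # passes: exactly one
    check_cardinality(['GRP1', 'GRP2'], [1, -1])  # passes: at least one, unbounded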
diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml
index 766fd892c..db1b61458 100644
--- a/src/ocrd_validators/ocrd_tool.schema.yml
+++ b/src/ocrd_validators/ocrd_tool.schema.yml
@@ -29,28 +29,54 @@ properties:
         - steps
         - executable
         - categories
-        - input_file_grp
-        # Not required because not all processors produce output files
-        # - output_file_grp
+        - input_file_grp_cardinality
+        - output_file_grp_cardinality
       properties:
         executable:
           description: The name of the CLI executable in $PATH
           type: string
         input_file_grp:
+          deprecated: true
           description: Input fileGrp@USE this tool expects by default
           type: array
           items:
             type: string
             # pattern: '^OCR-D-[A-Z0-9-]+$'
         output_file_grp:
+          deprecated: true
           description: Output fileGrp@USE this tool produces by default
           type: array
           items:
             type: string
             # pattern: '^OCR-D-[A-Z0-9-]+$'
+        input_file_grp_cardinality:
+          description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited)
+          oneOf:
+            - type: number
+              multipleOf: 1
+            - type: array
+              items:
+                type: number
+                multipleOf: 1
+              minItems: 2
+              maxItems: 2
+          default: 1
+        output_file_grp_cardinality:
+          description: Number of (comma-separated) output fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited)
+          oneOf:
+            - type: number
+              multipleOf: 1
+            - type: array
+              items:
+                type: number
+                multipleOf: 1
+              minItems: 2
+              maxItems: 2
+          default: 1
         parameters:
           description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas.
           type: object
+          default: {}
           patternProperties:
             ".*":
               type: object

From 9b417d69ae0be9f51d6cc7dfbbc6a9c514738437 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Tue, 13 Aug 2024 23:49:09 +0200
Subject: [PATCH 040/249] test_processor: adapt to Processor init changes

---
 tests/processor/test_processor.py | 43 ++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index e0ebfbb1d..d4f0637f7 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -30,8 +30,11 @@ def setUp(self):

     def test_incomplete_processor(self):
         proc = IncompleteProcessor(None)
+        proc.input_file_grp = 'OCR-D-IMG'
+        proc.output_file_grp = 'DUMMY'
+        proc.page_id = None
         with self.assertRaises(NotImplementedError):
-            proc.process()
+            proc.process_workspace(self.workspace)

     def test_no_resolver(self):
         with self.assertRaisesRegex(Exception, 'pass a resolver to create a workspace'):
@@ -74,15 +77,20 @@ def test_parameter(self):
         self.assertEqual(processor.parameter['baz'], 'quux')

     def test_verify(self):
-        proc = DummyProcessor(self.workspace)
+        proc = DummyProcessor(None)
+        with self.assertRaises(AttributeError):
+            proc.verify()
+        proc.workspace = self.workspace
+        proc.input_file_grp = "OCR-D-IMG"
+        proc.output_file_grp = "DUMMY"
         self.assertEqual(proc.verify(), True)

     def test_json(self):
-        DummyProcessor(self.workspace, dump_json=True)
+        DummyProcessor(None).dump_json()

     def test_params_missing_required(self):
         with self.assertRaisesRegex(Exception, 'is a required property'):
-            DummyProcessorWithRequiredParameters(workspace=self.workspace)
+            DummyProcessorWithRequiredParameters(None)

     def test_params_preset_resolve(self):
         with pushd_popd(tempdir=True) as tempdir:
@@ -112,7 +120,7 @@ class ParamTestProcessor(Processor):
         @property
def ocrd_tool(self): return {} - proc = ParamTestProcessor(self.workspace) + proc = ParamTestProcessor(None) self.assertEqual(proc.parameter, {}) def test_run_agent(self): @@ -192,7 +200,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] assert ('foobar1', 'foobar2') in tuples assert ('foobar3', 'foobar4') in tuples @@ -217,7 +228,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id print("unfiltered") tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] assert ('foobar1', 'foobar2') in tuples @@ -228,7 +242,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4dup', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')] assert ('foobar1', 'foobar2') in tuples assert ('foobar3', 'foobar4') in tuples @@ -239,7 +256,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): tuples = proc.zip_input_files() @@ -255,7 +275,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001') for page_id in [None, 'phys_0001']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err From fbe83c9e9ed186664a42f54638cd2976ebf98a7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:50:42 +0200 Subject: [PATCH 041/249] adapt to ocrd-tool.json cardinality changes --- tests/cli/test_bashlib.py | 2 +- tests/cli/test_validate.py | 4 ++-- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index ab52b6b1b..1807b1f47 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -50,7 +50,7 @@ def invoke_bash(self, 
script, *args, executable=None): return -1, "", str(e) finally: os.remove(scriptfile.name) - + def setUp(self): self.maxDiff = None super().setUp() diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index 36ee3e599..ecfedc679 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -21,8 +21,8 @@ "ocrd-xyz": { "executable": "ocrd-xyz", "description": "bars all the foos", - "input_file_grp": ["OCR-D-FOO"], - "output_file_grp": ["OCR-D-BAR"], + "input_file_grp_cardinality": [1, 2], + "output_file_grp_cardinality": 1, "categories": ["Layout analysis"], "steps": ["layout/analysis"], "parameters": { diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 3ad40d864..5c89ecbf0 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -12,8 +12,8 @@ "ocrd-xyz": { "executable": "ocrd-xyz", "description": "bars all the foos", - "input_file_grp": ["OCR-D-FOO"], - "output_file_grp": ["OCR-D-BAR"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "categories": ["Layout analysis"], "steps": ["layout/analysis"] } From 09dd54bef98b03e7936ab3f19465e55e0bea70af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:09 +0200 Subject: [PATCH 042/249] use up-to-date kwargs (avoiding old deprecations) --- tests/data/__init__.py | 4 ++-- tests/processor/test_processor.py | 10 +++++----- tests/validator/test_page_validator.py | 9 +++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index ff403ebef..b299c512e 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -87,9 +87,9 @@ def process(self): file_id = make_file_id(input_file, self.output_file_grp) # print(input_file.ID, file_id) self.workspace.add_file( - ID=file_id, + file_id=file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index d4f0637f7..2cf8a189b 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -138,8 +138,8 @@ def test_run_input(self): def test_run_output0(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") @@ -148,10 +148,10 @@ def test_run_output0(self): def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') ws.overwrite_mode = True - ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', 
pageId='phys_0001') + ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') ws.overwrite_mode = False with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, diff --git a/tests/validator/test_page_validator.py b/tests/validator/test_page_validator.py index 79e92d90f..e6aaff152 100644 --- a/tests/validator/test_page_validator.py +++ b/tests/validator/test_page_validator.py @@ -16,9 +16,10 @@ def test_validate_err(self): PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_strategy='best') # test with deprecated name with self.assertRaisesRegex(Exception, 'page_textequiv_strategy best not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') + with self.assertWarnsRegex(DeprecationWarning, r'use page_textequiv_strategy'): + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') with self.assertRaisesRegex(Exception, 'page_textequiv_consistency level superstrictest not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', strategy='first') + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', page_textequiv_strategy='first') def test_validate_filename(self): report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME) @@ -44,7 +45,7 @@ def test_validate_lax(self): report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 26, '26 textequiv consistency errors - strict') - report = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax') + report = PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='lax') self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 1, '1 textequiv consistency errors - lax') def test_validate_multi_textequiv_first(self): @@ -89,7 +90,7 @@ def test_fix(self): ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True) report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 17, '17 textequiv consistency errors') - PageValidator.validate(ocrd_page=ocrd_page, strictness='fix') + PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='fix') report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 0, 'no more textequiv consistency errors') From af880e4a302332d23a58e45c0f933351c67cc936 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:59 +0200 Subject: [PATCH 043/249] hide/test expected deprecation warnings --- tests/data/__init__.py | 4 +++- tests/test_resolver.py | 29 +++++++++++++++-------------- tests/test_utils.py | 12 ++++++++---- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index b299c512e..e7ef30fc2 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,5 +1,6 @@ import json import os +from pytest import warns from ocrd import Processor from ocrd_utils import make_file_id @@ -38,7 +39,8 @@ def process(self): # override to prevent iterating over empty files def process_workspace(self, workspace): - self.process() + with warns(DeprecationWarning, match='should be replaced with process_page'): + self.process() class DummyProcessorWithRequiredParameters(Processor): 
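    # note: since core v3, `ocrd_tool` is overridden as a read-only property
    # on Processor subclasses (rather than passed to the constructor),
    # which is why the test fixtures below define it this way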
@property diff --git a/tests/test_resolver.py b/tests/test_resolver.py index abcf69257..7e102612e 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -287,20 +287,21 @@ def test_resolve_mets_arguments(): https://github.com/OCR-D/core/issues/517 """ resolver = Resolver() - assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) - assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) - assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) - with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): - resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) - with pytest.raises(ValueError, match="inconsistent with --directory"): - resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) - with pytest.warns(DeprecationWarning): - resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) - with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): - resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning, match='--mets-basename'): + assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) + assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) + assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) + with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): + resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) + with pytest.raises(ValueError, match="inconsistent with --directory"): + resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning): + resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) + with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): + resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) if __name__ == '__main__': main(__file__) diff --git a/tests/test_utils.py b/tests/test_utils.py index 89ff6d90f..dea7ad794 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -242,12 +242,16 @@ def test_set_json_key_value_overrides(): def test_assert_file_grp_cardinality(): with raises(AssertionError, match="Expected exactly 5 output file groups, but '.'FOO', 'BAR'.' 
has 2"): - assert_file_grp_cardinality('FOO,BAR', 5) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 5) with raises(AssertionError, match="Expected exactly 1 output file group, but '.'FOO', 'BAR'.' has 2"): - assert_file_grp_cardinality('FOO,BAR', 1) - assert_file_grp_cardinality('FOO,BAR', 2) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 1) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 2) with raises(AssertionError, match="Expected exactly 1 output file group .foo bar., but '.'FOO', 'BAR'.' has 2"): - assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar') + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar') def test_make_file_id_simple(): f = create_ocrd_file('MAX', ID="MAX_0012") From e381a0fe94a14150e8004c50869462f870e5b591 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:53:25 +0200 Subject: [PATCH 044/249] improve output in case of assertion failures --- tests/cli/test_validate.py | 22 ++++++++++----------- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index ecfedc679..0682ea7a0 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, err) # relative path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, err) # default path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,11 +84,11 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, _ = self.invoke_cli(validate_cli, ['tasks', + code, _, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), @@ -96,7 +96,7 @@ def test_validate_tasks(self): 
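              # each task string follows the `ocrd process` step syntax:
              # executable name, -I/-O fileGrp(s), -p parameter JSON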
"sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) if __name__ == '__main__': diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 5c89ecbf0..861235389 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): From 874b5061583342f82ef122ec0ab2718a84b20b45 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 11:03:37 +0200 Subject: [PATCH 045/249] Set VERSION to upcoming 3.0.0a1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 3d6ac35b1..a6f4248b2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.66.0 +3.0.0a1 From 5ffe3cb258ee2dc4dad8b095e0ac2ef914508933 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 11:44:53 +0200 Subject: [PATCH 046/249] CircleCI: use version 2.1 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c5ff8322..24c742aa6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,4 +1,4 @@ -version: 2 +version: 2.1 orbs: python: circleci/python@2.0.3 From 93a742efffb31cf4890585c555bfd7e1e77c22bc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 14 Aug 2024 13:41:42 +0200 Subject: [PATCH 047/249] test_bashlib: use version verbatim --- repo/spec | 2 +- src/ocrd_utils/config.py | 2 +- tests/cli/test_bashlib.py | 4 +--- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/repo/spec b/repo/spec index 506b33936..2bbd4dd91 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 +Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index b3a3e9537..d0955a8dc 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -125,7 +125,7 @@ def raw_value(self, name): description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz") config.add("OCRD_DOWNLOAD_RETRIES", - description="Number of times to retry failed attempts for downloads of workspace files.", + description="Number of times to retry failed attempts for downloads of resource or workspace files.", validator=int, parser=int) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index 1807b1f47..c4b2fd7da 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -104,10 +104,8 @@ def test_bashlib_minversion(self): exit_code, out, err = self.invoke_bash( "source $(ocrd 
bashlib filename) && ocrd__minversion 2.29.0") assert exit_code == 0 - (major, minor, patch) = map(int, str(VERSION).split('.')) - version = "%d.%d.%d" % (major, minor + 1, patch) exit_code, out, err = self.invoke_bash( - "source $(ocrd bashlib filename) && ocrd__minversion " + version) + "source $(ocrd bashlib filename) && ocrd__minversion " + VERSION) assert exit_code > 0 assert "ERROR: ocrd/core is too old" in err diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 861235389..2d035757e 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.errors)) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.errors)) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): From 51176841d330d6e238d35fabbad7db3ecceecf9d Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 16:17:46 +0200 Subject: [PATCH 048/249] . --- repo/spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/spec b/repo/spec index 506b33936..2bbd4dd91 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 +Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 From 456cc6dd65a40ccb17006392eea2d4e1481884a5 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 17:45:29 +0200 Subject: [PATCH 049/249] fix make spec --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 1b4ef47bd..886eed963 100644 --- a/Makefile +++ b/Makefile @@ -238,9 +238,9 @@ repo/assets repo/spec: always-update .PHONY: spec # Copy JSON Schema, OpenAPI from OCR-D/spec -spec: repo/spec - cp repo/spec/ocrd_tool.schema.yml ocrd_validators/ocrd_validators/ocrd_tool.schema.yml - cp repo/spec/bagit-profile.yml ocrd_validators/ocrd_validators/bagit-profile.yml +spec: # repo/spec + cp repo/spec/ocrd_tool.schema.yml src/ocrd_validators/ocrd_tool.schema.yml + cp repo/spec/bagit-profile.yml src/ocrd_validators/bagit-profile.yml # # Assets From 7a9fc2778f774cef304706f33a0f8f68a71b4fe6 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 18:39:46 +0200 Subject: [PATCH 050/249] adapt lib.bash to handle prerelease suffixes like a1, b2, rc3 --- src/ocrd/lib.bash | 31 +++++++++++++++++++++++-------- tests/cli/test_bashlib.py | 28 +++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 1e3ecfc6e..9e0460e6d 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -27,12 +27,22 @@ ocrd__log () { ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { - local minversion="$1" - local version=$(ocrd --version|sed 's/ocrd, version //') - #echo "$minversion < $version?" - local IFS=. 
- version=($version) - minversion=($minversion) + local minversion_raw="$1" + set -e + local version_raw=$(ocrd --version|sed 's/ocrd, version //') + local version_mmp=$(echo "$version_raw" | grep -Eo '([0-9]+\.?){3}') + local version_prerelease_suffix="${version_raw#$version_mmp}" + if [[ -z $version_prerelease_suffix ]];then + version_prerelease_suffix=0 + fi + local minversion_mmp=$(echo "$minversion_raw" | grep -Eo '([0-9]+\.?){3}') + local minversion_prerelease_suffix="${minversion_raw#$minversion_mmp}" + if [[ -z $minversion_prerelease_suffix ]];then + minversion_prerelease_suffix=0 + fi + local IFS='.' + version=($version_mmp) + minversion=($minversion_mmp) # MAJOR > MAJOR if (( ${version[0]} > ${minversion[0]} ));then return @@ -44,12 +54,17 @@ ocrd__minversion () { # MINOR == MINOR elif (( ${version[1]} == ${minversion[1]} ));then # PATCH > PATCH - if (( ${version[2]} >= ${minversion[2]} ));then + if (( ${version[2]} > ${minversion[2]} ));then + return + elif (( ${version[2]} == ${minversion[2]}));then + # Match prerelease suffix like a1, b1 only literally + if [[ $version_prerelease_suffix == $minversion_prerelease_suffix ]];then return + fi fi fi fi - ocrd__raise "ocrd/core is too old (${version[*]} < ${minversion[*]}). Please update OCR-D/core" + ocrd__raise "ocrd/core is too old ($version_raw < $minversion_raw). Please update OCR-D/core" } ## ### `ocrd__dumpjson` diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index c4b2fd7da..15af49350 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -1,4 +1,6 @@ from contextlib import contextmanager +import re +from typing import Tuple, Union from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory import os, sys @@ -20,6 +22,13 @@ from ocrd_utils import pushd_popd +def parse_version(v : str) -> Union[Tuple[int, int, int], Tuple[int, int, int, str]]: + tokens = re.split('((?:a|b|rc)[0-9]+)', v, 1) + version_wo_suffix = tokens[0] + prerelease_suffix = tokens[1] if len(tokens) > 1 else '' + (major, minor, patch) = map(int, version_wo_suffix.split('.')) + return (major, minor, patch, prerelease_suffix) + class TestBashlibCli(TestCase): def invoke_bash(self, script, *args, executable=None): @@ -101,13 +110,22 @@ def test_bashlib_defs(self): assert 'function' in out def test_bashlib_minversion(self): - exit_code, out, err = self.invoke_bash( - "source $(ocrd bashlib filename) && ocrd__minversion 2.29.0") + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion 2.29.0") assert exit_code == 0 - exit_code, out, err = self.invoke_bash( - "source $(ocrd bashlib filename) && ocrd__minversion " + VERSION) + major, minor, patch, prerelease_suffix = parse_version(VERSION) + + # test normal version with impossible minimum minor version + version = "%d.%d.%d" % (major, minor + 1, patch) + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion " + version) + assert exit_code > 0 + assert f"ERROR: ocrd/core is too old ({VERSION} < {version})" in err + + # test non-matching prerelease (the 99th alpha pre-release here) + version = "%d.%d.%da99" % (major, minor, patch) + assert VERSION != version # assuming we will never have 99 alpha prereleases ^^ + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion " + version) assert exit_code > 0 - assert "ERROR: ocrd/core is too old" in err + assert f"ERROR: ocrd/core is too old ({VERSION} < {version})" in err def 
test_bashlib_cp_processor(self): # script = (Path(__file__).parent.parent / 'data/bashlib_cp_processor.sh').read_text() From 90afb8a7dccbde24e147f774bfb8929ae56854d9 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 19:43:36 +0200 Subject: [PATCH 051/249] process_page_pcgts must return OcrdProcessResult --- src/ocrd/processor/base.py | 39 ++++++++++--------- src/ocrd/processor/builtin/dummy_processor.py | 10 +++-- src/ocrd/workspace.py | 2 +- src/ocrd_modelfactory/__init__.py | 2 +- src/ocrd_models/__init__.py | 1 + 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 43aec4ace..2a4679ed4 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -15,7 +15,7 @@ import os from os import getcwd from pathlib import Path -from typing import Optional +from typing import List, Optional import sys import inspect import tarfile @@ -23,6 +23,8 @@ from deprecated import deprecated from ocrd.workspace import Workspace +from ocrd_models.ocrd_file import OcrdFile +from ocrd_models.ocrd_process_result import OcrdProcessResult from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -309,7 +311,7 @@ def process_workspace(self, workspace: Workspace) -> None: # fall back to deprecated method self.process() - def process_page_file(self, *input_files) -> None: + def process_page_file(self, *input_files : OcrdFile) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, representing one physical page (passed as one opened @@ -321,7 +323,7 @@ def process_page_file(self, *input_files) -> None: to handle cases like multiple fileGrps, non-PAGE input etc.) """ log = getLogger('ocrd.processor.base') - input_pcgts = [None] * len(input_files) + input_pcgts : List[OcrdPage] = [None] * len(input_files) page_id = input_files[0].pageId for i, input_file in enumerate(input_files): # FIXME: what about non-PAGE input like image or JSON ??? @@ -331,28 +333,25 @@ def process_page_file(self, *input_files) -> None: except ValueError as e: log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) - output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) - if isinstance(output_pcgts, (list, tuple)): - output_images = output_pcgts[1:] - output_pcgts = output_pcgts[0] - for output_image_pil, output_image_id, output_image_path in output_images: - self.workspace.save_image_file( - output_image_pil, - output_image_id, - self.output_file_grp, - page_id=page_id, - file_path=output_image_path) - output_pcgts.set_pcGtsId(output_file_id) - self.add_metadata(output_pcgts) + result = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) + for output_image_pil, output_image_id, output_image_path in result.images: + self.workspace.save_image_file( + output_image_pil, + output_image_id, + self.output_file_grp, + page_id=page_id, + file_path=output_image_path) + result.pcgts.set_pcGtsId(output_file_id) + self.add_metadata(result.pcgts) # FIXME: what about non-PAGE output like JSON ??? 
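        # the (modified) PAGE result is serialized into the output fileGrp,
        # stored as <output_file_grp>/<output_file_id>.xml per the call below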
self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(output_pcgts)) + content=to_xml(result.pcgts)) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdProcessResult: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed @@ -374,7 +373,9 @@ def add_metadata(self, pcgts: OcrdPage) -> None: Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. """ - pcgts.get_Metadata().add_MetadataItem( + metadata_obj = pcgts.get_Metadata() + assert metadata_obj is not None + metadata_obj.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=self.ocrd_tool['executable'], diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index b05ca9e6d..e01f097d3 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,11 +1,13 @@ # pylint: disable=missing-module-docstring,invalid-name from os.path import join, basename +from typing import Optional import click from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_models.ocrd_page import to_xml +from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_models.ocrd_process_result import OcrdProcessResult from ocrd_utils import ( getLogger, assert_file_grp_cardinality, @@ -24,9 +26,9 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: # nothing to do here - return input_pcgts[0] + return OcrdProcessResult(input_pcgts[0]) def process_page_file(self, *input_files): LOG = getLogger('ocrd.dummy') @@ -48,7 +50,7 @@ def process_page_file(self, *input_files): content=content) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) - pcgts = self.process_page_pcgts(pcgts) + pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index fc619b7d0..eeaa6434f 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1073,7 +1073,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, return segment_image, segment_coords # pylint: disable=redefined-builtin - def save_image_file(self, image : Image, + def save_image_file(self, image : Image.Image, file_id : str, file_grp : str, file_path : Optional[str] = None, diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index 7afc5b176..a98499b2e 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -79,7 +79,7 @@ def page_from_image(input_file, with_tree=False): revmap = dict(((node, 
element) for element, node in mapping.items())) return pcgts, etree, mapping, revmap -def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]: +def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec..19d80a072 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -7,3 +7,4 @@ from .ocrd_mets import OcrdMets from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport +from .ocrd_process_result import OcrdProcessResult From 70ad19120f995fae79dbe37099411dd1df3c1554 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Aug 2024 11:29:28 +0200 Subject: [PATCH 052/249] bashlib ocrd__minversion: compare prerelease suffix alphabetically Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/lib.bash | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 9e0460e6d..65ef9c1ce 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -57,8 +57,8 @@ ocrd__minversion () { if (( ${version[2]} > ${minversion[2]} ));then return elif (( ${version[2]} == ${minversion[2]}));then - # Match prerelease suffix like a1, b1 only literally - if [[ $version_prerelease_suffix == $minversion_prerelease_suffix ]];then + # Match prerelease suffix like a1, b1 alphabetically + if [[ $version_prerelease_suffix = $minversion_prerelease_suffix -o $version_prerelease_suffix > $minversion_prerelease_suffix ]]; then return fi fi From 228272b6a4ee94795e8266af4182eacae38e713c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Aug 2024 12:14:57 +0200 Subject: [PATCH 053/249] fix ocrd_tool.schema.yml cardinality oneOf syntax, update spec --- repo/spec | 2 +- src/ocrd_validators/ocrd_tool.schema.yml | 43 +++++++++++------------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/repo/spec b/repo/spec index 2bbd4dd91..2948bca7b 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 +Subproject commit 2948bca7bda274137221abfdc0765c52beeedc33 diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml index db1b61458..5de65a04e 100644 --- a/src/ocrd_validators/ocrd_tool.schema.yml +++ b/src/ocrd_validators/ocrd_tool.schema.yml @@ -11,7 +11,7 @@ properties: type: string pattern: '^[0-9]+\.[0-9]+\.[0-9]+$' git_url: - description: Github/Gitlab URL + description: GitHub/GitLab URL type: string format: url dockerhub: @@ -37,14 +37,14 @@ properties: type: string input_file_grp: deprecated: true - description: Input fileGrp@USE this tool expects by default + description: (DEPRECATED) Input fileGrp@USE this tool expects by default type: array items: type: string # pattern: '^OCR-D-[A-Z0-9-]+$' output_file_grp: deprecated: true - description: Output fileGrp@USE this tool produces by default + description: (DEPRECATED) Output fileGrp@USE this tool produces by default type: array items: type: string @@ -52,31 +52,26 @@ properties: input_file_grp_cardinality: description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + 
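+        # the array form is an exact [minimum, maximum] pair; -1 means unlimited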
items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 - additionalProperties: false output_file_grp_cardinality: description: Number of (comma-separated) output fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 parameters: description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas. @@ -152,9 +147,9 @@ properties: description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change." default: false description: - description: Concise description what the tool does + description: Concise description of what the tool does categories: - description: Tools belong to this categories, representing modules within the OCR-D project structure + description: Tools belong to these categories, representing modules within the OCR-D project structure type: array items: type: string @@ -229,7 +224,7 @@ properties: default: 'as-is' path_in_archive: type: string - description: if type is archive, the resource is at this location in the archive + description: If type is archive, the resource is at this location in the archive default: '.' version_range: type: string @@ -237,4 +232,4 @@ properties: default: '>= 0.0.1' size: type: number - description: Size of the resource in bytes + description: "Size of the resource in bytes to be retrieved (for archives: size of the archive)" From 5aba83b91ea2d37943f13dddc7ab3c7c444c9af5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Aug 2024 12:22:53 +0200 Subject: [PATCH 054/249] bashlib: fix ocrd__minversion test syntax --- src/ocrd/lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 65ef9c1ce..82fa2005d 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -58,7 +58,7 @@ ocrd__minversion () { return elif (( ${version[2]} == ${minversion[2]}));then # Match prerelease suffix like a1, b1 alphabetically - if [[ $version_prerelease_suffix = $minversion_prerelease_suffix -o $version_prerelease_suffix > $minversion_prerelease_suffix ]]; then + if [ "$version_prerelease_suffix" = "$minversion_prerelease_suffix" -o "$version_prerelease_suffix" \> "$minversion_prerelease_suffix" ]; then return fi fi From 3d094d6cac7cca62fec4555f95d35ccac828cc14 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 13:30:02 +0200 Subject: [PATCH 055/249] reimplement OcrdPageResult --- src/ocrd/processor/base.py | 4 ++-- src/ocrd/processor/builtin/dummy_processor.py | 7 +++---- src/ocrd/processor/ocrd_page_result.py | 15 +++++++++++++++ src/ocrd_models/__init__.py | 1 - 4 files changed, 20 insertions(+), 7 deletions(-) create mode 100644 src/ocrd/processor/ocrd_page_result.py diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 2a4679ed4..5e3b8a7fd 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -24,7 +24,7 @@ from ocrd.workspace import Workspace from ocrd_models.ocrd_file import OcrdFile -from ocrd_models.ocrd_process_result import OcrdProcessResult +from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_utils import ( VERSION as OCRD_VERSION, 
MIMETYPE_PAGE, @@ -351,7 +351,7 @@ def process_page_file(self, *input_files : OcrdFile) -> None: mimetype=MIMETYPE_PAGE, content=to_xml(result.pcgts)) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdProcessResult: + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPageResult: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index e01f097d3..4ddb434f2 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -6,11 +6,10 @@ from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_models.ocrd_page import OcrdPage, to_xml -from ocrd_models.ocrd_process_result import OcrdProcessResult from ocrd_utils import ( getLogger, - assert_file_grp_cardinality, make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, @@ -26,9 +25,9 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: # nothing to do here - return OcrdProcessResult(input_pcgts[0]) + return OcrdPageResult(input_pcgts[0]) def process_page_file(self, *input_files): LOG = getLogger('ocrd.dummy') diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py new file mode 100644 index 000000000..6e00bd4e5 --- /dev/null +++ b/src/ocrd/processor/ocrd_page_result.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass, field +from typing import List +from ocrd_models.ocrd_page import OcrdPage +from PIL.Image import Image + +@dataclass +class OcrdPageResultImage(): + pil : Image + file_id : str + file_path : str + +@dataclass +class OcrdPageResult(): + pcgts : OcrdPage + images : List[OcrdPageResultImage] = field(default_factory=list) diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index 19d80a072..a89ee1dec 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -7,4 +7,3 @@ from .ocrd_mets import OcrdMets from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport -from .ocrd_process_result import OcrdProcessResult From f8b6896bf29f960cfdfea8941d2b5fbb2b2e81fa Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Aug 2024 13:44:13 +0200 Subject: [PATCH 056/249] update spec (with new ocrd_tool.schema) --- repo/spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/spec b/repo/spec index 2948bca7b..cb1ba2e72 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 2948bca7bda274137221abfdc0765c52beeedc33 +Subproject commit cb1ba2e72bd176f1a1076eea38d6438c647e68e7 From 72eb75b6509fdd3ba2e8d44fe4c6508b305110a6 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 13:44:48 +0200 Subject: [PATCH 057/249] update spec to v3.25.0, ocrd_tool.schema.yml --- repo/spec | 2 +- src/ocrd_validators/ocrd_tool.schema.yml | 43 +++++++++++------------- 2 files changed, 20 
insertions(+), 25 deletions(-) diff --git a/repo/spec b/repo/spec index 2bbd4dd91..cb1ba2e72 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 +Subproject commit cb1ba2e72bd176f1a1076eea38d6438c647e68e7 diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml index db1b61458..5de65a04e 100644 --- a/src/ocrd_validators/ocrd_tool.schema.yml +++ b/src/ocrd_validators/ocrd_tool.schema.yml @@ -11,7 +11,7 @@ properties: type: string pattern: '^[0-9]+\.[0-9]+\.[0-9]+$' git_url: - description: Github/Gitlab URL + description: GitHub/GitLab URL type: string format: url dockerhub: @@ -37,14 +37,14 @@ properties: type: string input_file_grp: deprecated: true - description: Input fileGrp@USE this tool expects by default + description: (DEPRECATED) Input fileGrp@USE this tool expects by default type: array items: type: string # pattern: '^OCR-D-[A-Z0-9-]+$' output_file_grp: deprecated: true - description: Output fileGrp@USE this tool produces by default + description: (DEPRECATED) Output fileGrp@USE this tool produces by default type: array items: type: string @@ -52,31 +52,26 @@ properties: input_file_grp_cardinality: description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 - additionalProperties: false output_file_grp_cardinality: description: Number of (comma-separated) output fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 parameters: description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas. @@ -152,9 +147,9 @@ properties: description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change." default: false description: - description: Concise description what the tool does + description: Concise description of what the tool does categories: - description: Tools belong to this categories, representing modules within the OCR-D project structure + description: Tools belong to these categories, representing modules within the OCR-D project structure type: array items: type: string @@ -229,7 +224,7 @@ properties: default: 'as-is' path_in_archive: type: string - description: if type is archive, the resource is at this location in the archive + description: If type is archive, the resource is at this location in the archive default: '.' 
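        # specifier string constraining which tool versions the resource
        # is compatible with (cf. the '>= 0.0.1' default below)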
version_range: type: string @@ -237,4 +232,4 @@ properties: default: '>= 0.0.1' size: type: number - description: Size of the resource in bytes + description: "Size of the resource in bytes to be retrieved (for archives: size of the archive)" From 75cb20c36ef9f82f858a82b9dc393679d7b20f8a Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 14:24:47 +0200 Subject: [PATCH 058/249] process_page_file: fix handling of images --- src/ocrd/processor/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5e3b8a7fd..626c3ca97 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -334,13 +334,13 @@ def process_page_file(self, *input_files : OcrdFile) -> None: log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) - for output_image_pil, output_image_id, output_image_path in result.images: + for image in result.images: self.workspace.save_image_file( - output_image_pil, - output_image_id, + image.pil, + image.file_id, self.output_file_grp, page_id=page_id, - file_path=output_image_path) + file_path=image.file_path) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) # FIXME: what about non-PAGE output like JSON ??? From 9a1c7ad083f2266e0aec3fe4cde6b956e04d7567 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 14:57:13 +0200 Subject: [PATCH 059/249] process_page_pcgts: remove output_file_id, replace OcrdPageResult.file_id with OcrdPageResult.file_id_suffix --- src/ocrd/processor/base.py | 41 +++++++++++-------- src/ocrd/processor/builtin/dummy_processor.py | 12 ++++-- src/ocrd/processor/ocrd_page_result.py | 2 +- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 626c3ca97..198537756 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -15,7 +15,7 @@ import os from os import getcwd from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Union import sys import inspect import tarfile @@ -23,8 +23,9 @@ from deprecated import deprecated from ocrd.workspace import Workspace -from ocrd_models.ocrd_file import OcrdFile +from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -200,10 +201,11 @@ def verify(self): assert self.output_file_grp is not None input_file_grps = self.input_file_grp.split(',') output_file_grps = self.output_file_grp.split(',') - def assert_file_grp_cardinality(grps, spec, msg): + def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg): if isinstance(spec, int) and spec > 0: assert len(grps) == spec, msg % (len(grps), str(spec)) else: + assert isinstance(spec, list) minimum = spec[0] maximum = spec[1] if minimum > 0: @@ -291,7 +293,7 @@ def process_workspace(self, workspace: Workspace) -> None: # - ResourceNotFoundError → use ResourceManager to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise - input_files = [None] * len(input_file_tuple) + input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) for i, 
input_file in enumerate(input_file_tuple): if i == 0: log.info("processing page %s", input_file.pageId) @@ -311,7 +313,7 @@ def process_workspace(self, workspace: Workspace) -> None: # fall back to deprecated method self.process() - def process_page_file(self, *input_files : OcrdFile) -> None: + def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, representing one physical page (passed as one opened @@ -323,21 +325,25 @@ def process_page_file(self, *input_files : OcrdFile) -> None: to handle cases like multiple fileGrps, non-PAGE input etc.) """ log = getLogger('ocrd.processor.base') - input_pcgts : List[OcrdPage] = [None] * len(input_files) + input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) + assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile)) page_id = input_files[0].pageId for i, input_file in enumerate(input_files): + assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) # FIXME: what about non-PAGE input like image or JSON ??? log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: - input_pcgts[i] = page_from_file(input_file) + page_ = page_from_file(input_file) + assert isinstance(page_, PcGtsType) + input_pcgts[i] = page_ except ValueError as e: log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) - result = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) + result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image in result.images: self.workspace.save_image_file( image.pil, - image.file_id, + f'{output_file_id}_{image.file_id_suffix}', self.output_file_grp, page_id=page_id, file_path=image.file_path) @@ -351,18 +357,21 @@ def process_page_file(self, *input_files : OcrdFile) -> None: mimetype=MIMETYPE_PAGE, content=to_xml(result.pcgts)) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed :py:class:`~ocrd_models.OcrdPage` per input fileGrp) under the given :py:attr:`parameter`, and return the - resulting :py:class:`~ocrd_models.OcrdPage`. - - Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage` - and one or more lists or tuples of :py:class:`PIL.Image` (image data), - :py:class:str (file ID) and :py:class:str (file path) of derived images - to be annotated along with the resulting PAGE file. + resulting :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult`. + + Optionally, add to the ``images`` attribute of the resulting + :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult` instances + of :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResultImage`, + which have required fields for ``pil`` (:py:class:`PIL.Image` image data), + ``file_id_suffix`` (used for generating IDs of saved images) and + ``file_path`` (the path used in the AlternativeImage and for saving the + file). (This contains the main functionality and must be overridden by subclasses.) 
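        A minimal subclass sketch (illustrative only; assuming the page
        merely needs in-place modification of its parsed annotation)::

            def process_page_pcgts(self, *input_pcgts, page_id=None):
                pcgts = input_pcgts[0]
                for region in pcgts.get_Page().get_TextRegion():
                    ...  # analyse or modify the PAGE annotation
                return OcrdPageResult(pcgts)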
""" diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 4ddb434f2..5ef76d2fa 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,13 +1,15 @@ # pylint: disable=missing-module-docstring,invalid-name from os.path import join, basename -from typing import Optional +from typing import Optional, Union import click from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( getLogger, make_file_id, @@ -25,13 +27,16 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + assert input_pcgts[0] # nothing to do here return OcrdPageResult(input_pcgts[0]) - def process_page_file(self, *input_files): + def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: LOG = getLogger('ocrd.dummy') input_file = input_files[0] + assert input_file + assert input_file.local_filename if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE: # we need to mimic the actual copying in addition to the PAGE boilerplate file_id = make_file_id(input_file, self.output_file_grp) @@ -49,6 +54,7 @@ def process_page_file(self, *input_files): content=content) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) + assert isinstance(pcgts, PcGtsType) pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py index 6e00bd4e5..92f926cb9 100644 --- a/src/ocrd/processor/ocrd_page_result.py +++ b/src/ocrd/processor/ocrd_page_result.py @@ -6,7 +6,7 @@ @dataclass class OcrdPageResultImage(): pil : Image - file_id : str + file_id_suffix : str file_path : str @dataclass From 60ad4247eab4d54846620431a2b23dd71499f8c4 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 15:07:48 +0200 Subject: [PATCH 060/249] OcrdPageResultImage requires passing alternative_image w/o filename set --- src/ocrd/processor/base.py | 13 ++++++++----- src/ocrd/processor/ocrd_page_result.py | 4 +++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 198537756..8feedcb88 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -9,7 +9,7 @@ 'run_processor' ] -from os.path import exists +from os.path import exists, join from shutil import copyfileobj import json import os @@ -340,13 +340,16 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) - for image in result.images: + for image_result in result.images: + image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' + image_file_path = join(self.output_file_grp, 
f'{image_file_id}.png') + image_result.alternative_image.set_filename(image_file_path) self.workspace.save_image_file( - image.pil, - f'{output_file_id}_{image.file_id_suffix}', + image_result.pil, + image_file_id, self.output_file_grp, page_id=page_id, - file_path=image.file_path) + file_path=image_file_path) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) # FIXME: what about non-PAGE output like JSON ??? diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py index 92f926cb9..c63330c73 100644 --- a/src/ocrd/processor/ocrd_page_result.py +++ b/src/ocrd/processor/ocrd_page_result.py @@ -3,11 +3,13 @@ from ocrd_models.ocrd_page import OcrdPage from PIL.Image import Image +from ocrd_models.ocrd_page_generateds import AlternativeImageType + @dataclass class OcrdPageResultImage(): pil : Image file_id_suffix : str - file_path : str + alternative_image : AlternativeImageType @dataclass class OcrdPageResult(): From 50dfdd6356395f9965505f915e4641d1dc553834 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Aug 2024 17:06:19 +0200 Subject: [PATCH 061/249] Processor.verify: handle -1 case Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/processor/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8feedcb88..230d1fdba 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -202,8 +202,9 @@ def verify(self): input_file_grps = self.input_file_grp.split(',') output_file_grps = self.output_file_grp.split(',') def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg): - if isinstance(spec, int) and spec > 0: - assert len(grps) == spec, msg % (len(grps), str(spec)) + if isinstance(spec, int): + if spec > 0: + assert len(grps) == spec, msg % (len(grps), str(spec)) else: assert isinstance(spec, list) minimum = spec[0] From 53f2634280c437bc057e23bfc7e0992ae7930a82 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Aug 2024 17:21:32 +0200 Subject: [PATCH 062/249] processor.base: remove obsolete FIXME Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/processor/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 230d1fdba..17d9eac40 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -331,7 +331,6 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_id = input_files[0].pageId for i, input_file in enumerate(input_files): assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) - # FIXME: what about non-PAGE input like image or JSON ??? 
log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: page_ = page_from_file(input_file) From d210afa527003c7f8ed4af5ea3853dc0db5ccd52 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Aug 2024 17:26:14 +0200 Subject: [PATCH 063/249] Processor.process_page_pcgts: update docstring for file_path/alternative_image Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/processor/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 17d9eac40..9daa23697 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -372,9 +372,9 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult` instances of :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResultImage`, which have required fields for ``pil`` (:py:class:`PIL.Image` image data), - ``file_id_suffix`` (used for generating IDs of saved images) and - ``file_path`` (the path used in the AlternativeImage and for saving the - file). + ``file_id_suffix`` (used for generating IDs of the saved image) and + ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType` + for setting the filename of the saved image). (This contains the main functionality and must be overridden by subclasses.) """ From 5718cf92b7a1729f789992569de801660837cb76 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 17:28:44 +0200 Subject: [PATCH 064/249] export OcrdPageResult{Image} from ocrd.processor --- src/ocrd/processor/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index 21b0c69eb..0b3ce5a56 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -2,6 +2,10 @@ Processor, ResourceNotFoundError ) +from .ocrd_page_result import ( + OcrdPageResult, + OcrdPageResultImage +) from .helpers import ( run_cli, run_processor, From f5f3145ef4dc902edf71e72ee4ca5fdaf6640361 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 15 Aug 2024 17:29:17 +0200 Subject: [PATCH 065/249] Processor.process.page_pcgts: simplify references in docstring --- src/ocrd/processor/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8feedcb88..54f05d6d1 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -366,11 +366,11 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option representing one physical page (passed as one parsed :py:class:`~ocrd_models.OcrdPage` per input fileGrp) under the given :py:attr:`parameter`, and return the - resulting :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult`. + resulting :py:class:`~ocrd.processor.OcrdPageResult`. 
Optionally, add to the ``images`` attribute of the resulting - :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult` instances - of :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResultImage`, + :py:class:`~ocrd.processor.OcrdPageResult` instances + of :py:class:`~ocrd.processor.OcrdPageResultImage`, which have required fields for ``pil`` (:py:class:`PIL.Image` image data), ``file_id_suffix`` (used for generating IDs of saved images) and ``file_path`` (the path used in the AlternativeImage and for saving the From 7045318105e9e58328a6c12adb63e0fbae1e9a69 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 17:34:43 +0200 Subject: [PATCH 066/249] allow "from ocrd_models import OcrdPage --- src/ocrd_models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec..330fefe97 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -5,5 +5,6 @@ from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile, ClientSideOcrdFile from .ocrd_mets import OcrdMets +from .ocrd_page import OcrdPage from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport From 3220e3f6d9805949e2e40f48a43f2509e0e25936 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 19:11:12 +0200 Subject: [PATCH 067/249] :memo: v3.0.0a1 --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd816a354..1b6a47d02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,20 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). +## [3.0.0a1] - 2024-08-15 + +Changed: + - :fire: Deprecate `Processor.process` + - update spec to v3.25.0, which allows annotating fileGrp cardinality in `ocrd-tool.json` + - `ocrd.processor`: Handle loading of bundled `ocrd-tool.json` generically + +Added: + - `Processor.process_workspace`: process a complete workspace, with default implementation + - `Processor.process_page_file`: process an OcrdFile, with default implementation + - `Processor.process_page_pcgts`: process a single OcrdPage, produce a single OcrdPage, required to implement + - `Processor.verify`: handle fileGrp cardinality verification, with default implementation + - `Processor.setup`: to set up processor before processing, optional + ## Unreleased Changed: From e1f5744746b29851aa2e2241f8ea3546be965cdc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 16 Aug 2024 09:58:37 +0200 Subject: [PATCH 068/249] Update CHANGELOG.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b6a47d02..b0e89bb8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). Changed: - :fire: Deprecate `Processor.process` - - update spec to v3.25.0, which allows annotating fileGrp cardinality in `ocrd-tool.json` + - update spec to v3.25.0, which requires annotating fileGrp cardinality in `ocrd-tool.json` + - :fire: Remove passing non-processing kwargs to `Processor` constructor, add as members + (i.e. `show_help`, `dump_json`, `dump_module_dir`, `list_resources`, `show_resource`, `resolve_resource`) + - :fire: Deprecate passing processing arg / kwargs to `Processor` constructor + (i.e. 
`workspace`, `page_id`, `input_file_grp`, `output_file_grp`; now all set by `run_processor`) + - :fire: Deprecate passing `ocrd-tool.json` metadata to `Processor` constructor - `ocrd.processor`: Handle loading of bundled `ocrd-tool.json` generically Added: From 80d42f1bb17a67d6b32e4edb8cb3e66ce42badd4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:00:02 +0200 Subject: [PATCH 069/249] ocrd: more convenience imports --- src/ocrd/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/__init__.py b/src/ocrd/__init__.py index 9aa507b2c..e4c782685 100644 --- a/src/ocrd/__init__.py +++ b/src/ocrd/__init__.py @@ -15,7 +15,8 @@ """ from ocrd.processor.base import run_processor, run_cli, Processor, ResourceNotFoundError -from ocrd_models import OcrdMets, OcrdExif, OcrdFile, OcrdAgent +from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage +from ocrd_models import OcrdMets, OcrdPage, OcrdExif, OcrdFile, OcrdAgent from ocrd.resolver import Resolver from ocrd_validators import * from ocrd.workspace import Workspace From 0e57b4b3897b2dc03a0c8480d146e2e403ee4a23 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:01:01 +0200 Subject: [PATCH 070/249] ocrd.cli: more fix module import order, export help cmd --- src/ocrd/cli/__init__.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 9b80abeb4..198406afd 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -10,7 +10,18 @@ from ocrd_utils import config -__all__ = ['cli'] +from ..decorators import ocrd_loglevel +from .ocrd_tool import ocrd_tool_cli +from .workspace import workspace_cli +from .process import process_cli +from .bashlib import bashlib_cli +from .validate import validate_cli +from .resmgr import resmgr_cli +from .zip import zip_cli +from .log import log_cli +from .network import network_cli + +__all__ = ['cli', 'command_with_replaced_help'] _epilog = f""" @@ -54,6 +65,7 @@ {config.describe('OCRD_LOGGING_DEBUG')} """ + def command_with_replaced_help(*replacements): class CommandWithReplacedHelp(click.Command): @@ -66,17 +78,6 @@ def get_help(self, ctx): return CommandWithReplacedHelp -from ocrd.cli.ocrd_tool import ocrd_tool_cli -from ocrd.cli.workspace import workspace_cli -from ocrd.cli.process import process_cli -from ocrd.cli.bashlib import bashlib_cli -from ocrd.cli.validate import validate_cli -from ocrd.cli.resmgr import resmgr_cli -from ocrd.decorators import ocrd_loglevel -from .zip import zip_cli -from .log import log_cli -from .network import network_cli - @click.group(epilog=_epilog) @click.version_option(package_name='ocrd') From 9cfd70cffcc71118293a391310ed7eb3eff7b7a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:01:19 +0200 Subject: [PATCH 071/249] fix imports --- src/ocrd/decorators/parameter_option.py | 2 +- src/ocrd/workspace.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 0fbe3e057..55abbc2a5 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -1,10 +1,10 @@ from click import option -#from ocrd_utils import parse_json_string_or_file __all__ = ['parameter_option', 'parameter_override_option'] def _handle_param_option(ctx, param, value): + from ocrd_utils import parse_json_string_or_file return 
parse_json_string_or_file(*list(value)) parameter_option = option('-p', '--parameter', diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index eeaa6434f..509b8123b 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -24,6 +24,7 @@ coordinates_of_segment, adjust_canvas_to_rotation, adjust_canvas_to_transposition, + scale_coordinates, shift_coordinates, rotate_coordinates, transform_coordinates, From 95212b598f19ca4576ff2c22ec5573bf1cd5de7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:04:19 +0200 Subject: [PATCH 072/249] fix type assertion --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 46417ac2c..79b52dde6 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -25,7 +25,6 @@ from ocrd.workspace import Workspace from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile from ocrd.processor.ocrd_page_result import OcrdPageResult -from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -334,9 +333,10 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: page_ = page_from_file(input_file) - assert isinstance(page_, PcGtsType) + assert isinstance(page_, OcrdPage) input_pcgts[i] = page_ except ValueError as e: + # not PAGE and not an image to generate PAGE for log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) From 4aa288a7ff57a2fd2255d5ed8606ea39102d42a5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:13:58 +0200 Subject: [PATCH 073/249] ocrd_utils: forgot to export scale_coordinates at toplvl --- src/ocrd_utils/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index 2055758a8..78400791a 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -13,6 +13,7 @@ :py:meth:`ocrd.workspace.Workspace.image_from_segment`.) 
* :py:func:`rotate_coordinates`, + :py:func:`scale_coordinates`, :py:func:`shift_coordinates`, :py:func:`transpose_coordinates`, :py:func:`transform_coordinates` @@ -147,6 +148,7 @@ polygon_mask, rotate_coordinates, rotate_image, + scale_coordinates, shift_coordinates, transform_coordinates, transpose_coordinates, From 8044e60590655e19ec3a3127efb925e428d08e6a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:34:57 +0200 Subject: [PATCH 074/249] fix 9cfd70cffcc --- src/ocrd/cli/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 198406afd..a79faabe9 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -21,7 +21,7 @@ from .log import log_cli from .network import network_cli -__all__ = ['cli', 'command_with_replaced_help'] +__all__ = ['cli'] _epilog = f""" From 21ff810f68c76311ce504d00bc9babde7d14b963 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:58:58 +0200 Subject: [PATCH 075/249] fix 9cfd70cffcc (revert to wrong import order to avoid circle) --- src/ocrd/cli/__init__.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index a79faabe9..322cbde19 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -10,19 +10,6 @@ from ocrd_utils import config -from ..decorators import ocrd_loglevel -from .ocrd_tool import ocrd_tool_cli -from .workspace import workspace_cli -from .process import process_cli -from .bashlib import bashlib_cli -from .validate import validate_cli -from .resmgr import resmgr_cli -from .zip import zip_cli -from .log import log_cli -from .network import network_cli - -__all__ = ['cli'] - _epilog = f""" \b @@ -65,7 +52,6 @@ {config.describe('OCRD_LOGGING_DEBUG')} """ - def command_with_replaced_help(*replacements): class CommandWithReplacedHelp(click.Command): @@ -79,6 +65,17 @@ def get_help(self, ctx): return CommandWithReplacedHelp +from ..decorators import ocrd_loglevel +from .ocrd_tool import ocrd_tool_cli +from .workspace import workspace_cli +from .process import process_cli +from .bashlib import bashlib_cli +from .validate import validate_cli +from .resmgr import resmgr_cli +from .zip import zip_cli +from .log import log_cli +from .network import network_cli + @click.group(epilog=_epilog) @click.version_option(package_name='ocrd') @ocrd_loglevel @@ -96,3 +93,5 @@ def cli(**kwargs): # pylint: disable=unused-argument cli.add_command(log_cli) cli.add_command(resmgr_cli) cli.add_command(network_cli) + +__all__ = ['cli'] From 4077e8d8f8c306d524bdb0faf5faa9557999d556 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 16 Aug 2024 16:09:30 +0200 Subject: [PATCH 076/249] s,PcGtsType,OcrdPage, --- src/ocrd/processor/base.py | 2 +- src/ocrd/processor/builtin/dummy_processor.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 79b52dde6..344569677 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -383,7 +383,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option def add_metadata(self, pcgts: OcrdPage) -> None: """ Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing - the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. 
+ the processing step and runtime parameters to :py:class:`~ocrd_models.OcrdPage` ``pcgts``. """ metadata_obj = pcgts.get_Metadata() assert metadata_obj is not None diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 5ef76d2fa..1b3f7a5aa 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -9,7 +9,6 @@ from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile from ocrd_models.ocrd_page import OcrdPage, to_xml -from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( getLogger, make_file_id, @@ -54,7 +53,7 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr content=content) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) - assert isinstance(pcgts, PcGtsType) + assert isinstance(pcgts, OcrdPage) pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) From cd4c96c94c6424628de2ccf2eb503d6933eaad9d Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 14:17:14 +0200 Subject: [PATCH 077/249] add config.OCRD_DOWNLOAD_INPUT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd/processor/base.py | 6 ++++-- src/ocrd_utils/config.py | 18 +++++++++++++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 322cbde19..bf262b0b9 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -29,6 +29,8 @@ \b {config.describe('OCRD_DOWNLOAD_TIMEOUT')} \b +{config.describe('OCRD_DOWNLOAD_INPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677..78f8b1237 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -29,6 +29,7 @@ VERSION as OCRD_VERSION, MIMETYPE_PAGE, MIME_TO_EXT, + config, getLogger, initLogging, list_resource_candidates, @@ -111,7 +112,7 @@ def __init__( input_file_grp=None, output_file_grp=None, page_id=None, - download_files=True, + download_files=config.OCRD_DOWNLOAD_INPUT, version=None ): """ @@ -137,7 +138,8 @@ def __init__( (or empty for all pages). \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. - download_files (boolean): Whether input files will be downloaded prior to processing. 
+ download_files (boolean): Whether input files will be downloaded prior to processing, \ + defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT` which is ``True`` by default """ if ocrd_tool is not None: deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index d0955a8dc..22a566e7b 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -12,6 +12,8 @@ from tempfile import gettempdir from textwrap import fill, indent +_validator_boolean = lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') +_parser_boolean = lambda val: bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') class OcrdEnvVariable(): @@ -102,8 +104,8 @@ def raw_value(self, name): config.add('OCRD_METS_CACHING', description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.', - validator=lambda val: val in ('true', 'false', '0', '1'), - parser=lambda val: val in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) config.add('OCRD_MAX_PROCESSOR_CACHE', description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.", @@ -125,7 +127,7 @@ def raw_value(self, name): description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz") config.add("OCRD_DOWNLOAD_RETRIES", - description="Number of times to retry failed attempts for downloads of resource or workspace files.", + description="Number of times to retry failed attempts for downloads of resources or workspace files.", validator=int, parser=int) @@ -141,6 +143,12 @@ def _ocrd_download_timeout_parser(val): description="Timeout in seconds for connecting or reading (comma-separated) when downloading.", parser=_ocrd_download_timeout_parser) +config.add("OCRD_DOWNLOAD_INPUT", + description="Whether to download files not present locally during processing", + default=(True, True), + validator=_validator_boolean, + parser=_parser_boolean) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) @@ -190,5 +198,5 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_LOGGING_DEBUG", description="Print information about the logging setup to STDERR", default=(True, False), - validator=lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1'), - parser=lambda val: val if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) From 312525517f9c01f25634d35bbef256b5c56372e7 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 14:51:39 +0200 Subject: [PATCH 078/249] define self.logger in processor base constructor --- src/ocrd/processor/base.py | 14 +++++++++----- src/ocrd/processor/builtin/dummy_processor.py | 3 +-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677..5d9637b80 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -47,6 +47,8 @@ # XXX imports must remain for backwards-compatibility from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import +_logger = getLogger('ocrd.processor.base') + class 
ResourceNotFoundError(FileNotFoundError): """ An exception signifying the requested processor resource @@ -175,6 +177,9 @@ def __init__( if not report.is_valid: raise ValueError("Invalid parameters %s" % report.errors) self.parameter = parameter + # NOTE: this is the logger to be used by processor implementations, + # `processor.base` default implementations should use :py:attr:`_logger` + self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) @@ -562,7 +567,6 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): if not self.input_file_grp: raise ValueError("Processor is missing input fileGrp") - LOG = getLogger('ocrd.processor.base') ifgs = self.input_file_grp.split(",") # Iterating over all files repeatedly may seem inefficient at first sight, # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering @@ -582,13 +586,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") if on_error == 'abort': raise ValueError(msg) - LOG.warning(msg) + _logger.warning(msg) for file_ in files_: if not file_.pageId: continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + _logger.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen @@ -627,14 +631,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: - LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) + _logger.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? 
- LOG.error('found no page %s in file group %s', + _logger.error('found no page %s in file group %s', page, ifg) if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 1b3f7a5aa..29082e72d 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -32,7 +32,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional return OcrdPageResult(input_pcgts[0]) def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: - LOG = getLogger('ocrd.dummy') input_file = input_files[0] assert input_file assert input_file.local_filename @@ -57,7 +56,7 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) - LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file) + self.logger.info("Add PAGE-XML %s generated for %s", file_id, output_file) self.workspace.add_file(file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, From dcf7c52e0e3c6de3105ff6bab0633cbae2a24ae7 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 18:58:53 +0200 Subject: [PATCH 079/249] OcrdPage proxy object for PcGtsType, including etree and mappings --- src/ocrd_modelfactory/__init__.py | 27 ++++++++++++-------------- src/ocrd_models/ocrd_page.py | 28 +++++++++++++++++++++++---- src/ocrd_validators/page_validator.py | 4 ++-- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index a98499b2e..c0600e51f 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -14,9 +14,10 @@ from ocrd_utils import VERSION, MIMETYPE_PAGE, guess_media_type from ocrd_models import OcrdExif, OcrdFile, ClientSideOcrdFile from ocrd_models.ocrd_page import ( - PcGtsType, PageType, MetadataType, + OcrdPage, PcGtsType, PageType, MetadataType, parse, parseEtree ) +from ocrd_utils.deprecate import deprecation_warning __all__ = [ 'exif_from_filename', @@ -39,7 +40,7 @@ def exif_from_filename(image_filename): ocrd_exif = OcrdExif(pil_img) return ocrd_exif -def page_from_image(input_file, with_tree=False): +def page_from_image(input_file : Union[OcrdFile, ClientSideOcrdFile], **kwargs) -> OcrdPage: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` @@ -48,10 +49,9 @@ def page_from_image(input_file, with_tree=False): Arguments: input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile`): file to open \ and produce a PAGE DOM for - Keyword arguments: - with_tree (boolean): whether to return XML node tree, element-node mapping \ - and reverse mapping, too (cf. 
:py:func:`ocrd_models.ocrd_page.parseEtree`) """ + if 'with_etree' in kwargs: + deprecation_warning('kwarg "with_etree" is obsolete now, we always return OcrdPage including etree') if not input_file.local_filename: raise ValueError("input_file must have 'local_filename' property") if not Path(input_file.local_filename).exists(): @@ -72,14 +72,12 @@ def page_from_image(input_file, with_tree=False): ), pcGtsId=input_file.ID ) - if not with_tree: - return pcgts mapping = dict() - etree = pcgts.to_etree(mapping_=mapping) + etree : ET._Element = pcgts.to_etree(mapping_=mapping) revmap = dict(((node, element) for element, node in mapping.items())) - return pcgts, etree, mapping, revmap + return OcrdPage(pcgts, etree, mapping, revmap) -def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]: +def page_from_file(input_file, **kwargs) -> OcrdPage: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path @@ -88,10 +86,9 @@ def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsT Arguments: input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile` or `str`): file to open \ and produce a PAGE DOM for - Keyword arguments: - with_tree (boolean): whether to return XML node tree, element-node mapping \ - and reverse mapping, too (cf. :py:func:`ocrd_models.ocrd_page.parseEtree`) """ + if 'with_etree' in kwargs: + deprecation_warning('kwarg "with_etree" is obsolete now, we always return OcrdPage including etree') if not isinstance(input_file, (OcrdFile, ClientSideOcrdFile)): mimetype = guess_media_type(input_file, application_xml=MIMETYPE_PAGE) input_file = OcrdFile(ET.Element("dummy"), @@ -102,7 +99,7 @@ def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsT if not Path(input_file.local_filename).exists(): raise FileNotFoundError("File not found: '%s' (%s)" % (input_file.local_filename, input_file)) if input_file.mimetype.startswith('image'): - return page_from_image(input_file, with_tree=with_tree) + return page_from_image(input_file) if input_file.mimetype == MIMETYPE_PAGE: - return (parseEtree if with_tree else parse)(input_file.local_filename, silence=True) + return OcrdPage(*parseEtree(input_file.local_filename, silence=True)) raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype) diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index b0cc2b331..e649baace 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -2,6 +2,8 @@ API to PAGE-XML, generated with generateDS from XML schema. """ from io import StringIO +from inspect import getmembers +from lxml import etree as ET __all__ = [ 'parse', @@ -174,10 +176,28 @@ """ ) -# add alias for DOM root -OcrdPage = PcGtsType - -def to_xml(el, skip_declaration=False): +class OcrdPage(): + """ + Proxy object for :py:class:`ocrd_models.PcGtsType` that also offers access + to the underlying etree, element-node mapping and reverse mapping, too (cf. + :py:func:`ocrd_models.ocrd_page.parseEtree`) + """ + def __init__( + self, + pcgts : PcGtsType, + etree : ET._Element, + mapping : dict[str, ET._Element], + revmap : dict[ET._Element, str], + ): + self._pcgts = pcgts + self.etree = etree + self.mapping = mapping + self.revmap = revmap + + def __getattr__(self, name): + return getattr(self._pcgts, name) + +def to_xml(el, skip_declaration=False) -> str: """ Serialize ``pc:PcGts`` document as string. 
""" diff --git a/src/ocrd_validators/page_validator.py b/src/ocrd_validators/page_validator.py index 41ce0b9f9..d6d8a95b5 100644 --- a/src/ocrd_validators/page_validator.py +++ b/src/ocrd_validators/page_validator.py @@ -6,7 +6,7 @@ from shapely.validation import explain_validity from ocrd_utils import getLogger, polygon_from_points, deprecated_alias -from ocrd_models.ocrd_page import parse +from ocrd_models.ocrd_page import OcrdPage, parse from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -236,7 +236,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate and whether the coordinates of an element are fully within its parent element coordinates. """ log = getLogger('ocrd.page_validator.validate_consistency') - if isinstance(node, PcGtsType): + if isinstance(node, (PcGtsType, OcrdPage)): # top-level (start recursion) node_id = node.get_pcGtsId() node = node.get_Page() # has no .id From cf45d8b30047c68fa97bfb22de0622232205bc15 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 19:04:53 +0200 Subject: [PATCH 080/249] Processor.base: have a (hopefully) thread-safe logger for the base class --- src/ocrd/processor/base.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5d9637b80..785a139ec 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -47,7 +47,6 @@ # XXX imports must remain for backwards-compatibility from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import -_logger = getLogger('ocrd.processor.base') class ResourceNotFoundError(FileNotFoundError): """ @@ -178,8 +177,10 @@ def __init__( raise ValueError("Invalid parameters %s" % report.errors) self.parameter = parameter # NOTE: this is the logger to be used by processor implementations, - # `processor.base` default implementations should use :py:attr:`_logger` + # `processor.base` default implementations should use + # :py:attr:`self._base_logger` self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') + self._base_logger = getLogger('ocrd.processor.base') # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) @@ -586,13 +587,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") if on_error == 'abort': raise ValueError(msg) - _logger.warning(msg) + self._base_logger.warning(msg) for file_ in files_: if not file_.pageId: continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - _logger.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + self._base_logger.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen @@ -631,14 +632,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: - _logger.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) + self._base_logger.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ ifts = list() for 
page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? - _logger.error('found no page %s in file group %s', + self._base_logger.error('found no page %s in file group %s', page, ifg) if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) From 785d60736919590aaa6e2c84a6a487dc46d12468 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:05:24 +0200 Subject: [PATCH 081/249] Processor.zip_input_files: warning instead of exception for missing input files --- src/ocrd/processor/base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677..958661f79 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -575,16 +575,9 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), # sort by MIME type so PAGE comes before images key=lambda file_: file_.mimetype) - # Warn if no files found but pageId was specified because that - # might be because of invalid page_id (range) - if self.page_id and not files_: - msg = (f"Could not find any files for --page-id {self.page_id} - " - f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - if on_error == 'abort': - raise ValueError(msg) - LOG.warning(msg) for file_ in files_: if not file_.pageId: + # ignore document-global files continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: @@ -629,13 +622,15 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ + # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) + if self.page_id and not any(pages): + LOG.critical(f"Could not find any files for selected pageId {self.page_id}") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? - LOG.error('found no page %s in file group %s', - page, ifg) + LOG.error(f'Found no page {page} in file group {ifg}') if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts From b12849da6dd4a46dd0d9a121c50f9438cb61d6e1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:07:16 +0200 Subject: [PATCH 082/249] Processor.zip_input_files: introduce NonUniqueInputFile exception --- src/ocrd/processor/__init__.py | 4 ++- src/ocrd/processor/base.py | 46 ++++++++++++++++++++++--------- tests/processor/test_processor.py | 6 ++-- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index 0b3ce5a56..b6c1188de 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -1,6 +1,8 @@ from .base import ( Processor, - ResourceNotFoundError + ResourceNotFoundError, + NonUniqueInputFile, + MissingInputFile, ) from .ocrd_page_result import ( OcrdPageResult, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 958661f79..516989ae2 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -55,9 +55,36 @@ class ResourceNotFoundError(FileNotFoundError): def __init__(self, name, executable): self.name = name self.executable = executable - self.message = "Could not find resource '%s' for executable '%s'. 
" \ - "Try 'ocrd resmgr download %s %s' to download this resource." \ - % (name, executable, executable, name) + self.message = (f"Could not find resource '{name}' for executable '{executable}'. " + f"Try 'ocrd resmgr download {executable} {name}' to download this resource.") + super().__init__(self.message) + +class NonUniqueInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields multiple PAGE files, or no PAGE files but multiple images, + or multiple files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not determine unique input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") + super().__init__(self.message) + +class MissingInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields no PAGE files, or no PAGE and no image files, + or no files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not find input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) class Processor(): @@ -352,7 +379,6 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc file_path=image_file_path) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) - # FIXME: what about non-PAGE output like JSON ??? self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, @@ -592,9 +618,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "Multiple '%s' matches for page '%s' in fileGrp '%s'." % ( - mimetype, file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, mimetype) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) elif (ift[i].mimetype == MIMETYPE_PAGE and @@ -602,9 +626,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pass # keep PAGE match elif (ift[i].mimetype == MIMETYPE_PAGE and file_.mimetype == MIMETYPE_PAGE): - raise ValueError( - "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen if on_error == 'skip': @@ -614,9 +636,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." 
% ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 2cf8a189b..5d565ea70 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -251,7 +251,7 @@ def ocrd_tool(self): assert ('foobar3', 'foobar4') in tuples tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): + with self.assertRaisesRegex(Exception, "Could not determine unique input file"): tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: @@ -260,7 +260,7 @@ def ocrd_tool(self): proc.workspace = ws proc.input_file_grp = 'GRP1,GRP2' proc.page_id = page_id - with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): + with self.assertRaisesRegex(Exception, "Could not determine unique input file"): tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): @@ -281,7 +281,7 @@ def ocrd_tool(self): proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err if __name__ == "__main__": main(__file__) From 95d36585bf7d97193e56b9143fde0416cd7b799b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:08:15 +0200 Subject: [PATCH 083/249] Processor.process_workspace: zip_input_files w/o require_first --- src/ocrd/processor/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 516989ae2..5becbf8d8 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -314,17 +314,19 @@ def process_workspace(self, workspace: Workspace) -> None: self.verify() try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) - for input_file_tuple in self.zip_input_files(on_error='abort'): + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): # FIXME: add error handling by catching exceptions in various ways (#579) # for example: # - ResourceNotFoundError → use ResourceManager to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) + log.info("processing page %s", + next(input_file.pageId + for input_file in input_file_tuple + if input_file)) for i, input_file in enumerate(input_file_tuple): - if i == 0: - log.info("processing page %s", input_file.pageId) - elif input_file is None: + if input_file is None: # file/page not found in this file grp continue input_files[i] = input_file From c7298411bafe74287e15646eaa3b9d20b90c2e65 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 09:33:37 +0200 Subject: [PATCH 084/249] Processor.zip_input_files: introduce MissingInputFile exception and config.OCRD_MISSING_INPUT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd/processor/base.py | 20 
+++++++++++++++----- src/ocrd_utils/config.py | 6 ++++++ tests/processor/test_processor.py | 2 +- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index bf262b0b9..418d7927a 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -31,6 +31,8 @@ \b {config.describe('OCRD_DOWNLOAD_INPUT')} \b +{config.describe('OCRD_MISSING_INPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 0ec074742..fddf6383a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -338,7 +338,7 @@ def process_workspace(self, workspace: Workspace) -> None: input_files[i] = self.workspace.download_file(input_file) except ValueError as e: log.error(repr(e)) - log.warning("skipping file %s for page %s", input_file, input_file.pageId) + log.warning(f"failed downloading file {input_file} for page {input_file.pageId}") self.process_page_file(*input_files) except NotImplementedError: # fall back to deprecated method @@ -611,10 +611,12 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + LOG.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}") # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen + LOG.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -633,6 +635,8 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen + LOG.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -644,7 +648,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: - LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) + LOG.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}") ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): @@ -653,8 +657,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: - # other fallback options? 
-                    LOG.error(f'Found no page {page} in file group {ifg}')
+                    # could be from non-unique with on_error=skip or from true gap
+                    LOG.error(f'Found no file for page {page} in file group {ifg}')
+                    if config.OCRD_MISSING_INPUT == 'ABORT':
+                        raise MissingInputFile(ifg, page, mimetype)
+            if not any(ifiles):
+                # must be from non-unique with on_error=skip
+                LOG.warning(f'Found no files for {page} - skipping')
+                continue
             if ifiles[0] or not require_first:
                 ifts.append(tuple(ifiles))
         return ifts
diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py
index 22a566e7b..11af20249 100644
--- a/src/ocrd_utils/config.py
+++ b/src/ocrd_utils/config.py
@@ -149,6 +149,12 @@ def _ocrd_download_timeout_parser(val):
     validator=_validator_boolean,
     parser=_parser_boolean)
 
+config.add("OCRD_MISSING_INPUT",
+    description="How to deal with missing input files (for some fileGrp/pageId) during processing [SKIP|ABORT]",
+    default=(True, 'SKIP'),
+    validator=lambda val: val in ['SKIP', 'ABORT'],
+    parser=str)
+
 config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING",
     description="Default address of Processing Server to connect to (for `ocrd network client processing`).",
     default=(True, ''))
diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index 5d565ea70..aa2124001 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -281,7 +281,7 @@ def ocrd_tool(self):
             proc.page_id = page_id
         assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
         r = self.capture_out_err()
-        assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err
+        assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err
 
 if __name__ == "__main__":
     main(__file__)

From 7df81af6a7a9b121569ac4288d079bd3dbd7f884 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer
Date: Tue, 20 Aug 2024 11:24:48 +0200
Subject: [PATCH 085/249] OcrdPage: clearer docstring

Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
---
 src/ocrd_models/ocrd_page.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py
index e649baace..b28777e72 100644
--- a/src/ocrd_models/ocrd_page.py
+++ b/src/ocrd_models/ocrd_page.py
@@ -178,7 +178,8 @@
 class OcrdPage():
     """
-    Proxy object for :py:class:`ocrd_models.PcGtsType` that also offers access
+    Proxy object for :py:class:`ocrd_models.PcGtsType` (i.e. PRImA PAGE-XML
+    for page content, rendered as object model by generateDS) that also offers access
     to the underlying etree, element-node mapping and reverse mapping, too (cf.
     :py:func:`ocrd_models.ocrd_page.parseEtree`)
     """
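A minimal sketch of what the OcrdPage proxy enables (illustrative only, not part
of the patches themselves; ``input_file`` is assumed to be a PAGE-XML ``OcrdFile``):

    from ocrd_modelfactory import page_from_file

    pcgts = page_from_file(input_file)   # now always returns an OcrdPage proxy
    page = pcgts.get_Page()              # attribute access delegated to the wrapped PcGtsType
    tree = pcgts.etree                   # parsed lxml tree of the same document, e.g. for XPath
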
From 0ab694201c7c6fe2d55113ca9a5158c6f6834387 Mon Sep 17 00:00:00 2001
From: kba
Date: Tue, 20 Aug 2024 12:19:36 +0200
Subject: [PATCH 086/249] jsonschema: switch from draft6 to draft2019-09

---
 src/ocrd_validators/json_validator.py          | 8 ++++----
 src/ocrd_validators/parameter_validator.py     | 4 ++--
 src/ocrd_validators/resource_list_validator.py | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py
index c920fc7c2..0edb73ed1 100644
--- a/src/ocrd_validators/json_validator.py
+++ b/src/ocrd_validators/json_validator.py
@@ -3,7 +3,7 @@
 """
 import json
 
-from jsonschema import Draft6Validator, validators # pylint: disable=import-error
+from jsonschema import Draft201909Validator, validators # pylint: disable=import-error
 
 from ocrd_models import ValidationReport
 
@@ -28,7 +28,7 @@ def set_defaults(validator, properties, instance, schema):
 
     return validators.extend(validator_class, {"properties": set_defaults})
 
-DefaultValidatingDraft6Validator = extend_with_default(Draft6Validator)
+DefaultValidatingDraft20199Validator = extend_with_default(Draft201909Validator)
 
 #
 # -------------------------------------------------
@@ -52,13 +52,13 @@ def validate(obj, schema):
             obj = json.loads(obj)
         return JsonValidator(schema)._validate(obj) # pylint: disable=protected-access
 
-    def __init__(self, schema, validator_class=Draft6Validator):
+    def __init__(self, schema, validator_class=Draft201909Validator):
         """
         Construct a JsonValidator.
 
         Args:
             schema (dict):
-            validator_class (Draft6Validator|DefaultValidatingDraft6Validator):
+            validator_class (Draft201909Validator|DefaultValidatingDraft20199Validator):
         """
         self.validator = validator_class(schema)
 
diff --git a/src/ocrd_validators/parameter_validator.py b/src/ocrd_validators/parameter_validator.py
index 20dd6ff2b..26364f70f 100644
--- a/src/ocrd_validators/parameter_validator.py
+++ b/src/ocrd_validators/parameter_validator.py
@@ -1,7 +1,7 @@
 """
 Validate parameters against ocrd-tool.json.
 """
-from .json_validator import JsonValidator, DefaultValidatingDraft6Validator
+from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator
 
 #
 # -------------------------------------------------
@@ -45,4 +45,4 @@ def __init__(self, ocrd_tool):
             "required": required,
             "additionalProperties": False,
             "properties": p
-        }, DefaultValidatingDraft6Validator)
+        }, DefaultValidatingDraft20199Validator)
diff --git a/src/ocrd_validators/resource_list_validator.py b/src/ocrd_validators/resource_list_validator.py
index 72a11c34d..d1a77b59b 100644
--- a/src/ocrd_validators/resource_list_validator.py
+++ b/src/ocrd_validators/resource_list_validator.py
@@ -4,7 +4,7 @@
 See `specs `_.
 """
 from .constants import RESOURCE_LIST_SCHEMA
-from .json_validator import JsonValidator, DefaultValidatingDraft6Validator
+from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator
 
 #
 # -------------------------------------------------
@@ -20,5 +20,5 @@ def validate(obj, schema=RESOURCE_LIST_SCHEMA):
         """
         Validate against ``resource_list.schema.yml`` schema.
""" - return JsonValidator(schema, validator_class=DefaultValidatingDraft6Validator)._validate(obj) + return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) From 66c50b38eab5521d82361b86ad64b7d5f652f198 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 14:27:31 +0200 Subject: [PATCH 087/249] require jsonschema>4 for draft 2019-09 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ed5fd56d5..3d053075a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ gdown httpx>=0.22.0 importlib_metadata ; python_version < '3.8' importlib_resources ; python_version < '3.10' -jsonschema +jsonschema > 4 lxml memory-profiler >= 0.58.0 # XXX explicitly do not restrict the numpy version because different From 94e2e60d933910c3088885ebb4ca006dc80c5246 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 16:59:14 +0200 Subject: [PATCH 088/249] OcrdToolValidator: set defaults, handle deprecated --- src/ocrd_validators/json_validator.py | 15 +++++++++++---- src/ocrd_validators/ocrd_tool_validator.py | 7 +++++-- tests/validator/test_json_validator.py | 4 ++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index 0edb73ed1..ccd27b92a 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -2,11 +2,15 @@ Validating JSON-Schema """ import json +from warnings import warn -from jsonschema import Draft201909Validator, validators # pylint: disable=import-error +from jsonschema import Draft201909Validator, ValidationError, validators # pylint: disable=import-error from ocrd_models import ValidationReport +class JsonSchemaDeprecationWarning(ValidationError): + pass + # http://python-jsonschema.readthedocs.io/en/latest/faq/ def extend_with_default(validator_class): """ @@ -14,18 +18,20 @@ def extend_with_default(validator_class): """ validate_properties = validator_class.VALIDATORS["properties"] - def set_defaults(validator, properties, instance, schema): + def set_defaults_and_handle_deprecate(validator, properties, instance, schema): """ Set defaults in subschemas """ for prop, subschema in properties.items(): if "default" in subschema: instance.setdefault(prop, subschema["default"]) + if subschema.get('deprecated', False): + yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.") for error in validate_properties(validator, properties, instance, schema): yield error - return validators.extend(validator_class, {"properties": set_defaults}) + return validators.extend(validator_class, {"properties": set_defaults_and_handle_deprecate}) DefaultValidatingDraft20199Validator = extend_with_default(Draft201909Validator) @@ -74,6 +80,7 @@ def _validate(self, obj): report = ValidationReport() if not self.validator.is_valid(obj): for v in self.validator.iter_errors(obj): + meth = f'add_{"warning" if isinstance(v, JsonSchemaDeprecationWarning) else "error"}' # print(">>>>>>>>> v='%s', obj='%s'" % (v, obj)) - report.add_error("[%s] %s" % ('.'.join(str(vv) for vv in v.path), v.message)) + getattr(report, meth)("[%s] %s" % ('.'.join(str(vv) for vv in v.path), v.message)) return report diff --git a/src/ocrd_validators/ocrd_tool_validator.py b/src/ocrd_validators/ocrd_tool_validator.py index b408bd86e..827001ef7 100644 --- a/src/ocrd_validators/ocrd_tool_validator.py +++ 
b/src/ocrd_validators/ocrd_tool_validator.py @@ -4,7 +4,7 @@ See `specs `_. """ from .constants import OCRD_TOOL_SCHEMA -from .json_validator import JsonValidator +from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator # # ------------------------------------------------- @@ -20,4 +20,7 @@ def validate(obj, schema=OCRD_TOOL_SCHEMA): """ Validate against ``ocrd-tool.json`` schema. """ - return JsonValidator.validate(obj, schema) + return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access + + def __init__(self, schema, validator_class=...): + super().__init__(schema, DefaultValidatingDraft20199Validator) diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py index 8a8387d4b..25771b701 100644 --- a/tests/validator/test_json_validator.py +++ b/tests/validator/test_json_validator.py @@ -1,5 +1,5 @@ from tests.base import TestCase, main -from ocrd_validators.json_validator import JsonValidator, DefaultValidatingDraft6Validator +from ocrd_validators.json_validator import JsonValidator, DefaultValidatingDraft20199Validator class TestParameterValidator(TestCase): @@ -15,7 +15,7 @@ def setUp(self): } } } - self.defaults_validator = JsonValidator(self.schema, DefaultValidatingDraft6Validator) + self.defaults_validator = JsonValidator(self.schema, DefaultValidatingDraft20199Validator) super().setUp() def test_validate_string(self): From 2e7bdc295dad859fbd1374db4f53bf097bc7d5ec Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 17:00:27 +0200 Subject: [PATCH 089/249] processor.base: validate/setdefault ocrd-tool.json on first access --- src/ocrd/processor/base.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677..8620881c7 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -20,6 +20,7 @@ import inspect import tarfile import io +from warnings import warn from deprecated import deprecated from ocrd.workspace import Workspace @@ -43,6 +44,7 @@ from ocrd_validators import ParameterValidator from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml from ocrd_modelfactory import page_from_file +from ocrd_validators.ocrd_tool_validator import OcrdToolValidator # XXX imports must remain for backwards-compatibility from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import @@ -71,15 +73,20 @@ class Processor(): """ @property - def metadata(self): + def metadata(self) -> dict: """the ocrd-tool.json dict of the package""" if hasattr(self, '_metadata'): return self._metadata self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json')) + report = OcrdToolValidator.validate(self._metadata) + if not report.is_valid: + # FIXME: remove when bertsky/core#10 is merged + self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') + self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.") return self._metadata @property - def version(self): + def version(self) -> str: """the version of the package""" if hasattr(self, '_version'): return self._version @@ -87,7 +94,7 @@ def version(self): return self._version @property - def executable(self): + def executable(self) -> str: """the executable name of this processor tool""" if hasattr(self, '_executable'): 
return self._executable @@ -95,7 +102,7 @@ def executable(self): return self._executable @property - def ocrd_tool(self): + def ocrd_tool(self) -> dict: """the ocrd-tool.json dict of this processor tool""" if hasattr(self, '_ocrd_tool'): return self._ocrd_tool From 346f166737bf8d90aeeecccf0075101ee333752a Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 17:00:48 +0200 Subject: [PATCH 090/249] update spec and ocrd_tool.schema.yml --- repo/spec | 2 +- requirements.txt | 2 +- src/ocrd_validators/ocrd_tool.schema.yml | 13 +++++++++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/repo/spec b/repo/spec index cb1ba2e72..df2a07e3f 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit cb1ba2e72bd176f1a1076eea38d6438c647e68e7 +Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 diff --git a/requirements.txt b/requirements.txt index 3d053075a..e78c18661 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ gdown httpx>=0.22.0 importlib_metadata ; python_version < '3.8' importlib_resources ; python_version < '3.10' -jsonschema > 4 +jsonschema>=4 lxml memory-profiler >= 0.58.0 # XXX explicitly do not restrict the numpy version because different diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml index 5de65a04e..bdf834b6a 100644 --- a/src/ocrd_validators/ocrd_tool.schema.yml +++ b/src/ocrd_validators/ocrd_tool.schema.yml @@ -108,6 +108,12 @@ properties: maximum: type: number description: Maximum value for number parameters, including the maximum + minProperties: + type: number + description: Minimum number of properties of an object + maxProperties: + type: number + description: Maximum number of properties of an object exclusiveMinimum: type: number description: Minimum value for number parameters, excluding the minimum @@ -121,8 +127,11 @@ properties: type: object description: Describe the properties of an object value additionalProperties: - type: boolean - description: Whether an object value may contain properties not explicitly defined + oneOf: + - type: boolean + description: Whether an object value may contain properties not explicitly defined + - type: object + description: Schema any additional properties need to adhere to required: type: boolean description: Whether this parameter is required From 577baa529103de170f6b1259ae6b161b281f475c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 00:06:14 +0200 Subject: [PATCH 091/249] processor parameter decorator: no '{}' default (unnecessary) --- src/ocrd/decorators/parameter_option.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 55abbc2a5..2f8be3d86 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -10,7 +10,7 @@ def _handle_param_option(ctx, param, value): parameter_option = option('-p', '--parameter', help="Parameters, either JSON string or path to JSON file", multiple=True, - default=['{}'], + default=[], # now handled in ocrd_cli_wrap_processor to resolve processor preset files # callback=_handle_param_option callback=lambda ctx, param, kv: list(kv)) From f00ecda84717d274126a736b67e3ab29e5bae83d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 00:07:01 +0200 Subject: [PATCH 092/249] =?UTF-8?q?Processor:=20add=20error=20handling?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit

- introduce `config.OCRD_MISSING_OUTPUT` and catch exceptions during
  `process_page_file`:
  - `ABORT`: re-raise
  - `SKIP`: ignore and continue with next page
  - `COPY`: ignore and provide input PAGE-XML as output (with just
    `@pcGtsId` and `Metadata` added to simulate the processing step)
- introduce `config.OCRD_EXISTING_OUTPUT`:
  - `ABORT`: re-raise FileExistsError
  - `SKIP`: ignore and continue with next page
  - `OVERWRITE`: force overwriting the exact output files (instead of
    removing output files indiscriminately)
- :fire: remove `Workspace.overwrite_mode`, have `--overwrite` merely
  delegate to `config.OCRD_EXISTING_OUTPUT=OVERWRITE`
- introduce `--debug`, just delegate to `config.OCRD_MISSING_OUTPUT=ABORT`
- `cli.bashlib.input-files`: delegate everything to `ocrd_cli_wrap_processor`
  (for CLI handling) and `process_workspace` (for error handling), but
  override `process_page_file` to (never fail and) print bash-friendly
  strings for actual processing
- update tests, add `test_processor.test_run_output_missing` covering all
  `OCRD_MISSING_OUTPUT` options and the newly introduced
  `OCRD_EXISTING_OUTPUT=SKIP`
---
 src/ocrd/cli/__init__.py                      |  4 +
 src/ocrd/cli/bashlib.py                       | 66 +++++++-------
 src/ocrd/decorators/__init__.py               | 22 +----
 src/ocrd/decorators/ocrd_cli_options.py       |  1 +
 src/ocrd/processor/base.py                    | 89 +++++++++++++++----
 src/ocrd/processor/builtin/dummy_processor.py | 13 +--
 src/ocrd/processor/helpers.py                 |  8 +-
 src/ocrd/workspace.py                         | 14 +--
 src/ocrd_utils/config.py                      | 12 +++
 tests/cli/test_bashlib.py                     |  2 +-
 tests/data/__init__.py                        | 39 +++++++-
 tests/processor/test_processor.py             | 37 +++++++-
 tests/test_workspace.py                       | 23 +++--
 13 files changed, 224 insertions(+), 106 deletions(-)

diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py
index 418d7927a..3722e3c21 100644
--- a/src/ocrd/cli/__init__.py
+++ b/src/ocrd/cli/__init__.py
@@ -33,6 +33,10 @@
 \b
 {config.describe('OCRD_MISSING_INPUT')}
 \b
+{config.describe('OCRD_MISSING_OUTPUT')}
+\b
+{config.describe('OCRD_EXISTING_OUTPUT')}
+\b
 {config.describe('OCRD_METS_CACHING')}
 \b
 {config.describe('OCRD_MAX_PROCESSOR_CACHE')}

diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py
index 2c57bb412..26139cb48 100644
--- a/src/ocrd/cli/bashlib.py
+++ b/src/ocrd/cli/bashlib.py
@@ -20,13 +20,16 @@
 from ocrd.decorators import (
     parameter_option,
     parameter_override_option,
-    ocrd_loglevel
+    ocrd_loglevel,
+    ocrd_cli_wrap_processor
 )
 from ocrd_utils import (
     is_local_filename,
     get_local_filename,
     initLogging,
-    make_file_id
+    getLogger,
+    make_file_id,
+    config
 )
 from ocrd.resolver import Resolver
 from ocrd.processor import Processor
@@ -81,11 +84,15 @@ def bashlib_constants(name):
 @bashlib_cli.command('input-files')
 @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME)
 @click.option('-w', '--working-dir', help="Working Directory")
-@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT')
-@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT')
+@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None)
+@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None)
 # repeat some other processor options for convenience (will be ignored here)
 @click.option('-g', '--page-id', help="ID(s) of the pages to process")
-@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist")
+@click.option('--overwrite',
is_flag=True, default=False, help="Remove output pages/images if they already exist\n" + "(with '--page-id', remove only those).\n" + "Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE") +@click.option('--debug', is_flag=True, default=False, help="Abort on any errors with full stack trace.\n" + "Short-hand for OCRD_MISSING_OUTPUT=ABORT") @parameter_option @parameter_override_option @ocrd_loglevel @@ -100,37 +107,26 @@ def bashlib_input_files(**kwargs): (The printing format is one associative array initializer per line.) """ - initLogging() - mets = kwargs.pop('mets') - working_dir = kwargs.pop('working_dir') - if is_local_filename(mets) and not isfile(get_local_filename(mets)): - msg = "File does not exist: %s" % mets - raise FileNotFoundError(msg) - resolver = Resolver() - workspace = resolver.workspace_from_url(mets, working_dir) class BashlibProcessor(Processor): @property def ocrd_tool(self): - return {} + return {'executable': '', 'steps': ['']} @property - def executable(self): - return '' - processor = BashlibProcessor(None) - # go half way of the normal run_processor / process_workspace call tree - processor.workspace = workspace - processor.page_id = kwargs['page_id'] - processor.input_file_grp = kwargs['input_file_grp'] - processor.output_file_grp = kwargs['output_file_grp'] - for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): - # ensure all input files exist locally (without persisting them in the METS) - # - this mimics the default behaviour of all Pythonic processors - input_files = [workspace.download_file(input_file) if input_file else None - for input_file in input_files] - for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: - # make this bash-friendly (show initialization for associative array) - if len(input_files) > 1: - # single quotes allow us to preserve the list value inside the alist - print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ') - else: - print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ') - print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp'])) + def version(self): + return '1.0' + # go half way of the normal run_processor / process_workspace call tree + # by just delegating to process_workspace, overriding process_page_file + # to ensure all input files exist locally (without persisting them in the METS) + # and print what needs to be acted on in bash-friendly way + def process_page_file(self, *input_files): + for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: + # make this bash-friendly (show initialization for associative array) + if len(input_files) > 1: + # single quotes allow us to preserve the list value inside the alist + value = ' '.join(str(getattr(res, field)) for res in input_files) + else: + value = str(getattr(input_files[0], field)) + print(f"[{field}]='{value}'", end=' ') + output_file_id = make_file_id(input_files[0], kwargs['output_file_grp']) + print(f"[outputFileId]='{output_file_id}'") + ocrd_cli_wrap_processor(BashlibProcessor, **kwargs) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index d9d1fb69d..364ef4c84 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + debug=False, resolve_resource=None, show_resource=None, list_resources=False, @@ -117,25 +118,10 @@ def resolve(name): resolver.resolve_mets_arguments(working_dir, 
mets, None, mets_server_url) workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url) page_id = kwargs.get('page_id') - # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505 - # if overwrite - # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: - # raise Exception("--overwrite requires --output-file-grp") - # LOG.info("Removing files because of --overwrite") - # for grp in kwargs['output_file_grp'].split(','): - # if page_id: - # for one_page_id in kwargs['page_id'].split(','): - # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) - # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): - # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) - # else: - # LOG.debug("Removing all files in output file group %s ", grp) - # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) - # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) - # workspace.save_mets() - # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace if overwrite: - workspace.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + if debug: + config.OCRD_MISSING_OUTPUT = 'ABORT' report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e640a2003..e069b3ea8 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -33,6 +33,7 @@ def cli(mets_url): option('-O', '--output-file-grp', default=None), option('-g', '--page-id'), option('--overwrite', is_flag=True, default=False), + option('--debug', is_flag=True, default=False), option('--profile', is_flag=True, default=False), option('--profile-file', type=Path(dir_okay=False, writable=True)), parameter_option, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index fddf6383a..0ec2711f6 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -21,6 +21,7 @@ import tarfile import io from deprecated import deprecated +from requests import HTTPError from ocrd.workspace import Workspace from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile @@ -317,16 +318,11 @@ def process_workspace(self, workspace: Workspace) -> None: try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - # FIXME: add error handling by catching exceptions in various ways (#579) - # for example: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) - log.info("processing page %s", - next(input_file.pageId - for input_file in input_file_tuple - if input_file)) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + log.info(f"processing page {page_id}") for i, input_file in enumerate(input_file_tuple): if input_file is None: # 
file/page not found in this file grp @@ -336,14 +332,71 @@ def process_workspace(self, workspace: Workspace) -> None: continue try: input_files[i] = self.workspace.download_file(input_file) - except ValueError as e: + except (ValueError, FileNotFoundError, HTTPError) as e: log.error(repr(e)) - log.warning(f"failed downloading file {input_file} for page {input_file.pageId}") - self.process_page_file(*input_files) + log.warning(f"failed downloading file {input_file} for page {page_id}") + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self.process_page_file(*input_files) + except Exception as err: + # we have to be broad here, but want to exclude NotImplementedError + if isinstance(err, NotImplementedError): + raise err + if isinstance(err, FileExistsError): + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + continue + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # FIXME: re-usable/actionable logging + log.exception(f"Failure on page {page_id}: {err}") + if config.OCRD_MISSING_OUTPUT == 'ABORT': + raise err + if config.OCRD_MISSING_OUTPUT == 'SKIP': + continue + if config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") except NotImplementedError: # fall back to deprecated method self.process() + def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: + """ + Copy the given ``input_file`` of the :py:attr:`workspace`, + representing one physical page (passed as one opened + :py:class:`~ocrd_models.OcrdFile` per input fileGrp) + and add it as if it was a processing result. 
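+
+        (This is the fallback used for ``OCRD_MISSING_OUTPUT=COPY``: the input
+        PAGE is re-added under the output fileGrp, with only ``pcGtsId`` and a
+        ``Metadata`` item updated, to simulate the processing step.)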
+ """ + log = getLogger('ocrd.processor.base') + input_pcgts : OcrdPage + assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) + log.debug(f"parsing file {input_file.ID} for page {input_file.pageId}") + try: + input_pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + log.error(f"non-PAGE input for page {input_file.pageId}: {err}") + return + output_file_id = make_file_id(input_file, self.output_file_grp) + input_pcgts.set_pcGtsId(output_file_id) + self.add_metadata(input_pcgts) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(input_pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, @@ -366,9 +419,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_ = page_from_file(input_file) assert isinstance(page_, OcrdPage) input_pcgts[i] = page_ - except ValueError as e: + except ValueError as err: # not PAGE and not an image to generate PAGE for - log.info("non-PAGE input for page %s: %s", page_id, e) + log.error("non-PAGE input for page %s: %s", page_id, err) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in result.images: @@ -380,7 +433,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc image_file_id, self.output_file_grp, page_id=page_id, - file_path=image_file_path) + file_path=image_file_path, + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) self.workspace.add_file(file_id=output_file_id, @@ -388,7 +443,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(result.pcgts)) + content=to_xml(result.pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 1b3f7a5aa..f8890274a 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -15,7 +15,8 @@ MIME_TO_EXT, MIMETYPE_PAGE, parse_json_string_with_comments, - resource_string + resource_string, + config ) from ocrd_modelfactory import page_from_file @@ -43,14 +44,15 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr local_filename = join(self.output_file_grp, file_id + ext) LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) with open(input_file.local_filename, 'rb') as f: - content = f.read() output_file = self.workspace.add_file( file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, - content=content) + content=f.read(), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) assert isinstance(pcgts, 
OcrdPage) @@ -63,8 +65,9 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr page_id=input_file.pageId, local_filename=join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - + content=to_xml(pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) else: if self.parameter['copy_files']: LOG.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index dff14cfca..08ca0a468 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -160,6 +160,7 @@ def run_cli( workspace=None, page_id=None, overwrite=None, + debug=None, log_level=None, log_filename=None, input_file_grp=None, @@ -202,6 +203,8 @@ def run_cli( args += ['--parameter', parameter] if overwrite: args += ['--overwrite'] + if debug: + args += ['--debug'] if mets_server_url: args += ['--mets-server-url', mets_server_url] log = getLogger('ocrd.processor.helpers.run_cli') @@ -270,7 +273,10 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) -O, --output-file-grp USE File group(s) used as output -g, --page-id ID Physical page ID(s) to process instead of full document [] --overwrite Remove existing output pages/images - (with "--page-id", remove only those) + (with "--page-id", remove only those). + Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE + --debug Abort on any errors with full stack trace. + Short-hand for OCRD_MISSING_OUTPUT=ABORT --profile Enable profiling --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile" -p, --parameter JSON-PATH Parameters, either verbatim JSON string diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 509b8123b..2f94913ed 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -42,7 +42,8 @@ MIME_TO_EXT, MIME_TO_PIL, MIMETYPE_PAGE, - REGEX_PREFIX + REGEX_PREFIX, + config ) from .workspace_backup import WorkspaceBackupManager @@ -75,7 +76,6 @@ class Workspace(): `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to the filesystem directly. baseurl (string, None) : Base URL to prefix to relative URL. - overwrite_mode (boolean, False) : Whether to force add operations on this workspace globally """ def __init__( @@ -91,7 +91,6 @@ def __init__( self.resolver = resolver self.directory = directory self.mets_target = str(Path(directory, mets_basename)) - self.overwrite_mode = False self.is_remote = bool(mets_server_url) if mets is None: if self.is_remote: @@ -243,8 +242,6 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals """ log = getLogger('ocrd.workspace.remove_file') log.debug('Deleting mets:file %s', file_id) - if self.overwrite_mode: - force = True if isinstance(file_id, OcrdFile): file_id = file_id.ID try: @@ -296,9 +293,6 @@ def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is `True`. 
""" - if not force and self.overwrite_mode: - force = True - if (not USE.startswith(REGEX_PREFIX)) and (USE not in self.mets.file_groups) and (not force): raise Exception("No such fileGrp: %s" % USE) @@ -419,8 +413,6 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.") if content is not None and not kwargs.get('local_filename'): raise Exception("'content' was set but no 'local_filename'") - if self.overwrite_mode: - kwargs['force'] = True with pushd_popd(self.directory): if kwargs.get('local_filename'): @@ -1101,8 +1093,6 @@ def save_image_file(self, image : Image.Image, The (absolute) path of the created file. """ log = getLogger('ocrd.workspace.save_image_file') - if self.overwrite_mode: - force = True saveargs = dict() if 'dpi' in image.info: saveargs['dpi'] = image.info['dpi'] diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 11af20249..fa4c34d63 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -155,6 +155,18 @@ def _ocrd_download_timeout_parser(val): validator=lambda val: val in ['SKIP', 'ABORT'], parser=str) +config.add("OCRD_MISSING_OUTPUT", + description="How to deal with missing output files (for some fileGrp/pageId) during processing [SKIP|COPY|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], + parser=str) + +config.add("OCRD_EXISTING_OUTPUT", + description="How to deal with already existing output files (for some fileGrp/pageId) during processing [SKIP|OVERWRITE|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], + parser=str) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index 15af49350..b1ab68c7f 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -98,7 +98,7 @@ def test_constants_fail(self): def test_input_files(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): - _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG']) + _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG', '-O', 'OUTPUT']) assert ("[url]='' [local_filename]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " "[pageId]='PHYS_0017' [outputFileId]='OUTPUT_PHYS_0017'") in out diff --git a/tests/data/__init__.py b/tests/data/__init__.py index e7ef30fc2..53fa227d0 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,8 +1,9 @@ import json import os +import re from pytest import warns from ocrd import Processor -from ocrd_utils import make_file_id +from ocrd_utils import make_file_id, config DUMMY_TOOL = { 'executable': 'ocrd-test', @@ -94,7 +95,41 @@ def process(self): page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), - content='CONTENT') + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + +class DummyProcessorWithOutputFailures(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, **kwargs) 
+ + # no error handling with old process(), so override new API + def process_page_file(self, input_file): + n = int(re.findall(r'\d+', input_file.pageId)[-1]) + if n % 2: + raise Exception(f"intermittent failure on page {input_file.pageId}") + output_file_id = make_file_id(input_file, self.output_file_grp) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id), + mimetype=input_file.mimetype, + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) class IncompleteProcessor(Processor): @property diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index aa2124001..064142574 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -5,7 +5,13 @@ from pathlib import Path from os import environ from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module -from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor +from tests.data import ( + DummyProcessor, + DummyProcessorWithRequiredParameters, + DummyProcessorWithOutput, + DummyProcessorWithOutputFailures, + IncompleteProcessor +) from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver @@ -145,20 +151,43 @@ def test_run_output0(self): output_file_grp="OCR-D-OUT") assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2 + def test_run_output_missing(self): + ws = self.workspace + from ocrd_utils import config + config.OCRD_MISSING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + # only half succeed + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) // 2 + config.OCRD_MISSING_OUTPUT = 'ABORT' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert "intermittent" in str(exc.value) + config.OCRD_MISSING_OUTPUT = 'COPY' + config.OCRD_EXISTING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') - ws.overwrite_mode = True + from ocrd_utils import config + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') - ws.overwrite_mode = False + config.OCRD_EXISTING_OUTPUT = 'ABORT' with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" - ws.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") diff --git 
a/tests/test_workspace.py b/tests/test_workspace.py index 2fe5f450a..1ae007ae5 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -270,9 +270,9 @@ def test_remove_file_force(sbb_data_workspace): # TODO check semantics - can a non-existent thing be removed? assert not sbb_data_workspace.remove_file('non-existing-id', force=True) - # should also succeed - sbb_data_workspace.overwrite_mode = True - assert not sbb_data_workspace.remove_file('non-existing-id', force=False) + with pytest.raises(FileNotFoundError) as not_found_exc: + sbb_data_workspace.remove_file('non-existing-id', force=False) + assert "not found in METS" in str(not_found_exc.value) def test_remove_file_remote_not_available_raises_exception(plain_workspace): @@ -292,9 +292,9 @@ def test_remove_file_remote(plain_workspace): assert plain_workspace.remove_file('page1_img', force=True) # TODO check returned value - # should also "succeed", because overwrite_mode is set which also sets 'force' to 'True' - plain_workspace.overwrite_mode = True - assert not plain_workspace.remove_file('page1_img') + with pytest.raises(FileNotFoundError) as not_found_exc: + plain_workspace.remove_file('page1_img') + assert "not found in METS" in str(not_found_exc.value) def test_rename_file_group(tmp_path): @@ -341,9 +341,6 @@ def test_remove_file_group_force(sbb_data_workspace): # check function and tests semantics # should succeed assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=True) - # should also succeed - sbb_data_workspace.overwrite_mode = True - assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=False) def test_remove_file_group_rmdir(sbb_data_tmp, sbb_data_workspace): @@ -432,9 +429,11 @@ def test_save_image_file(plain_workspace): assert exists(join(plain_workspace.directory, 'IMG', 'page1_img.jpg')) # should succeed assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg', force=True) - # should also succeed - plain_workspace.overwrite_mode = True - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + # should fail + with pytest.raises(FileExistsError) as exists_exc: + plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + assert "neither force nor ignore are set" in str(exists_exc.value) + # check file_path kwarg assert plain_workspace.save_image_file(img, 'page1_img2', 'IMG', page_id='page1', file_path='IMG/page1_img2.png') assert exists(join(plain_workspace.directory, 'IMG', 'page1_img2.png')) From fdd5d168d0753caad8a19efd49884c52d7934183 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 18:01:31 +0200 Subject: [PATCH 093/249] ocrd_utils.config: add variables to module docstring --- src/ocrd_utils/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index fa4c34d63..28f95b216 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -62,7 +62,11 @@ def __init__(self): self._variables = {} def add(self, name, *args, **kwargs): - self._variables[name] = OcrdEnvVariable(name, *args, **kwargs) + var = OcrdEnvVariable(name, *args, **kwargs) + # make visible in ocrd_utils.config docstring (apidoc) + txt = var.describe(wrap_text=False, indent_text=True) + globals()['__doc__'] += "\n\n - " + txt + "\n\n" + self._variables[name] = var return self._variables[name] def has_default(self, name): From 
6d87f9e6494a0768541e5b18ae557fd594d8319b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 18:02:36 +0200 Subject: [PATCH 094/249] improve docstrings, re-generate docs --- .../ocrd/ocrd.processor.ocrd_page_result.rst | 7 ++ docs/api/ocrd/ocrd.processor.rst | 1 + src/ocrd/cli/validate.py | 4 +- src/ocrd/cli/workspace.py | 1 + src/ocrd/processor/base.py | 75 ++++++++++--------- src/ocrd/workspace.py | 2 - src/ocrd_models/ocrd_exif.py | 1 + src/ocrd_models/ocrd_mets.py | 6 +- src/ocrd_utils/config.py | 28 ++++++- 9 files changed, 80 insertions(+), 45 deletions(-) create mode 100644 docs/api/ocrd/ocrd.processor.ocrd_page_result.rst diff --git a/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst new file mode 100644 index 000000000..e13d50e15 --- /dev/null +++ b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst @@ -0,0 +1,7 @@ +ocrd.processor.ocrd\_page\_result module +======================================== + +.. automodule:: ocrd.processor.ocrd_page_result + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd/ocrd.processor.rst b/docs/api/ocrd/ocrd.processor.rst index 801114d2a..7507d8439 100644 --- a/docs/api/ocrd/ocrd.processor.rst +++ b/docs/api/ocrd/ocrd.processor.rst @@ -22,3 +22,4 @@ Submodules ocrd.processor.base ocrd.processor.helpers + ocrd.processor.ocrd_page_result diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index b26803d05..61d26988a 100644 --- a/src/ocrd/cli/validate.py +++ b/src/ocrd/cli/validate.py @@ -40,7 +40,7 @@ def validate_cli(): @click.argument('ocrd_tool', required=False, nargs=1) def validate_ocrd_tool(ocrd_tool): ''' - Validate OCRD_TOOL as an ocrd-tool.json file. + Validate OCRD_TOOL as an `ocrd-tool.json` file. ''' if not ocrd_tool: ocrd_tool = 'ocrd-tool.json' @@ -107,7 +107,7 @@ def validate_page(page, **kwargs): @click.argument('tasks', nargs=-1, required=True) def validate_process(tasks, workspace, mets_basename, overwrite, page_id): ''' - Validate a sequence of tasks passable to 'ocrd process' + Validate a sequence of tasks passable to `ocrd process` ''' if workspace: _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 0c70fd3a3..e2186a727 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -308,6 +308,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\ } | ocrd workspace bulk-add -r '(?P.*) (?P.*) (?P.*) (?P.*)' \\ -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' - + """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name workspace = Workspace( diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 0ec2711f6..d53c3da0b 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -144,22 +144,21 @@ def __init__( version=None ): """ - Instantiate, but do not process. Unless ``list_resources`` or - ``show_resource`` or ``show_help`` or ``show_version`` or - ``dump_json`` or ``dump_module_dir`` is true, setup for processing - (parsing and validating parameters, entering the workspace directory). + Instantiate, but do not setup (neither for processing nor other usage). + If given, do parse and validate :py:data:`.parameter`. Args: workspace (:py:class:`~ocrd.Workspace`): The workspace to process. 
\ + If not ``None``, then `chdir` to that directory. Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. Keyword Args: parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. - input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. \ + input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input. \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. - output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. \ + output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output. \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ @@ -287,29 +286,32 @@ def setup(self) -> None: """ pass - @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') + @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()') def process(self) -> None: """ - Process all files of the :py:attr:`workspace` - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` (or all pages) - under the given :py:attr:`parameter`. + Process all files of the :py:data:`workspace` + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. - (This contains the main functionality and needs to be overridden by subclasses.) + (This contains the main functionality and needs to be + overridden by subclasses.) """ raise NotImplementedError() def process_workspace(self, workspace: Workspace) -> None: """ Process all files of the given ``workspace``, - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` (or all pages) - under the given :py:attr:`parameter`. + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. (This will iterate over pages and files, calling - :py:meth:`process_page`, handling exceptions.) + :py:meth:`.process_page_file` and handling exceptions. + It should be overridden by subclasses to handle cases + like post-processing or computation across pages.) """ log = getLogger('ocrd.processor.base') with pushd_popd(workspace.directory): @@ -370,7 +372,7 @@ def process_workspace(self, workspace: Workspace) -> None: def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: """ - Copy the given ``input_file`` of the :py:attr:`workspace`, + Copy the given ``input_file`` of the :py:data:`workspace`, representing one physical page (passed as one opened :py:class:`~ocrd_models.OcrdFile` per input fileGrp) and add it as if it was a processing result. 
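The next hunks document the contract of the new process-page API. As a rough,
hypothetical sketch (not from this patch set: `MyBinarizer` and the `.IMG-BIN`
suffix are made up, and `convert('1')` merely stands in for an actual
binarization algorithm), an implementing subclass could look like this:

    from ocrd import Processor
    from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage
    from ocrd_models.ocrd_page import AlternativeImageType

    class MyBinarizer(Processor):
        def process_page_pcgts(self, *input_pcgts, page_id=None):
            pcgts = input_pcgts[0]
            result = OcrdPageResult(pcgts)
            page = pcgts.get_Page()
            # derive the page image in its current state of processing
            page_image, page_coords, _ = self.workspace.image_from_page(page, page_id)
            bin_image = page_image.convert('1')  # stand-in for real binarization
            # reference the new image from the PAGE result ...
            alternative = AlternativeImageType(comments=page_coords['features'] + ',binarized')
            page.add_AlternativeImage(alternative)
            # ... and queue it for saving: file_id_suffix yields the file ID,
            # the base class then sets the filename on alternative_image
            result.images.append(OcrdPageResultImage(bin_image, '.IMG-BIN', alternative))
            return result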
@@ -399,14 +401,14 @@ def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> N
 
     def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
         """
-        Process the given ``input_files`` of the :py:attr:`workspace`,
+        Process the given ``input_files`` of the :py:data:`workspace`,
         representing one physical page (passed as one opened
-        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
-        under the given :py:attr:`parameter`, and make sure the
+        :py:class:`.OcrdFile` per input fileGrp)
+        under the given :py:data:`.parameter`, and make sure the
         results get added accordingly.
 
-        (This uses process_page_pcgts, but can be overridden by subclasses
-        to handle cases like multiple fileGrps, non-PAGE input etc.)
+        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
+        to handle cases like multiple output fileGrps, non-PAGE input etc.)
         """
         log = getLogger('ocrd.processor.base')
         input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
@@ -449,28 +451,28 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc
 
     def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
         """
-        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
+        Process the given ``input_pcgts`` of the :py:data:`.workspace`,
         representing one physical page (passed as one parsed
-        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
-        under the given :py:attr:`parameter`, and return the
-        resulting :py:class:`~ocrd.processor.OcrdPageResult`.
+        :py:class:`.OcrdPage` per input fileGrp)
+        under the given :py:data:`.parameter`, and return the
+        resulting :py:class:`.OcrdPageResult`.
 
         Optionally, add to the ``images`` attribute of the resulting
-        :py:class:`~ocrd.processor.OcrdPageResult` instances
-        of :py:class:`~ocrd.processor.OcrdPageResultImage`,
+        :py:class:`.OcrdPageResult` instances of :py:class:`.OcrdPageResultImage`,
         which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
         ``file_id_suffix`` (used for generating IDs of the saved image) and
         ``alternative_image`` (reference of the
         :py:class:`ocrd_models.ocrd_page.AlternativeImageType` for setting
         the filename of the saved image).
 
-        (This contains the main functionality and must be overridden by subclasses.)
+        (This contains the main functionality and must be overridden by subclasses,
+        unless it does not get called by some overriding :py:meth:`.process_page_file`.)
         """
         raise NotImplementedError()
 
     def add_metadata(self, pcgts: OcrdPage) -> None:
         """
         Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
-        the processing step and runtime parameters to :py:class:`~ocrd_models.OcrdPage` ``pcgts``.
+        the processing step and runtime parameters to :py:class:`.OcrdPage` ``pcgts``.
         """
         metadata_obj = pcgts.get_Metadata()
         assert metadata_obj is not None
@@ -496,7 +498,7 @@ def add_metadata(self, pcgts: OcrdPage) -> None:
     def resolve_resource(self, val):
         """
         Resolve a resource name to an absolute file path with the algorithm in
-        https://ocr-d.de/en/spec/ocrd_tool#file-parameters
+        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_
 
         Args:
             val (string): resource value to resolve
@@ -522,7 +524,7 @@ def resolve_resource(self, val):
     def show_resource(self, val):
         """
         Resolve a resource name to a file path with the algorithm in
-        https://ocr-d.de/en/spec/ocrd_tool#file-parameters,
+        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_,
         then print its contents to stdout.
Args: @@ -593,7 +595,8 @@ def input_files(self): files for that page) - Otherwise raise an error (complaining that only PAGE-XML warrants having multiple images for a single page) - Algorithm _ + + See `algorithm `_ Returns: A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects. @@ -635,11 +638,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): - if ``last``, then the last matching file for the page will be silently selected (as if the last was the only match) - if ``abort``, then an exception will be raised. + Multiple matches for PAGE-XML will always raise an exception. Keyword Args: require_first (boolean): If true, then skip a page entirely whenever it is not available in the first input `fileGrp`. + on_error (string): How to handle multiple file matches per page. mimetype (string): If not `None`, filter by the specified MIME type (literal or regex prefixed by `//`). Otherwise prefer PAGE or image. diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 2f94913ed..3523d9f15 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -606,7 +606,6 @@ def image_from_page(self, page, page_id, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to ``fill``: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible @@ -850,7 +849,6 @@ def image_from_segment(self, segment, parent_image, parent_coords, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to `fill`: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 406e60a85..82b8b7e1c 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -21,6 +21,7 @@ class OcrdExif(): * ``RGB`` for 24-bit truecolor, * ``I`` for 32-bit signed integer grayscale, * ``F`` for floating-point grayscale + (see PIL concept **mode**) resolution (int): pixel density xResolution (int): pixel density diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index d6da3e1cd..4d1e6cba5 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -194,7 +194,7 @@ def unique_identifier(self, purl : str) -> None: @property def agents(self) -> List[OcrdAgent]: """ - List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent`s + List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent` entries. """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] @@ -218,7 +218,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: @property def file_groups(self) -> List[str]: """ - List the `@USE` of all `mets:fileGrp` entries. + List the ``@USE`` of all ``mets:fileGrp`` entries. """ # WARNING: Actually we cannot return strings in place of elements! @@ -894,7 +894,7 @@ def merge(self, other_mets, force : bool = False, Add all files from other_mets. 
Accepts the same kwargs as :py:func:`find_files` Keyword Args: - force (boolean): Whether to :py:meth:`add_file`s with force (overwriting existing ``mets:file``s) + force (boolean): Whether to do :py:meth:`add_file` with ``force`` (overwriting existing ``mets:file`` entries) fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 28f95b216..851fb42a8 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -120,9 +120,11 @@ def raw_value(self, name): description="""\ Whether to enable gathering runtime statistics on the `ocrd.profile` logger (comma-separated): + - `CPU`: yields CPU and wall-time, - `RSS`: also yields peak memory (resident set size) - `PSS`: also yields peak memory (proportional set size) + """, validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')), default=(True, '')) @@ -154,19 +156,39 @@ def _ocrd_download_timeout_parser(val): parser=_parser_boolean) config.add("OCRD_MISSING_INPUT", - description="How to deal with missing input files (for some fileGrp/pageId) during processing [SKIP|ABORT]", + description="""\ +How to deal with missing input files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed with next page's input + - `ABORT`: throw :py:class:`.MissingInputFile` + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'ABORT'], parser=str) config.add("OCRD_MISSING_OUTPUT", - description="How to deal with missing output files (for some fileGrp/pageId) during processing [SKIP|COPY|ABORT]", + description="""\ +How to deal with missing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `COPY`: fall back to copying input PAGE to output fileGrp for page + - `ABORT`: re-throw whatever caused processing to fail + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], parser=str) config.add("OCRD_EXISTING_OUTPUT", - description="How to deal with already existing output files (for some fileGrp/pageId) during processing [SKIP|OVERWRITE|ABORT]", + description="""\ +How to deal with already existing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `OVERWRITE`: force writing result to output fileGrp for page + - `ABORT`: re-throw :py:class:`FileExistsError` + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], parser=str) From 9942bbe6dc42246c0a7e6eda85444aa0f745face Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:05:38 +0200 Subject: [PATCH 095/249] Processor.zip_input_files: more verbose log msg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d53c3da0b..55b461942 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -714,7 +714,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): - LOG.critical(f"Could not find any files for 
selected pageId {self.page_id}")
+                LOG.critical(f"Could not find any files for selected pageId {self.page_id}.\ncompare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
         ifts = list()
         for page, ifiles in pages.items():
             for i, ifg in enumerate(ifgs):

From 8a584e9dcc5794baa9e08556a943bc7e9eb9991f Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
Date: Wed, 21 Aug 2024 18:07:05 +0200
Subject: [PATCH 096/249] test_processor: test for specific exception

Co-authored-by: Konstantin Baierer
---
 tests/processor/test_processor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index 064142574..c263d99fc 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -280,7 +280,7 @@ def ocrd_tool(self):
         assert ('foobar3', 'foobar4') in tuples
         tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
         assert ('foobar3', None) in tuples
-        with self.assertRaisesRegex(Exception, "Could not determine unique input file"):
+        with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"):
             tuples = proc.zip_input_files(on_error='abort')
         ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001')
         for page_id in [None, 'phys_0001,phys_0002']:
             disableLogging()
             proc = DummyProcessor(None)
             proc.workspace = ws
             proc.input_file_grp = 'GRP1,GRP2'
             proc.page_id = page_id
-            with self.assertRaisesRegex(Exception, "Could not determine unique input file"):
+            with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"):
                 tuples = proc.zip_input_files()

     def test_zip_input_files_require_first(self):

From 8077d45056c9d2682bee5bb5017f79eb0a7b336a Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 21 Aug 2024 19:54:26 +0200
Subject: [PATCH 097/249] test_processor: fix missing import

---
 tests/processor/test_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index c263d99fc..0cbae7d54 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -15,7 +15,7 @@
 from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging

 from ocrd.resolver import Resolver
-from ocrd.processor.base import Processor, run_processor, run_cli
+from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile

 from unittest import mock
 import pytest

From cf7b193fe52477448897a0877188e188ba3f2f9a Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 21 Aug 2024 20:24:08 +0200
Subject: [PATCH 098/249] OcrdPage: fix typing typo

---
 src/ocrd_models/ocrd_page.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py
index b28777e72..87e644fd9 100644
--- a/src/ocrd_models/ocrd_page.py
+++ b/src/ocrd_models/ocrd_page.py
@@ -2,6 +2,7 @@
 API to PAGE-XML, generated with generateDS from XML schema.
""" from io import StringIO +from typing import Dict from inspect import getmembers from lxml import etree as ET @@ -187,8 +188,8 @@ def __init__( self, pcgts : PcGtsType, etree : ET._Element, - mapping : dict[str, ET._Element], - revmap : dict[ET._Element, str], + mapping : Dict[str, ET._Element], + revmap : Dict[ET._Element, str], ): self._pcgts = pcgts self.etree = etree From 9af8670dbbcdd06addde68a29c9b7a91f7f1a0c9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 20:40:40 +0200 Subject: [PATCH 099/249] dummy_processor: fix typos from logging --- src/ocrd/processor/builtin/dummy_processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 1e3f52ebe..c2f0eec4f 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -41,7 +41,7 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr file_id = make_file_id(input_file, self.output_file_grp) ext = MIME_TO_EXT.get(input_file.mimetype, '') local_filename = join(self.output_file_grp, file_id + ext) - LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) + self.logger.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) with open(input_file.local_filename, 'rb') as f: output_file = self.workspace.add_file( file_id=file_id, @@ -69,9 +69,9 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr ) else: if self.parameter['copy_files']: - LOG.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) + self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) else: - LOG.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename) + self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename) # we can rely on base implementation verbatim super().process_page_file(input_file) From c6d9736b1ecbb2041c7686873568c50f09360fe6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:28:29 +0200 Subject: [PATCH 100/249] tests report.is_valid: improve output on failure --- tests/cli/test_validate.py | 23 +++++++++---------- tests/validator/test_json_validator.py | 6 ++--- tests/validator/test_ocrd_tool_validator.py | 4 ++-- tests/validator/test_parameter_validator.py | 2 +- .../validator/test_resource_list_validator.py | 3 +-- tests/validator/test_xsd_validator.py | 8 +++---- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index 0682ea7a0..12e87f4dc 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, out + err) # relative path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, out + err) # default 
path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, out + err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, out + err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,19 +84,18 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, err = self.invoke_cli(validate_cli, ['tasks', + code, out, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) - print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) if __name__ == '__main__': diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py index 25771b701..d81c894f9 100644 --- a/tests/validator/test_json_validator.py +++ b/tests/validator/test_json_validator.py @@ -20,18 +20,18 @@ def setUp(self): def test_validate_string(self): report = JsonValidator.validate('{}', {}) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_defaults_set(self): obj = {'bar': 2000} report = self.defaults_validator._validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'foo': 3000, 'bar': 2000}) def test_properr(self): obj = {'bar': 100, 'quux': {}} report = self.defaults_validator._validate(obj) - self.assertFalse(report.is_valid) + self.assertFalse(report.is_valid, str(report.to_xml())) self.assertEqual(len(report.errors), 1) diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 2d035757e..df19e8e64 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertTrue(report.is_valid, str(report.errors)) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertTrue(report.is_valid, str(report.errors)) + self.assertTrue(report.is_valid, str(report.to_xml())) # Not 
restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): diff --git a/tests/validator/test_parameter_validator.py b/tests/validator/test_parameter_validator.py index f0d9d41d2..297a14906 100644 --- a/tests/validator/test_parameter_validator.py +++ b/tests/validator/test_parameter_validator.py @@ -42,7 +42,7 @@ def test_default_assignment(self): }) obj = {'baz': '23'} report = validator.validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'baz': '23', "num-param": 1}) def test_min_max(): diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py index eb95d9b1e..cc63c30ea 100644 --- a/tests/validator/test_resource_list_validator.py +++ b/tests/validator/test_resource_list_validator.py @@ -22,8 +22,7 @@ def reslist(): def test_resource_list_validator(reslist): report = OcrdResourceListValidator.validate(reslist) - print(report.errors) - assert report.is_valid == True + assert report.is_valid, str(report.to_xml()) if __name__ == '__main__': main(__file__) diff --git a/tests/validator/test_xsd_validator.py b/tests/validator/test_xsd_validator.py index d0150338d..50b3851ff 100644 --- a/tests/validator/test_xsd_validator.py +++ b/tests/validator/test_xsd_validator.py @@ -37,22 +37,22 @@ def test_mets_empty(self): def test_validate_simple_protected_str(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets.to_xml()) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_protected_doc(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_static_doc(self): report = XsdValidator.validate(XSD_METS_URL, self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) class TestXsdPageValidator(TestCase): def test_validate_page_simple_static_doc(self): report = XsdPageValidator.validate(simple_page) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) if __name__ == '__main__': main(__file__) From 161cf0c5797bd6c619340fd6d2df48d5dee6c078 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:28:56 +0200 Subject: [PATCH 101/249] JsonValidator: fix deprecation warning (by actually checking instance) --- src/ocrd_validators/json_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index ccd27b92a..4fb84b3fd 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -25,7 +25,7 @@ def set_defaults_and_handle_deprecate(validator, properties, instance, schema): for prop, subschema in properties.items(): if "default" in subschema: instance.setdefault(prop, subschema["default"]) - if subschema.get('deprecated', False): + if subschema.get('deprecated', False) and instance.get(prop): yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.") for error in validate_properties(validator, properties, instance, schema): From b2e6485642d096ed229c004a7cb88cf73ae1718c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:52:26 +0200 Subject: [PATCH 102/249] predefine union types OcrdFileType and OcrdPageType --- 
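
Note: these unions are meant to let processor implementations annotate and
check against both the local and the client-side (METS server) variants at
once. A minimal sketch of the intended downstream use, assuming only the new
`OcrdFileType` export (the helper function itself is made up):

    from typing import Optional, get_args
    from ocrd_models import OcrdFileType

    def describe_file(input_file: Optional[OcrdFileType]) -> str:
        # a typing.Union cannot be passed to isinstance() directly, so its
        # members are unpacked with typing.get_args(), as base.py does below:
        assert isinstance(input_file, get_args(OcrdFileType))
        return f"{input_file.ID} ({input_file.pageId})"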
src/ocrd/processor/base.py | 16 ++++++++-------- src/ocrd/processor/builtin/dummy_processor.py | 4 ++-- src/ocrd_models/__init__.py | 4 ++-- src/ocrd_models/ocrd_file.py | 2 ++ src/ocrd_models/ocrd_page.py | 5 ++++- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 751749790..6c91eb00a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -15,7 +15,7 @@ import os from os import getcwd from pathlib import Path -from typing import List, Optional, Union +from typing import List, Optional, Union, get_args import sys import inspect import tarfile @@ -25,7 +25,7 @@ from requests import HTTPError from ocrd.workspace import Workspace -from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile +from ocrd_models.ocrd_file import OcrdFileType from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_utils import ( VERSION as OCRD_VERSION, @@ -332,7 +332,7 @@ def process_workspace(self, workspace: Workspace) -> None: try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) page_id = next(input_file.pageId for input_file in input_file_tuple if input_file) @@ -382,7 +382,7 @@ def process_workspace(self, workspace: Workspace) -> None: # fall back to deprecated method self.process() - def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: + def _copy_page_file(self, input_file : OcrdFileType) -> None: """ Copy the given ``input_file`` of the :py:data:`workspace`, representing one physical page (passed as one opened @@ -390,7 +390,7 @@ def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> N and add it as if it was a processing result. """ input_pcgts : OcrdPage - assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) + assert isinstance(input_file, get_args(OcrdFileType)) self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}") try: input_pcgts = page_from_file(input_file) @@ -410,7 +410,7 @@ def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> N force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) - def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: + def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: """ Process the given ``input_files`` of the :py:data:`workspace`, representing one physical page (passed as one opened @@ -422,10 +422,10 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc to handle cases like multiple output fileGrps, non-PAGE input etc.) 
""" input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) - assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile)) + assert isinstance(input_files[0], get_args(OcrdFileType)) page_id = input_files[0].pageId for i, input_file in enumerate(input_files): - assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) + assert isinstance(input_file, get_args(OcrdFileType)) self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") try: page_ = page_from_file(input_file) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index c2f0eec4f..7b2f1b66e 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -7,7 +7,7 @@ from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.processor.ocrd_page_result import OcrdPageResult -from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile +from ocrd_models.ocrd_file import OcrdFileType from ocrd_models.ocrd_page import OcrdPage, to_xml from ocrd_utils import ( getLogger, @@ -32,7 +32,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # nothing to do here return OcrdPageResult(input_pcgts[0]) - def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: input_file = input_files[0] assert input_file assert input_file.local_filename diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index 330fefe97..ff4e31798 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -3,8 +3,8 @@ """ from .ocrd_agent import OcrdAgent, ClientSideOcrdAgent from .ocrd_exif import OcrdExif -from .ocrd_file import OcrdFile, ClientSideOcrdFile +from .ocrd_file import OcrdFile, ClientSideOcrdFile, OcrdFileType from .ocrd_mets import OcrdMets -from .ocrd_page import OcrdPage +from .ocrd_page import OcrdPage, OcrdPageType from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport diff --git a/src/ocrd_models/ocrd_file.py b/src/ocrd_models/ocrd_file.py index 2315a08ff..a11634171 100644 --- a/src/ocrd_models/ocrd_file.py +++ b/src/ocrd_models/ocrd_file.py @@ -266,3 +266,5 @@ def __str__(self): for k in ['fileGrp', 'ID', 'mimetype', 'url', 'local_filename'] ]) return '' % (props) + +OcrdFileType = Union[OcrdFile, ClientSideOcrdFile] diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 87e644fd9..6accb9241 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -2,7 +2,7 @@ API to PAGE-XML, generated with generateDS from XML schema. """ from io import StringIO -from typing import Dict +from typing import Dict, Union from inspect import getmembers from lxml import etree as ET @@ -11,6 +11,7 @@ 'parseEtree', 'parseString', 'OcrdPage', + 'OcrdPageType', "AdvertRegionType", "AlternativeImageType", @@ -199,6 +200,8 @@ def __init__( def __getattr__(self, name): return getattr(self._pcgts, name) +OcrdPageType = Union[OcrdPage, PcGtsType] + def to_xml(el, skip_declaration=False) -> str: """ Serialize ``pc:PcGts`` document as string. 
From 822d731059532b5e9f401afd7532ba4ab8acfa34 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 23:24:08 +0200 Subject: [PATCH 103/249] processor CLI --debug: set all to ABORT (not just MISSING_OUTPUT) --- src/ocrd/decorators/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 364ef4c84..3f07ede4a 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -118,10 +118,12 @@ def resolve(name): resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url) page_id = kwargs.get('page_id') - if overwrite: - config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' if debug: + config.OCRD_MISSING_INPUT = 'ABORT' config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_EXISTING_OUTPUT = 'ABORT' + if overwrite: + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) From 3a7a7713abdf218a6bc64317dba83cd528e26589 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 23:24:39 +0200 Subject: [PATCH 104/249] :memo: changelog --- CHANGELOG.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d55258070..1b53c6a28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,28 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). +## Unreleased + +Changed: + - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now + - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile` + (the latter only if `OCRD_MISSING_INPUT=ABORT`) + - :fire: `Processor.zip_input_files` does not by default use `require_first` anymore + (so the first file in any input file tuple per page can be `None` as well) + - :fire: no more `Workspace.overwrite_mode`, merely delegate to `OCRD_EXISTING_OUTPUT=OVERWRITE` + - :art: improve on docs result for `ocrd_utils.config` + +Added: + - :point_right: `OCRD_DOWNLOAD_INPUT` for whether input files should be downloaded before processing + - :point_right: `OCRD_MISSING_INPUT` for how to handle missing input files (**`SKIP`** or `ABORT`) + - :point_right: `OCRD_MISSING_OUTPUT` for how to handle processing failures (**`SKIP`** or `ABORT` or `COPY`) + the latter behaves like ocrd-dummy for the failed page(s) + - :point_right: `OCRD_EXISTING_OUTPUT` for how to handle existing output files (**`SKIP`** or `ABORT` or `OVERWRITE`) + - new CLI option `--debug` as short-hand for `ABORT` choices above + - `Processor.logger` set up by constructor already (for re-use by processor implementors) + - `default`-expand and validate `ocrd_tool.json` in `Processor` constructor, log invalidities + - handle JSON `deprecation` in `ocrd_tool.json` by reporting warnings + ## [3.0.0a1] - 2024-08-15 Changed: From 2bdb6c438d8ca3a9592dbe34d95592cbcdc650f6 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 22 Aug 2024 11:10:51 +0200 Subject: [PATCH 105/249] :package: v3.0.0a2 --- CHANGELOG.md | 4 ++++ VERSION | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b53c6a28..38f36b96d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 
@@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0a2] - 2024-08-22 + Changed: - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile` @@ -2191,6 +2193,8 @@ Fixed Initial Release +[3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 +[3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 [2.67.0]: ../../compare/v2.67.0..v2.66.1 diff --git a/VERSION b/VERSION index 2a9454873..3a5b5bc9d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0a1 \ No newline at end of file +3.0.0a2 From 00bd6fe8500ffcbf125dcc157c5997ed115c9023 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 22 Aug 2024 11:13:38 +0200 Subject: [PATCH 106/249] remove make *-workaround, we will not do that for v3+ --- CHANGELOG.md | 3 +++ Makefile | 38 -------------------------------------- 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38f36b96d..43bf85764 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Changed: + - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published as separate packages anymore, everything is contained in `ocrd` and you should adapt your `requirements.txt` accordingly. + ## [3.0.0a2] - 2024-08-22 Changed: diff --git a/Makefile b/Makefile index 39b46ee84..fd1210b65 100644 --- a/Makefile +++ b/Makefile @@ -401,41 +401,3 @@ docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: # Build wheels and source dist and twine upload them pypi: build twine upload --verbose dist/ocrd-$(VERSION)*{tar.gz,whl} - -pypi-workaround: build-workaround - for dist in $(BUILD_ORDER);do twine upload dist/$$dist-$(VERSION)*{tar.gz,whl};done - -# Only in place until v3 so we don't break existing installations -build-workaround: pyclean - cp pyproject.toml pyproject.toml.BAK - cp src/ocrd_utils/constants.py src/ocrd_utils/constants.py.BAK - cp src/ocrd/cli/__init__.py src/ocrd/cli/__init__.py.BAK - for dist in $(BUILD_ORDER);do \ - cat pyproject.toml.BAK | sed "s,^name =.*,name = \"$$dist\"," > pyproject.toml; \ - cat src/ocrd_utils/constants.py.BAK | sed "s,dist_version('ocrd'),dist_version('$$dist')," > src/ocrd_utils/constants.py; \ - cat src/ocrd/cli/__init__.py.BAK | sed "s,package_name='ocrd',package_name='$$dist'," > src/ocrd/cli/__init__.py; \ - $(MAKE) build; \ - done - rm pyproject.toml.BAK - rm src/ocrd_utils/constants.py.BAK - rm src/ocrd/cli/__init__.py.BAK - -# test that the aliased packages work in isolation and combined -test-workaround: build-workaround - $(MAKE) uninstall-workaround - for dist in $(BUILD_ORDER);do \ - pip install dist/$$dist-*.whl ;\ - ocrd --version ;\ - make test ;\ - pip uninstall --yes $$dist ;\ - done - for dist in $(BUILD_ORDER);do \ - pip install dist/$$dist-*.whl ;\ - done - ocrd --version ;\ - make test ;\ - for dist in $(BUILD_ORDER);do pip uninstall --yes $$dist;done - -uninstall-workaround: - for dist in $(BUILD_ORDER);do $(PIP) uninstall --yes $$dist;done - From d7775273be5aada8554e4a14693a83afff2cdd1d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:10:10 +0200 Subject: [PATCH 107/249] =?UTF-8?q?Processor.parameter:=20only=20validate?= =?UTF-8?q?=20when=20set=E2=80=A6?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `Processor` init: do not validate, only set `parameter` if present (needed to avoid exception from `ParameterValidator` for processors with mandatory params in non-processing usage) - `Processor.parameter`: allow `None`, but validate when set - `Processor._setup`: validate parameters, then call `Processor.setup` --- src/ocrd/processor/base.py | 39 +++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 6c91eb00a..76639acdd 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -139,6 +139,19 @@ def ocrd_tool(self) -> dict: self._ocrd_tool = self.metadata['tools'][self.executable] return self._ocrd_tool + @property + def parameter(self) -> Optional[dict]: + """the runtime parameter dict to be used by this processor""" + if hasattr(self, '_parameter'): + return self._parameter + return None + + @parameter.setter + def parameter(self, parameter : dict) -> None: + self._parameter = parameter + # re-run setup to validate parameters and load models etc + self._setup() + def __init__( self, # FIXME: deprecate in favor of process_workspace(workspace) @@ -204,19 +217,12 @@ def __init__( "is deprecated - pass as argument to process_workspace instead") self.page_id = page_id or None self.download = download_files - if parameter is None: - parameter = {} - parameterValidator = ParameterValidator(self.ocrd_tool) - - report = parameterValidator.validate(parameter) - if not report.is_valid: - raise ValueError("Invalid parameters %s" % report.errors) - self.parameter = parameter - # NOTE: this is the logger to be used by processor implementations, - # `processor.base` default implementations should use - # :py:attr:`self._base_logger` + #: The logger to be used by processor implementations. + # `ocrd.processor.base` internals should use :py:attr:`self._base_logger` self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') self._base_logger = getLogger('ocrd.processor.base') + if parameter is not None: + self.parameter = parameter # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) @@ -289,6 +295,17 @@ def list_resources(self): print(res) return + def _setup(self) -> None: + """ + Validate parameters, then run :py:meth:`setup`. Called whenever + :py:data:`parameter` changes. 
+ """ + parameterValidator = ParameterValidator(self.ocrd_tool) + report = parameterValidator.validate(self.parameter) + if not report.is_valid: + raise ValueError("Invalid parameters %s" % report.errors) + self.setup() + def setup(self) -> None: """ Prepare the processor for actual data processing, From 7998aae6e1e1ff4a80da3a203496d8ba4bd5e04a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:16:01 +0200 Subject: [PATCH 108/249] get_processor: ensure passing non-empty parameter, rely on `_setup` to call `setup` --- src/ocrd/processor/helpers.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 08ca0a468..bf9b0e8a1 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -7,7 +7,7 @@ import json import inspect from subprocess import run -from typing import List +from typing import List, Optional from click import wrap_text from ocrd.workspace import Workspace @@ -374,16 +374,14 @@ def get_cached_processor(parameter: dict, processor_class): Otherwise, an instance of the `:py:class:~ocrd.Processor` is returned. """ if processor_class: - dict_params = dict(parameter) if parameter else None - processor = processor_class(None, parameter=dict_params) - processor.setup() + processor = processor_class(None, parameter=dict(parameter)) return processor return None def get_processor( processor_class, - parameter: dict, + parameter: Optional[dict], workspace: Workspace = None, page_id: str = None, input_file_grp: List[str] = None, @@ -391,11 +389,14 @@ def get_processor( instance_caching: bool = False, ): if processor_class: + if parameter is None: + parameter = {} if instance_caching: processor = get_cached_processor(parameter, processor_class) else: + # avoid passing workspace already (deprecated chdir behaviour) processor = processor_class(None, parameter=parameter) - processor.setup() + # set current processing parameters processor.workspace = workspace processor.page_id = page_id processor.input_file_grp = input_file_grp From cc8592b7b0667118a98abe880e02a353c16136ca Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:17:03 +0200 Subject: [PATCH 109/249] test_processor: adapt, check required parameters --- tests/processor/test_processor.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 0cbae7d54..8ade93a70 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -16,6 +16,7 @@ from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile +from ocrd.processor.helpers import get_processor from unittest import mock import pytest @@ -95,8 +96,18 @@ def test_json(self): DummyProcessor(None).dump_json() def test_params_missing_required(self): - with self.assertRaisesRegex(Exception, 'is a required property'): - DummyProcessorWithRequiredParameters(None) + proc = DummyProcessorWithRequiredParameters(None) + assert proc.parameter is None + with self.assertRaisesRegex(ValueError, 'is a required property'): + proc.parameter = {} + with self.assertRaisesRegex(ValueError, 'is a required property'): + get_processor(DummyProcessorWithRequiredParameters, None) + with self.assertRaisesRegex(ValueError, 'is a required property'): + 
get_processor(DummyProcessorWithRequiredParameters, {}) + with self.assertRaisesRegex(ValueError, 'is a required property'): + run_processor(DummyProcessorWithRequiredParameters, + workspace=self.workspace, input_file_grp="OCR-D-IMG") + proc.parameter = {'i-am-required': 'foo'} def test_params_preset_resolve(self): with pushd_popd(tempdir=True) as tempdir: @@ -127,6 +138,9 @@ class ParamTestProcessor(Processor): def ocrd_tool(self): return {} proc = ParamTestProcessor(None) + self.assertEqual(proc.parameter, None) + # get_processor will set to non-none and validate + proc = get_processor(ParamTestProcessor, None) self.assertEqual(proc.parameter, {}) def test_run_agent(self): From 45e556d425cc755b84ed409b817fd00a77175270 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:46:26 +0200 Subject: [PATCH 110/249] improve _setup docstring Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 76639acdd..14fb799f4 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -298,7 +298,7 @@ def list_resources(self): def _setup(self) -> None: """ Validate parameters, then run :py:meth:`setup`. Called whenever - :py:data:`parameter` changes. + :py:data:`parameter` is re-assigned. """ parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(self.parameter) From d4c802be7af4c4d4bdd6b9b41bd22e8552af11b8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:50:32 +0200 Subject: [PATCH 111/249] Processor._setup: raise with full ParameterValidator report Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 14fb799f4..d930f8a0c 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -303,7 +303,7 @@ def _setup(self) -> None: parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(self.parameter) if not report.is_valid: - raise ValueError("Invalid parameters %s" % report.errors) + raise ValueError(f'Invalid parameters:\n{report.to_xml()}') self.setup() def setup(self) -> None: From b28fefb066dc2b487aa86212a04c4c4736e662b9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:58:03 +0200 Subject: [PATCH 112/249] get_processor: parameter only as kwarg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/helpers.py | 2 +- tests/processor/test_processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index bf9b0e8a1..56328fad7 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -381,7 +381,7 @@ def get_cached_processor(parameter: dict, processor_class): def get_processor( processor_class, - parameter: Optional[dict], + parameter: Optional[dict] = None, workspace: Workspace = None, page_id: str = None, input_file_grp: List[str] = None, diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 8ade93a70..74c56aa9a 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -101,7 +101,7 @@ def test_params_missing_required(self): with 
self.assertRaisesRegex(ValueError, 'is a required property'): proc.parameter = {} with self.assertRaisesRegex(ValueError, 'is a required property'): - get_processor(DummyProcessorWithRequiredParameters, None) + get_processor(DummyProcessorWithRequiredParameters) with self.assertRaisesRegex(ValueError, 'is a required property'): get_processor(DummyProcessorWithRequiredParameters, {}) with self.assertRaisesRegex(ValueError, 'is a required property'): From 642938b6a92709e65827858fe2efee79e3992714 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 15:08:03 +0200 Subject: [PATCH 113/249] tests: adapt for get_processor parameter only as kwarg --- src/ocrd/processor/helpers.py | 2 +- tests/processor/test_processor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 56328fad7..a8cea96fa 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -84,7 +84,7 @@ def run_processor( log.debug("Running processor %s", processorClass) processor = get_processor( - processor_class=processorClass, + processorClass, parameter=parameter, workspace=None, page_id=page_id, diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 74c56aa9a..6a35dda0f 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -103,7 +103,7 @@ def test_params_missing_required(self): with self.assertRaisesRegex(ValueError, 'is a required property'): get_processor(DummyProcessorWithRequiredParameters) with self.assertRaisesRegex(ValueError, 'is a required property'): - get_processor(DummyProcessorWithRequiredParameters, {}) + get_processor(DummyProcessorWithRequiredParameters, parameter={}) with self.assertRaisesRegex(ValueError, 'is a required property'): run_processor(DummyProcessorWithRequiredParameters, workspace=self.workspace, input_file_grp="OCR-D-IMG") @@ -140,7 +140,7 @@ def ocrd_tool(self): proc = ParamTestProcessor(None) self.assertEqual(proc.parameter, None) # get_processor will set to non-none and validate - proc = get_processor(ParamTestProcessor, None) + proc = get_processor(ParamTestProcessor) self.assertEqual(proc.parameter, {}) def test_run_agent(self): From f5e5c54a5f830c37fde1d57d1cf48c6013ddbc70 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:08:10 +0200 Subject: [PATCH 114/249] Processor.parameter: make the bound dict read-only Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d930f8a0c..53e2da3dc 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -148,7 +148,8 @@ def parameter(self) -> Optional[dict]: @parameter.setter def parameter(self, parameter : dict) -> None: - self._parameter = parameter + from types import MappingProxyType + self._parameter = MappingProxyType(parameter) # re-run setup to validate parameters and load models etc self._setup() From f2d53a63afab5588131d14dc9e82f39d28108635 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 15:31:14 +0200 Subject: [PATCH 115/249] Processor.parameter: move ParameterValidator back to setter, convert to plain dict in getter for serialization etc --- src/ocrd/processor/base.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/ocrd/processor/base.py 
b/src/ocrd/processor/base.py index 53e2da3dc..4780338dd 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,6 +16,7 @@ from os import getcwd from pathlib import Path from typing import List, Optional, Union, get_args +from types import MappingProxyType import sys import inspect import tarfile @@ -143,15 +144,19 @@ def ocrd_tool(self) -> dict: def parameter(self) -> Optional[dict]: """the runtime parameter dict to be used by this processor""" if hasattr(self, '_parameter'): - return self._parameter + return dict(self._parameter) return None @parameter.setter def parameter(self, parameter : dict) -> None: - from types import MappingProxyType + parameterValidator = ParameterValidator(self.ocrd_tool) + report = parameterValidator.validate(parameter) + if not report.is_valid: + raise ValueError(f'Invalid parameters:\n{report.to_xml()}') + # make parameter dict read-only self._parameter = MappingProxyType(parameter) - # re-run setup to validate parameters and load models etc - self._setup() + # (re-)run setup to load models etc + self.setup() def __init__( self, @@ -296,17 +301,6 @@ def list_resources(self): print(res) return - def _setup(self) -> None: - """ - Validate parameters, then run :py:meth:`setup`. Called whenever - :py:data:`parameter` is re-assigned. - """ - parameterValidator = ParameterValidator(self.ocrd_tool) - report = parameterValidator.validate(self.parameter) - if not report.is_valid: - raise ValueError(f'Invalid parameters:\n{report.to_xml()}') - self.setup() - def setup(self) -> None: """ Prepare the processor for actual data processing, From 7297ca2d1a76bf82ed96efa44e55ddb75e9b7551 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 15:56:31 +0200 Subject: [PATCH 116/249] Processor.parameter: frozendict instead of mappingproxy, add test --- src/ocrd/processor/base.py | 6 +++--- tests/processor/test_processor.py | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 4780338dd..336b479f5 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,12 +16,12 @@ from os import getcwd from pathlib import Path from typing import List, Optional, Union, get_args -from types import MappingProxyType import sys import inspect import tarfile import io from warnings import warn +from frozendict import frozendict from deprecated import deprecated from requests import HTTPError @@ -144,7 +144,7 @@ def ocrd_tool(self) -> dict: def parameter(self) -> Optional[dict]: """the runtime parameter dict to be used by this processor""" if hasattr(self, '_parameter'): - return dict(self._parameter) + return self._parameter return None @parameter.setter @@ -154,7 +154,7 @@ def parameter(self, parameter : dict) -> None: if not report.is_valid: raise ValueError(f'Invalid parameters:\n{report.to_xml()}') # make parameter dict read-only - self._parameter = MappingProxyType(parameter) + self._parameter = frozendict(parameter) # (re-)run setup to load models etc self.setup() diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 6a35dda0f..d037eed3f 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -74,14 +74,22 @@ def test_parameter(self): with open(jsonpath, 'w') as f: f.write('{"baz": "quux"}') with open(jsonpath, 'r') as f: + parameter = json.load(f) processor = run_processor( DummyProcessor, - parameter=json.load(f), + parameter=parameter, input_file_grp="OCR-D-IMG", 
resolver=self.resolver,
                 workspace=self.workspace
             )
             self.assertEqual(processor.parameter['baz'], 'quux')
+            processor = get_processor(
+                DummyProcessor,
+                parameter=parameter)
+            with self.assertRaises(TypeError):
+                processor.parameter['baz'] = 'xuuq'
+            processor.parameter = { **parameter, 'baz': 'xuuq' }
+            self.assertEqual(processor.parameter['baz'], 'xuuq')

     def test_verify(self):
         proc = DummyProcessor(None)

From 6cd4a34c689f56d4cbe5600d85978567d7b1e60e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 22 Aug 2024 19:14:41 +0200
Subject: [PATCH 117/249] introduce Processor.shutdown to be overridden (called
 at deinit or parameter re-assignment)

---
 src/ocrd/processor/base.py        | 15 +++++++++++++++
 tests/processor/test_processor.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
index 336b479f5..29305c880 100644
--- a/src/ocrd/processor/base.py
+++ b/src/ocrd/processor/base.py
@@ -20,6 +20,7 @@
 import inspect
 import tarfile
 import io
+import weakref
 from warnings import warn
 from frozendict import frozendict
 from deprecated import deprecated
@@ -149,6 +150,8 @@ def parameter(self) -> Optional[dict]:

     @parameter.setter
     def parameter(self, parameter : dict) -> None:
+        if self.parameter is not None:
+            self.shutdown()
         parameterValidator = ParameterValidator(self.ocrd_tool)
         report = parameterValidator.validate(parameter)
         if not report.is_valid:
@@ -229,6 +232,8 @@ def __init__(
         self._base_logger = getLogger('ocrd.processor.base')
         if parameter is not None:
             self.parameter = parameter
+        # ensure that shutdown gets called at destruction
+        self._finalizer = weakref.finalize(self, self.shutdown)
         # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
         setattr(self, 'process',
             deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
@@ -311,6 +316,16 @@ def setup(self) -> None:
         """
         pass

+    def shutdown(self) -> None:
+        """
+        Bring down the processor after data processing,
+        after changing back from the workspace directory but
+        before exiting (or setting up with different parameters).
+
+        (Override this to unload models from memory etc.)
+ """ + pass + @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()') def process(self) -> None: """ diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index d037eed3f..5cee01d64 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -91,6 +91,37 @@ def test_parameter(self): processor.parameter = { **parameter, 'baz': 'xuuq' } self.assertEqual(processor.parameter['baz'], 'xuuq') + def test_instance_caching(self): + class DyingDummyProcessor(DummyProcessor): + def shutdown(self): + print(self.parameter['baz']) + self.capture_out_err() + # well above OCRD_MAX_PROCESSOR_CACHE=128 + firstp = None + for i in range(200): + p = get_processor( + DyingDummyProcessor, + parameter={'baz': str(i)}, + instance_caching=True + ) + if i == 0: + firstp = p + lastp = p + p = get_processor(DyingDummyProcessor, + parameter={'baz': '0'}, + instance_caching=True + ) + # should not be cached anymore + self.assertNotEqual(firstp, p) + p = get_processor(DyingDummyProcessor, + parameter={'baz': '199'}, + instance_caching=True + ) + # should still be cached + self.assertEqual(lastp, p) + out, err = self.capture_out_err() + #assert '0' in out.split('\n') + def test_verify(self): proc = DummyProcessor(None) with self.assertRaises(AttributeError): From 407bff8c0fd5a19f3b1a9864718addb4713ce403 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 17:27:47 +0200 Subject: [PATCH 118/249] Processor: introduce `max_instances` class attribute --- src/ocrd/processor/base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 29305c880..e88092824 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -104,6 +104,14 @@ class Processor(): a number of optional or mandatory parameters. """ + max_instances : int = -1 + """ + maximum number of cached instances (ignored if negative), to be applied on top of + :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller). + + (Override this if you know how many instances fit into memory at once.) + """ + @property def metadata(self) -> dict: """the ocrd-tool.json dict of the package""" From c9fbb2c5aa569428f99fb953c87df10eaf447895 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 17:33:44 +0200 Subject: [PATCH 119/249] get_cached_processor: set lru_cache maxsize from min(cfg,class) at runtime --- src/ocrd/processor/helpers.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index a8cea96fa..e0dd50272 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -359,9 +359,9 @@ def wrap(s): pass -# Taken from https://github.com/OCR-D/core/pull/884 -@freeze_args -@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE) +# not decorated here but at runtime (on first use) +#@freeze_args +#@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE) def get_cached_processor(parameter: dict, processor_class): """ Call this function to get back an instance of a processor. 
@@ -378,7 +378,6 @@ def get_cached_processor(parameter: dict, processor_class): return processor return None - def get_processor( processor_class, parameter: Optional[dict] = None, @@ -392,6 +391,16 @@ def get_processor( if parameter is None: parameter = {} if instance_caching: + global get_cached_processor + if not hasattr(get_cached_processor, '__wrapped__'): + # first call: wrap + if processor_class.max_instances < 0: + maxsize = config.OCRD_MAX_PROCESSOR_CACHE + else: + maxsize = min(config.OCRD_MAX_PROCESSOR_CACHE, processor_class.max_instances) + # wrapping in call cache + # wrapping dict into frozendict (from https://github.com/OCR-D/core/pull/884) + get_cached_processor = freeze_args(lru_cache(maxsize=maxsize)(get_cached_processor)) processor = get_cached_processor(parameter, processor_class) else: # avoid passing workspace already (deprecated chdir behaviour) From 9c212a9ca779b0d36fc37dc03034a1b16659e5ff Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 17:34:15 +0200 Subject: [PATCH 120/249] test get_processor instance_caching w/ max_instances --- tests/processor/test_processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5cee01d64..fed950cad 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -93,12 +93,13 @@ def test_parameter(self): def test_instance_caching(self): class DyingDummyProcessor(DummyProcessor): + max_instances = 10 def shutdown(self): print(self.parameter['baz']) self.capture_out_err() - # well above OCRD_MAX_PROCESSOR_CACHE=128 + # customize (as processor implementors would) firstp = None - for i in range(200): + for i in range(DyingDummyProcessor.max_instances + 2): p = get_processor( DyingDummyProcessor, parameter={'baz': str(i)}, @@ -114,7 +115,7 @@ def shutdown(self): # should not be cached anymore self.assertNotEqual(firstp, p) p = get_processor(DyingDummyProcessor, - parameter={'baz': '199'}, + parameter={'baz': str(i)}, instance_caching=True ) # should still be cached From a413f046c530b49f1f9ce4d62695505717c861a5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 17:54:38 +0200 Subject: [PATCH 121/249] test get_processor instance_caching w/ clear_cache --- tests/processor/test_processor.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index fed950cad..19ff1087f 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -95,7 +95,9 @@ def test_instance_caching(self): class DyingDummyProcessor(DummyProcessor): max_instances = 10 def shutdown(self): - print(self.parameter['baz']) + # fixme: will only print _after_ pytest exits, so too late for assertions + #print(self.parameter['baz']) + pass self.capture_out_err() # customize (as processor implementors would) firstp = None @@ -120,7 +122,16 @@ def shutdown(self): ) # should still be cached self.assertEqual(lastp, p) - out, err = self.capture_out_err() + from ocrd.processor.helpers import get_cached_processor + get_cached_processor.__wrapped__.cache_clear() + p = get_processor(DyingDummyProcessor, + parameter={'baz': str(i)}, + instance_caching=True + ) + # should not be cached anymore + self.assertNotEqual(lastp, p) + # fixme: will only print _after_ pytest exits, so too late for assertions + #out, err = self.capture_out_err() #assert '0' in out.split('\n') def test_verify(self): From 
870523cb0f7b3558abd03e024121cb0c2521a706 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 22 Aug 2024 11:10:51 +0200 Subject: [PATCH 122/249] :package: v3.0.0a2 --- CHANGELOG.md | 4 ++++ VERSION | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b53c6a28..38f36b96d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0a2] - 2024-08-22 + Changed: - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile` @@ -2191,6 +2193,8 @@ Fixed Initial Release +[3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 +[3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 [2.67.0]: ../../compare/v2.67.0..v2.66.1 diff --git a/VERSION b/VERSION index 2a9454873..3a5b5bc9d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0a1 \ No newline at end of file +3.0.0a2 From 20bb6d1114433a17c2a88cfdd52db635f1eb24e6 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 22 Aug 2024 11:13:38 +0200 Subject: [PATCH 123/249] remove make *-workaround, we will not do that for v3+ --- CHANGELOG.md | 3 +++ Makefile | 38 -------------------------------------- 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38f36b96d..43bf85764 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Changed: + - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published as separate packages anymore, everything is contained in `ocrd` and you should adapt your `requirements.txt` accordingly. 
+ ## [3.0.0a2] - 2024-08-22 Changed: diff --git a/Makefile b/Makefile index 39b46ee84..fd1210b65 100644 --- a/Makefile +++ b/Makefile @@ -401,41 +401,3 @@ docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: # Build wheels and source dist and twine upload them pypi: build twine upload --verbose dist/ocrd-$(VERSION)*{tar.gz,whl} - -pypi-workaround: build-workaround - for dist in $(BUILD_ORDER);do twine upload dist/$$dist-$(VERSION)*{tar.gz,whl};done - -# Only in place until v3 so we don't break existing installations -build-workaround: pyclean - cp pyproject.toml pyproject.toml.BAK - cp src/ocrd_utils/constants.py src/ocrd_utils/constants.py.BAK - cp src/ocrd/cli/__init__.py src/ocrd/cli/__init__.py.BAK - for dist in $(BUILD_ORDER);do \ - cat pyproject.toml.BAK | sed "s,^name =.*,name = \"$$dist\"," > pyproject.toml; \ - cat src/ocrd_utils/constants.py.BAK | sed "s,dist_version('ocrd'),dist_version('$$dist')," > src/ocrd_utils/constants.py; \ - cat src/ocrd/cli/__init__.py.BAK | sed "s,package_name='ocrd',package_name='$$dist'," > src/ocrd/cli/__init__.py; \ - $(MAKE) build; \ - done - rm pyproject.toml.BAK - rm src/ocrd_utils/constants.py.BAK - rm src/ocrd/cli/__init__.py.BAK - -# test that the aliased packages work in isolation and combined -test-workaround: build-workaround - $(MAKE) uninstall-workaround - for dist in $(BUILD_ORDER);do \ - pip install dist/$$dist-*.whl ;\ - ocrd --version ;\ - make test ;\ - pip uninstall --yes $$dist ;\ - done - for dist in $(BUILD_ORDER);do \ - pip install dist/$$dist-*.whl ;\ - done - ocrd --version ;\ - make test ;\ - for dist in $(BUILD_ORDER);do pip uninstall --yes $$dist;done - -uninstall-workaround: - for dist in $(BUILD_ORDER);do $(PIP) uninstall --yes $$dist;done - From faa59a87364062c887de35108189fb621634db31 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 23 Aug 2024 17:29:11 +0200 Subject: [PATCH 124/249] Processor.metadata_location property to specify where in the package ocrd-tool.json is found --- src/ocrd/processor/base.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index e88092824..4351b4865 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -112,12 +112,19 @@ class Processor(): (Override this if you know how many instances fit into memory at once.) """ + @property + def metadata_location(self) -> str: + """ + Location of `ocrd-tool.json` inside the package. 
By default we expect it in the root of the module + """ + return 'ocrd-tool.json' + @property def metadata(self) -> dict: """the ocrd-tool.json dict of the package""" if hasattr(self, '_metadata'): return self._metadata - self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json')) + self._metadata = json.loads(resource_string(self.__module__.split('.')[0], self.metadata_location)) report = OcrdToolValidator.validate(self._metadata) if not report.is_valid: # FIXME: remove when bertsky/core#10 is merged From 5819c8167d1a2be662dd32b58bff0531ded40f8b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 19:06:37 +0200 Subject: [PATCH 125/249] Processor.verify: always check cardinality (as we now have the defaults from ocrd-tool.json) --- src/ocrd/processor/base.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 4351b4865..4f1e86b96 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -127,9 +127,8 @@ def metadata(self) -> dict: self._metadata = json.loads(resource_string(self.__module__.split('.')[0], self.metadata_location)) report = OcrdToolValidator.validate(self._metadata) if not report.is_valid: - # FIXME: remove when bertsky/core#10 is merged - self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') - self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.") + self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n" + f"{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.") return self._metadata @property @@ -178,7 +177,7 @@ def parameter(self, parameter : dict) -> None: def __init__( self, - # FIXME: deprecate in favor of process_workspace(workspace) + # FIXME: remove in favor of process_workspace(workspace) workspace : Optional[Workspace], ocrd_tool=None, parameter=None, @@ -286,14 +285,10 @@ def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], assert len(grps) >= minimum, msg % (len(grps), str(spec)) if maximum > 0: assert len(grps) <= maximum, msg % (len(grps), str(spec)) - # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here - # (but we already have ocrd-tool validation, and these first need to be adopted by implementors) - if 'input_file_grp_cardinality' in self.ocrd_tool: - assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], - "Unexpected number of input file groups %d vs %s") - if 'output_file_grp_cardinality' in self.ocrd_tool: - assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], - "Unexpected number of output file groups %d vs %s") + assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], + "Unexpected number of input file groups %d vs %s") + assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], + "Unexpected number of output file groups %d vs %s") for input_file_grp in input_file_grps: assert input_file_grp in self.workspace.mets.file_groups # keep this for backwards compatibility: From 4f88f1d209bc86cdaea031ae9b2bda685e4f8fba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:49:08 +0200 Subject: [PATCH 126/249] fix --log-filename 
(6fc606027a): apply in ocrd_cli_wrap_processor --- src/ocrd/decorators/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 3f07ede4a..fcc70a71e 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -1,4 +1,5 @@ import sys +from contextlib import nullcontext from ocrd_utils import ( config, @@ -9,6 +10,7 @@ parse_json_string_with_comments, set_json_key_value_overrides, parse_json_string_or_file, + redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator from ocrd_network import ProcessingWorker, ProcessorServer, AgentType @@ -140,7 +142,7 @@ def resolve(name): print("Profiling...") pr = cProfile.Profile() pr.enable() - def exit(): + def goexit(): pr.disable() print("Profiling completed") if profile_file: @@ -149,8 +151,13 @@ def exit(): s = io.StringIO() pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + atexit.register(goexit) + if log_filename: + log_ctx = redirect_stderr_and_stdout_to_file(log_filename) + else: + log_ctx = nullcontext() + with log_ctx: + run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): From d621f3631bc5a45730bc9f47d9ee0b6cd9aaf040 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:54:07 +0200 Subject: [PATCH 127/249] fix exception --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 44bbd081b..e63c5fd01 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -248,7 +248,7 @@ def _download_impl(url, filename, progress_cb=None, size=None): if "Content-Disposition" not in r.headers: url = get_url_from_gdrive_confirmation(r.text) except RuntimeError as e: - log.warning("Cannot unwrap Google Drive URL: ", e) + log.warning("Cannot unwrap Google Drive URL: %s", e) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: r.raise_for_status() From 4868fb152a1eedf0452a662aafcdce640cd20a88 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:27:33 +0200 Subject: [PATCH 128/249] adapt to PIL.Image moved constants --- src/ocrd/workspace.py | 8 +++---- src/ocrd_utils/image.py | 50 ++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 3523d9f15..bd3380652 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1168,9 +1168,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh # Transpose in affine coordinate transform: # (consistent with image transposition or AlternativeImage below) transposition = { - 90: Image.ROTATE_90, - 180: Image.ROTATE_180, - 270: Image.ROTATE_270 + 90: Image.Transpose.ROTATE_90, + 180: Image.Transpose.ROTATE_180, + 270: Image.Transpose.ROTATE_270 }.get(orientation) # no default segment_coords['transform'] = transpose_coordinates( segment_coords['transform'], transposition, @@ -1238,5 +1238,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa segment_image = segment_image.resize((int(segment_image.width * factor), int(segment_image.height * factor)), # 
slowest, but highest quality: - Image.BICUBIC) + Image.Resampling.BICUBIC) return segment_image, segment_coords, segment_xywh diff --git a/src/ocrd_utils/image.py b/src/ocrd_utils/image.py index 3bc14e661..6f2524608 100644 --- a/src/ocrd_utils/image.py +++ b/src/ocrd_utils/image.py @@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method): Return a numpy array of the enlarged width and height. """ - if method in [Image.ROTATE_90, - Image.ROTATE_270, - Image.TRANSPOSE, - Image.TRANSVERSE]: + if method in [Image.Transpose.ROTATE_90, + Image.Transpose.ROTATE_270, + Image.Transpose.TRANSPOSE, + Image.Transpose.TRANSVERSE]: size = size[::-1] return size @@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): calculate the affine coordinate transform corresponding to the composition of both transformations, which is respectively: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: entails translation to the center, followed by pure reflection about the y-axis, and subsequent translation back - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: entails translation to the center, followed by pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: entails translation to the center, followed by pure reflection about the origin, and subsequent translation back - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: entails translation to the center, followed by pure rotation by 90° counter-clockwise, and subsequent translation back - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: entails translation to the center, followed by pure rotation by 270° counter-clockwise, and subsequent translation back - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the y-axis, and subsequent translation back @@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): [0, 0, 1]]) transform = shift_coordinates(transform, -orig) operations = { - Image.FLIP_LEFT_RIGHT: [refly], - Image.FLIP_TOP_BOTTOM: [reflx], - Image.ROTATE_180: [reflx, refly], - Image.ROTATE_90: [rot90], - Image.ROTATE_270: [rot90, reflx, refly], - Image.TRANSPOSE: [rot90, reflx], - Image.TRANSVERSE: [rot90, refly] + Image.Transpose.FLIP_LEFT_RIGHT: [refly], + Image.Transpose.FLIP_TOP_BOTTOM: [reflx], + Image.Transpose.ROTATE_180: [reflx, refly], + Image.Transpose.ROTATE_90: [rot90], + Image.Transpose.ROTATE_270: [rot90, reflx, refly], + Image.Transpose.TRANSPOSE: [rot90, reflx], + Image.Transpose.TRANSVERSE: [rot90, refly] }.get(method) # no default for operation in operations: transform = np.dot(operation, transform) @@ -411,29 +411,29 @@ def transpose_image(image, method): Given a PIL.Image ``image`` and a transposition mode ``method``, apply the respective operation: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: all pixels get mirrored at half the width of the image - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: all pixels get mirrored at half the height of the image - - 
``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: all pixels get mirrored at both, the width and half the height of the image, i.e. the image gets rotated by 180° counter-clockwise - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: rows become columns (but counted from the right) and columns become rows, i.e. the image gets rotated by 90° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: rows become columns and columns become rows (but counted from the bottom), i.e. the image gets rotated by 270° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: rows become columns and vice versa, i.e. all pixels get mirrored at the main diagonal; width becomes height and vice versa - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: rows become columns (but counted from the right) and columns become rows (but counted from the bottom), i.e. all pixels get mirrored at the opposite diagonal; From da72c0a93eb6ab7d6bbdde4872ab54820a5c4a30 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:29:12 +0200 Subject: [PATCH 129/249] ocrd_utils: add parse_json_file_with_comments --- src/ocrd_utils/__init__.py | 2 ++ src/ocrd_utils/str.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index 836f01dce..c853a34bd 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -75,6 +75,7 @@ :py:func:`concat_padded`, :py:func:`nth_url_segment`, :py:func:`remove_non_path_from_url`, + :py:func:`parse_json_file_with_comments`, :py:func:`parse_json_string_with_comments`, :py:func:`parse_json_string_or_file`, :py:func:`set_json_key_value_overrides`, @@ -204,6 +205,7 @@ make_xml_id, nth_url_segment, partition_list, + parse_json_file_with_comments, parse_json_string_or_file, parse_json_string_with_comments, sparkline, diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 7009a9ec0..4f1e08805 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -21,6 +21,7 @@ 'make_file_id', 'make_xml_id', 'nth_url_segment', + 'parse_json_file_with_comments', 'parse_json_string_or_file', 'parse_json_string_with_comments', 'remove_non_path_from_url', @@ -162,6 +163,13 @@ def is_string(val): return isinstance(val, str) +def parse_json_file_with_comments(val): + """ + Parse a file of JSON interspersed with #-prefixed full-line comments + """ + with open(val, 'r', encoding='utf-8') as inputf: + return parse_json_string_with_comments(inputf.read()) + def parse_json_string_with_comments(val): """ Parse a string of JSON interspersed with #-prefixed full-line comments From ca78b94f108d3b2bf848000694b67322ba6a9919 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:31:35 +0200 Subject: [PATCH 130/249] cli.workspace: pass fileGrp as well, improve description --- src/ocrd/cli/workspace.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index e2186a727..1461e53e0 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -118,7 +118,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency @workspace_cli.command('clone', cls=command_with_replaced_help( (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument @click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, 
is_flag=True) -@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning") +@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards") @click.argument('mets_url') @mets_find_options # XXX deprecated @@ -129,8 +129,10 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim Create a workspace from METS_URL and return the directory METS_URL can be a URL, an absolute path or a path relative to $PWD. - If METS_URL is not provided, use --mets accordingly. METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file. + + Additional options pertain to the selection of files / fileGrps / pages + to be downloaded, if --download is used. """ LOG = getLogger('ocrd.cli.workspace.clone') if workspace_dir: @@ -143,6 +145,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim mets_basename=ctx.mets_basename, clobber_mets=clobber_mets, download=download, + fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, @@ -408,7 +411,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi if dry_run: log.info('workspace.add_file(%s)' % file_dict) else: - workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) + workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg # save changes to disk workspace.save_mets() From cf41745d24231cb7fdbceeb1305e9dbc7752c94e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:35:37 +0200 Subject: [PATCH 131/249] OcrdMets.add_agent: does not have positional args --- src/ocrd/mets_server.py | 2 +- src/ocrd_models/ocrd_mets.py | 4 ++-- tests/model/test_ocrd_mets.py | 2 +- tests/test_workspace.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index da6e873c0..7c22da278 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -236,7 +236,7 @@ def agents(self): agent_dict["_type"] = agent_dict.pop("type") return [ClientSideOcrdAgent(None, **agent_dict) for agent_dict in agent_dicts] - def add_agent(self, *args, **kwargs): + def add_agent(self, **kwargs): if not self.multiplexing_mode: return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 4d1e6cba5..90d37b37d 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -198,7 +198,7 @@ def agents(self) -> List[OcrdAgent]: """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] - def add_agent(self, *args, **kwargs) -> OcrdAgent: + def add_agent(self, **kwargs) -> OcrdAgent: """ Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``. 
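
For illustration, a minimal sketch of the keyword-only calling convention this
patch enforces (attribute values borrowed from the adapted tests further down;
not part of the patch itself):

    from ocrd_models import OcrdMets
    mets = OcrdMets.empty_mets()
    # positional style now raises TypeError:
    #   mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER')
    mets.add_agent(name='foo bar v0.0.1', _type='OTHER',
                   othertype='OTHER', role='OTHER')
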
""" @@ -213,7 +213,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: el_agent_last.addnext(el_agent) except StopIteration: el_metsHdr.insert(0, el_agent) - return OcrdAgent(el_agent, *args, **kwargs) + return OcrdAgent(el_agent, **kwargs) @property def file_groups(self) -> List[str]: diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 739db7625..89742a507 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -248,7 +248,7 @@ def test_file_pageid(sbb_sample_01): def test_agent(sbb_sample_01): beforelen = len(sbb_sample_01.agents) - sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') + sbb_sample_01.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='YETOTHERSTILL') assert len(sbb_sample_01.agents) == beforelen + 1 def test_metshdr(): diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 1ae007ae5..02cb72d34 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -745,7 +745,7 @@ def _fixture_metsDocumentID(tmp_path): def test_agent_before_metsDocumentID(workspace_metsDocumentID): report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) assert report.is_valid - workspace_metsDocumentID.mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER') + workspace_metsDocumentID.mets.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='OTHER') workspace_metsDocumentID.save_mets() report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) print(report.errors) From cadc6e6e65f659346af50f0f7ade642af94579fe Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:37:48 +0200 Subject: [PATCH 132/249] remove misplaced kwargs from run_processor --- src/ocrd/decorators/__init__.py | 2 +- src/ocrd/processor/helpers.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index fcc70a71e..b0b1cad04 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -109,7 +109,7 @@ def resolve(name): kwargs['parameter'] = dict() # Merge parameter overrides and parameters if 'parameter_override' in kwargs: - set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) + set_json_key_value_overrides(kwargs['parameter'], *kwargs.pop('parameter_override')) # Assert -I / -O if not kwargs['input_file_grp']: raise ValueError('-I/--input-file-grp is required') diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index e0dd50272..2950af3e4 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -39,10 +39,7 @@ def run_processor( log_level=None, input_file_grp=None, output_file_grp=None, - show_resource=None, - list_resources=False, parameter=None, - parameter_override=None, working_dir=None, mets_server_url=None, instance_caching=False From 7966057f975a76db39bf10b1c38540860b7c179c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:48:54 +0200 Subject: [PATCH 133/249] =?UTF-8?q?Processor.metadata:=20refactor=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `metadata`, `executable`, `ocrd_tool`, `version`: use cached_property instead of internal ad-hoc attributes - rename `metadata_location` → `metadata_filename`, add cached_property chain `metadata_location`, `metadata_rawdict` used by `metadata` to make it easy to override - 
`metadata_filename` if just the path of `ocrd-tool.json` in the package deviates - `metadata_location` if the `ocrd-tool.json` is not distributed via Python pkg - `metadata_rawdict` if the `ocrd-tool.json` is not in a file - `metadata` if the validated, expanded `ocrd-tool.json` is somewhere else - `DummyProcessor`: just override `Processor.metadata_filename` - processor tests: adapt to new properties and `verify` enforcing cardinality --- src/ocrd/processor/base.py | 123 +++++++++++++----- src/ocrd/processor/builtin/dummy_processor.py | 10 +- tests/data/__init__.py | 12 +- tests/data/ocrd-cp.ocrd-tool.json | 7 +- .../test_integration_4_processing_worker.py | 5 +- 5 files changed, 107 insertions(+), 50 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 4f1e86b96..5329ea670 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -9,6 +9,7 @@ 'run_processor' ] +from functools import cached_property from os.path import exists, join from shutil import copyfileobj import json @@ -35,13 +36,12 @@ MIME_TO_EXT, config, getLogger, - initLogging, list_resource_candidates, pushd_popd, list_all_resources, get_processor_resource_types, resource_filename, - resource_string, + parse_json_file_with_comments, make_file_id, deprecation_warning ) @@ -96,12 +96,14 @@ def __init__(self, fileGrp, pageId, mimetype): class Processor(): """ - A processor is a tool that implements the uniform OCR-D command-line interface - for run-time data processing. That is, it executes a single workflow step, - or a combination of workflow steps, on the workspace (represented by local METS). - It reads input files for all or requested physical pages of the input fileGrp(s), - and writes output files for them into the output fileGrp(s). It may take - a number of optional or mandatory parameters. + A processor is a tool that implements the uniform OCR-D + `command-line interface for run-time data processing `_. + + That is, it executes a single workflow step, or a combination of workflow steps, + on the workspace (represented by local METS). It reads input files for all or selected + physical pages of the input fileGrp(s), computes additional annotation, and writes output + files for them into the output fileGrp(s). It may take a number of optional or mandatory + parameters. """ max_instances : int = -1 @@ -113,47 +115,96 @@ class Processor(): """ @property - def metadata_location(self) -> str: + def metadata_filename(self) -> str: """ - Location of `ocrd-tool.json` inside the package. By default we expect it in the root of the module + Relative location of the ``ocrd-tool.json`` file inside the package. + + Used by :py:data:`metadata_location`. + + (Override if ``ocrd-tool.json`` is not in the root of the module, + e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``). """ return 'ocrd-tool.json' - @property + @cached_property + def metadata_location(self) -> str: + """ + Absolute path of the ``ocrd-tool.json`` file as distributed with the package. + + Used by :py:data:`metadata_rawdict`. + + (Override if ``ocrd-tool.json`` is not distributed with the Python package.) + """ + return resource_filename(__package__.split('.')[0], self.metadata_filename) + + @cached_property + def metadata_rawdict(self) -> dict: + """ + Raw (unvalidated, unexpanded) ``ocrd-tool.json`` dict contents of the package. + + Used by :py:data:`metadata`. + + (Override if ``ocrd-tool.json`` is not in a file.) 
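
As a usage sketch of the override chain named in the commit message above, a
hypothetical subclass whose ocrd-tool.json is packaged under a different
relative path would only need to redefine the lowest link (path is made up):

    from ocrd import Processor

    class MyProcessor(Processor):
        @property
        def metadata_filename(self):
            # hypothetical package layout with the tool json under data/
            return 'data/ocrd-tool.json'
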
+ """ + return parse_json_file_with_comments(self.metadata_location) + + @cached_property def metadata(self) -> dict: - """the ocrd-tool.json dict of the package""" - if hasattr(self, '_metadata'): - return self._metadata - self._metadata = json.loads(resource_string(self.__module__.split('.')[0], self.metadata_location)) - report = OcrdToolValidator.validate(self._metadata) + """ + The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D + `spec `_ for processor tools. + + After deserialisation, it also gets validated against the + `schema `_ with all defaults + expanded. + + Used by :py:data:`ocrd_tool` and :py:data:`version`. + + (Override if you want to provide metadata programmatically instead of a + JSON file.) + """ + metadata = self.metadata_rawdict + report = OcrdToolValidator.validate(metadata) if not report.is_valid: self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n" - f"{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.") - return self._metadata + f"{report.to_xml()}.\nPlease open an issue at {metadata.get('git_url', 'the website')}.") + return metadata - @property + @cached_property def version(self) -> str: - """the version of the package""" - if hasattr(self, '_version'): - return self._version - self._version = self.metadata['version'] - return self._version + """ + The program version of the package. + Usually the ``version`` part of :py:data:`metadata`. - @property + (Override if you do not want to use :py:data:`metadata` lookup + mechanism.) + """ + return self.metadata['version'] + + @cached_property def executable(self) -> str: - """the executable name of this processor tool""" - if hasattr(self, '_executable'): - return self._executable - self._executable = os.path.basename(inspect.stack()[-1].filename) - return self._executable + """ + The executable name of this processor tool. Taken from the runtime + filename. - @property + Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`. + + (Override if your entry-point name deviates from the ``executable`` + name, or the processor gets instantiated from another runtime.) + """ + return os.path.basename(inspect.stack()[-1].filename) + + @cached_property def ocrd_tool(self) -> dict: - """the ocrd-tool.json dict of this processor tool""" - if hasattr(self, '_ocrd_tool'): - return self._ocrd_tool - self._ocrd_tool = self.metadata['tools'][self.executable] - return self._ocrd_tool + """ + The ``ocrd-tool.json`` dict contents of this processor tool. + Usually the :py:data:`executable` key of the ``tools`` part + of :py:data:`metadata`. + + (Override if you do not want to use :py:data:`metadata` lookup + mechanism.) 
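
And for the case of no JSON file at all, overriding the next link of the chain
suffices, as the adapted test fixture below also does (tool values here are
made up):

    from ocrd import Processor

    class InlineToolProcessor(Processor):
        @property
        def executable(self):
            return 'ocrd-mytool'  # hypothetical entry-point name
        @property
        def metadata_rawdict(self):
            return {'version': '0.0.1',
                    'tools': {self.executable: {
                        'executable': self.executable,
                        'description': 'does nothing',
                        'steps': ['preprocessing/optimization']}}}
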
+ """ + return self.metadata['tools'][self.executable] @property def parameter(self) -> Optional[dict]: diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 7b2f1b66e..9bba9bee8 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -20,8 +20,6 @@ ) from ocrd_modelfactory import page_from_file -OCRD_TOOL = parse_json_string_with_comments(resource_string(__package__ + '.dummy', 'ocrd-tool.json')) - class DummyProcessor(Processor): """ Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group @@ -76,17 +74,13 @@ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: super().process_page_file(input_file) @property - def metadata(self): - return OCRD_TOOL + def metadata_filename(self): + return 'processor/builtin/dummy/ocrd-tool.json' @property def executable(self): return 'ocrd-dummy' - @property - def version(self): - return '0.0.3' - @click.command() @ocrd_cli_options def cli(*args, **kwargs): diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 53fa227d0..c706546c5 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -9,6 +9,10 @@ 'executable': 'ocrd-test', 'description': 'dolor sit', 'steps': ['recognition/post-correction'], + # as we bypass Processor.metadata with OcrdToolValidator + # we get no default expansion, so add default cardinalities here + 'input_file_grp_cardinality': 1, + 'output_file_grp_cardinality': 1, 'parameters': { 'baz': { 'type': 'string', @@ -133,7 +137,11 @@ def process_page_file(self, input_file): class IncompleteProcessor(Processor): @property - def ocrd_tool(self): - return {} + def executable(self): + return 'ocrd-foo' + + @property + def metadata_rawdict(self): + return {'tools': {self.executable: {}}} diff --git a/tests/data/ocrd-cp.ocrd-tool.json b/tests/data/ocrd-cp.ocrd-tool.json index 728c144c5..948695c06 100755 --- a/tests/data/ocrd-cp.ocrd-tool.json +++ b/tests/data/ocrd-cp.ocrd-tool.json @@ -1,15 +1,18 @@ { - "version": "1.0", + "version": "1.0.0", "tools": { "ocrd-cp": { "executable": "ocrd-cp", "description": "dummy processor copying", "steps": ["preprocessing/optimization"], "categories": ["Image preprocessing"], + # we allow 1 or 2 input file grps + # the output cardinality gets expanded from default + "input_file_grp_cardinality": [1,2], "parameters": { "message": { "type": "string", - "default": "", + "default": "hello by default", "description": "message to print on stdout" } } diff --git a/tests/network/test_integration_4_processing_worker.py b/tests/network/test_integration_4_processing_worker.py index e211bd238..ae322b097 100644 --- a/tests/network/test_integration_4_processing_worker.py +++ b/tests/network/test_integration_4_processing_worker.py @@ -1,6 +1,6 @@ from pathlib import Path from pika import BasicProperties -from src.ocrd.processor.builtin.dummy_processor import DummyProcessor, OCRD_TOOL +from src.ocrd.processor.builtin.dummy_processor import DummyProcessor from src.ocrd_network.constants import JobState from src.ocrd_network.database import sync_db_create_workspace, sync_db_create_processing_job from src.ocrd_network.logging_utils import get_processing_job_logging_file_path @@ -25,12 +25,13 @@ def test_processing_worker_process_message(): # wrong reads from the deployed dummy worker (part of the processing server integration test) processor_name = "ocrd-dummy-test" result_queue_name = f"{processor_name}-result" + ocrd_tool = 
DummyProcessor(None).metadata processing_worker = ProcessingWorker( rabbitmq_addr=test_config.RABBITMQ_URL, mongodb_addr=test_config.DB_URL, processor_name=processor_name, - ocrd_tool=OCRD_TOOL, + ocrd_tool=ocrd_tool, processor_class=DummyProcessor ) processing_worker.connect_publisher(enable_acks=True) From bba142d3520b34e13cc200949a6222e2368ff4ea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:03:51 +0200 Subject: [PATCH 134/249] bashlib input-files: adapt, allow passing ocrd-tool.json path and executable name --- src/ocrd/cli/bashlib.py | 39 +++++++++++++++++++++++++++++++-------- src/ocrd/lib.bash | 2 ++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 26139cb48..6934744c8 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -82,6 +82,8 @@ def bashlib_constants(name): print(val) @bashlib_cli.command('input-files') +@click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None) +@click.option('--executable', help="name of processor executable in ocrd-tool.json", default=None) @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) @click.option('-w', '--working-dir', help="Working Directory") @click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None) @@ -96,7 +98,7 @@ def bashlib_constants(name): @parameter_option @parameter_override_option @ocrd_loglevel -def bashlib_input_files(**kwargs): +def bashlib_input_files(ocrd_tool, executable, **kwargs): """ List input files for processing @@ -108,12 +110,6 @@ def bashlib_input_files(**kwargs): (The printing format is one associative array initializer per line.) """ class BashlibProcessor(Processor): - @property - def ocrd_tool(self): - return {'executable': '', 'steps': ['']} - @property - def version(self): - return '1.0' # go half way of the normal run_processor / process_workspace call tree # by just delegating to process_workspace, overriding process_page_file # to ensure all input files exist locally (without persisting them in the METS) @@ -129,4 +125,31 @@ def process_page_file(self, *input_files): print(f"[{field}]='{value}'", end=' ') output_file_id = make_file_id(input_files[0], kwargs['output_file_grp']) print(f"[outputFileId]='{output_file_id}'") - ocrd_cli_wrap_processor(BashlibProcessor, **kwargs) + if ocrd_tool and executable: + class FullBashlibProcessor(BashlibProcessor): + @property + def metadata_location(self): + # needed for metadata loading and validation mechanism + return ocrd_tool + @property + def executable(self): + # needed for ocrd_tool lookup + return executable + else: + # we have no true metadata file, so fill in just to make it work + class FullBashlibProcessor(BashlibProcessor): + @property + def ocrd_tool(self): + # needed to satisfy the validator + return {'executable': '', + # required now + 'input_file_grp_cardinality': 1, + 'output_file_grp_cardinality': 1, + 'steps': [''] + } + @property + def version(self): + # needed to satisfy the validator and wrapper + return '1.0' + + ocrd_cli_wrap_processor(FullBashlibProcessor, **kwargs) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 82fa2005d..6b08f669d 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -299,6 +299,8 @@ ocrd__wrap () { eval "ocrd__files[$i]=ocrd__file$i" let ++i done < <(ocrd bashlib input-files \ + --ocrd-tool $OCRD_TOOL_JSON \ + --executable $OCRD_TOOL_NAME \ -m "${ocrd__argv[mets_file]}" \ -I 
"${ocrd__argv[input_file_grp]}" \ -O "${ocrd__argv[output_file_grp]}" \ From 32cdc5a7aa35d0c54d2120d47249201e90912f61 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:04:40 +0200 Subject: [PATCH 135/249] add to pylint karma --- src/ocrd/cli/__init__.py | 56 +++++---- src/ocrd/cli/bashlib.py | 11 +- src/ocrd/cli/ocrd_tool.py | 10 +- src/ocrd/cli/workspace.py | 9 +- src/ocrd/decorators/__init__.py | 5 +- src/ocrd/mets_server.py | 3 +- src/ocrd/processor/base.py | 8 +- src/ocrd/processor/builtin/dummy_processor.py | 5 +- src/ocrd/processor/helpers.py | 5 +- src/ocrd/resolver.py | 3 - src/ocrd/resource_manager.py | 8 +- src/ocrd/workspace.py | 9 +- src/ocrd/workspace_backup.py | 2 +- src/ocrd_modelfactory/__init__.py | 2 +- src/ocrd_models/constants.py | 1 - src/ocrd_models/ocrd_exif.py | 4 +- src/ocrd_models/ocrd_file.py | 4 +- src/ocrd_models/ocrd_mets.py | 40 +++---- src/ocrd_models/ocrd_page.py | 1 - src/ocrd_models/ocrd_xml_base.py | 4 +- src/ocrd_utils/config.py | 8 +- src/ocrd_utils/logging.py | 36 +++--- src/ocrd_utils/os.py | 9 +- src/ocrd_utils/str.py | 5 +- src/ocrd_validators/json_validator.py | 4 +- src/ocrd_validators/ocrd_tool_validator.py | 4 +- src/ocrd_validators/page_validator.py | 112 +++++++++--------- src/ocrd_validators/parameter_validator.py | 4 +- .../resource_list_validator.py | 7 +- src/ocrd_validators/workspace_validator.py | 42 +++---- src/ocrd_validators/xsd_validator.py | 2 +- tests/validator/test_workspace_validator.py | 2 +- 32 files changed, 202 insertions(+), 223 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 3722e3c21..6a752f2e3 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -10,6 +10,36 @@ from ocrd_utils import config +# pylint: disable=wrong-import-position + +def command_with_replaced_help(*replacements): + + class CommandWithReplacedHelp(click.Command): + def get_help(self, ctx): + newhelp = super().get_help(ctx) + for replacement in replacements: + newhelp = re.sub(*replacement, newhelp) + # print(newhelp) + return newhelp + + return CommandWithReplacedHelp + +# pylint: enable=wrong-import-position + +from ..decorators import ocrd_loglevel +from .ocrd_tool import ocrd_tool_cli +from .workspace import workspace_cli +from .process import process_cli +from .bashlib import bashlib_cli +from .validate import validate_cli +from .resmgr import resmgr_cli +from .zip import zip_cli +from .log import log_cli +from .network import network_cli + + +__all__ = ['cli'] + _epilog = f""" \b @@ -60,30 +90,6 @@ {config.describe('OCRD_LOGGING_DEBUG')} """ -def command_with_replaced_help(*replacements): - - class CommandWithReplacedHelp(click.Command): - def get_help(self, ctx): - help = super().get_help(ctx) - for replacement in replacements: - help = re.sub(*replacement, help) - # print(help) - return help - - return CommandWithReplacedHelp - - -from ..decorators import ocrd_loglevel -from .ocrd_tool import ocrd_tool_cli -from .workspace import workspace_cli -from .process import process_cli -from .bashlib import bashlib_cli -from .validate import validate_cli -from .resmgr import resmgr_cli -from .zip import zip_cli -from .log import log_cli -from .network import network_cli - @click.group(epilog=_epilog) @click.version_option(package_name='ocrd') @ocrd_loglevel @@ -101,5 +107,3 @@ def cli(**kwargs): # pylint: disable=unused-argument cli.add_command(log_cli) cli.add_command(resmgr_cli) cli.add_command(network_cli) - -__all__ = ['cli'] diff --git a/src/ocrd/cli/bashlib.py 
b/src/ocrd/cli/bashlib.py index 6934744c8..d46c81ee4 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -8,7 +8,6 @@ """ from __future__ import print_function import sys -from os.path import isfile import click from ocrd.constants import BASHLIB_FILENAME @@ -23,15 +22,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor ) -from ocrd_utils import ( - is_local_filename, - get_local_filename, - initLogging, - getLogger, - make_file_id, - config -) -from ocrd.resolver import Resolver +from ocrd_utils import make_file_id from ocrd.processor import Processor # ---------------------------------------------------------------------- diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index 929fe47cc..f63a7235a 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -17,7 +17,6 @@ from ocrd.processor import Processor from ocrd_utils import ( set_json_key_value_overrides, - VERSION as OCRD_VERSION, parse_json_string_or_file, parse_json_string_with_comments as loads ) @@ -30,22 +29,23 @@ def __init__(self, filename): with codecs.open(filename, encoding='utf-8') as f: self.content = f.read() self.json = loads(self.content) + self.tool_name = '' class BashProcessor(Processor): @property - def metadata(inner_self): + def metadata(inner_self): # pylint: disable=no-self-argument,arguments-renamed return self.json @property - def executable(inner_self): + def executable(inner_self): # pylint: disable=no-self-argument,arguments-renamed return self.tool_name @property - def moduledir(inner_self): + def moduledir(inner_self): # pylint: disable=no-self-argument,arguments-renamed return os.path.dirname(self.filename) # set docstrings to empty __doc__ = None # HACK: override the module-level docstring, too getmodule(OcrdToolCtx).__doc__ = None - def process(inner_self): + def process(inner_self): # pylint: disable=no-self-argument,arguments-renamed return super() self.processor = BashProcessor diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 1461e53e0..3aece3493 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -6,7 +6,7 @@ :nested: full """ import os -from os import getcwd, rmdir, unlink +from os import rmdir, unlink from os.path import dirname, relpath, normpath, exists, join, isabs, isdir from pathlib import Path from json import loads, dumps @@ -14,7 +14,6 @@ from glob import glob # XXX pathlib.Path.glob does not support absolute globs import re import time -import numpy as np import click @@ -455,7 +454,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"} output_field = [snake_to_camel.get(x, x) for x in output_field] modified_mets = False - ret = list() + ret = [] workspace = Workspace( ctx.resolver, directory=ctx.directory, @@ -751,7 +750,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin @workspace_cli.command('update-page') @click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. 
possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True) -@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER') +@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER') @click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @click.argument('PAGE_ID') @@ -760,7 +759,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): """ Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID """ - update_kwargs = {k: v for k, v in attr_value_pairs} + update_kwargs = dict(attr_value_pairs) if order: update_kwargs['ORDER'] = order if orderlabel: diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index b0b1cad04..f52a13575 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -106,7 +106,7 @@ def resolve(name): kwargs['parameter'] = parse_json_string_or_file(*kwargs['parameter'], resolve_preset_file=resolve) else: - kwargs['parameter'] = dict() + kwargs['parameter'] = {} # Merge parameter overrides and parameters if 'parameter_override' in kwargs: set_json_key_value_overrides(kwargs['parameter'], *kwargs.pop('parameter_override')) @@ -146,8 +146,7 @@ def goexit(): pr.disable() print("Profiling completed") if profile_file: - with open(profile_file, 'wb') as f: - pr.dump_stats(profile_file) + pr.dump_stats(profile_file) s = io.StringIO() pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() print(s.getvalue()) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 7c22da278..81f9e15d0 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -21,7 +21,7 @@ import uvicorn from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdAgent, ClientSideOcrdAgent -from ocrd_utils import getLogger, deprecated_alias +from ocrd_utils import getLogger # @@ -403,7 +403,6 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True) - return def shutdown(self): if self.is_uds: diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5329ea670..dacf9b072 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -22,7 +22,6 @@ import tarfile import io import weakref -from warnings import warn from frozendict import frozendict from deprecated import deprecated from requests import HTTPError @@ -350,14 +349,12 @@ def dump_json(self): Print :py:attr:`ocrd_tool` on stdout. """ print(json.dumps(self.ocrd_tool, indent=True)) - return def dump_module_dir(self): """ Print :py:attr:`moduledir` on stdout. """ print(self.moduledir) - return def list_resources(self): """ @@ -365,7 +362,6 @@ def list_resources(self): """ for res in self.list_all_resources(): print(res) - return def setup(self) -> None: """ @@ -756,7 +752,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): # can actually be much more costly than traversing the ltree. # This might depend on the number of pages vs number of fileGrps. 
- pages = dict() + pages = {} for i, ifg in enumerate(ifgs): files_ = sorted(self.workspace.mets.find_all_files( pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), @@ -811,7 +807,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): if self.page_id and not any(pages): self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n" f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - ifts = list() + ifts = [] for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 9bba9bee8..a5f217a15 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,6 +1,6 @@ # pylint: disable=missing-module-docstring,invalid-name -from os.path import join, basename -from typing import Optional, Union +from os.path import join +from typing import Optional import click @@ -10,7 +10,6 @@ from ocrd_models.ocrd_file import OcrdFileType from ocrd_models.ocrd_page import OcrdPage, to_xml from ocrd_utils import ( - getLogger, make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 2950af3e4..6483790bd 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -1,7 +1,6 @@ """ Helper methods for running and documenting processors """ -from os import chdir, getcwd from time import perf_counter, process_time from functools import lru_cache import json @@ -99,7 +98,7 @@ def run_processor( t0_cpu = process_time() if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']): backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' - from memory_profiler import memory_usage + from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel try: mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}), # only run process once @@ -209,7 +208,7 @@ def run_cli( if not log_filename: result = run(args, check=False) else: - with open(log_filename, 'a') as file_desc: + with open(log_filename, 'a', encoding='utf-8') as file_desc: result = run(args, check=False, stdout=file_desc, stderr=file_desc) return result.returncode diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index 124d00692..7ed58d4d4 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -18,7 +18,6 @@ ) from ocrd.workspace import Workspace from ocrd_models import OcrdMets -from ocrd_models.constants import NAMESPACES as NS from ocrd_models.utils import handle_oai_response class Resolver(): @@ -310,5 +309,3 @@ def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory)) return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url - - diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index e63c5fd01..da1ee4833 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -1,6 +1,6 @@ from pathlib import Path from os.path import join -from os import environ, listdir, makedirs, getcwd, path, unlink +from os import environ, listdir, getcwd, unlink from shutil import copytree, rmtree, copy from fnmatch import filter as apply_glob from datetime import datetime @@ -16,11 +16,11 @@ # https://github.com/OCR-D/core/issues/867 # 
https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml import yaml.constructor -yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \ - yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str'] +yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \ + yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str'] from ocrd_validators import OcrdResourceListValidator -from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config +from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index bd3380652..27c56f048 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1,7 +1,7 @@ import io from os import makedirs, unlink, listdir, path from pathlib import Path -from shutil import move, copyfileobj +from shutil import copyfileobj from re import sub from tempfile import NamedTemporaryFile from contextlib import contextmanager @@ -43,7 +43,6 @@ MIME_TO_PIL, MIMETYPE_PAGE, REGEX_PREFIX, - config ) from .workspace_backup import WorkspaceBackupManager @@ -111,7 +110,7 @@ def __init__( def __repr__(self): return 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]' % ( - not not self.is_remote, + self.is_remote, self.directory, self.baseurl, self.mets.file_groups, @@ -648,7 +647,7 @@ def image_from_page(self, page, page_id, log = getLogger('ocrd.workspace.image_from_page') page_image_info = self.resolve_image_exif(page.imageFilename) page_image = self._resolve_image_as_pil(page.imageFilename) - page_coords = dict() + page_coords = {} # use identity as initial affine coordinate transform: page_coords['transform'] = np.eye(3) # interim bbox (updated with each change to the transform): @@ -1091,7 +1090,7 @@ def save_image_file(self, image : Image.Image, The (absolute) path of the created file. 
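
A hedged usage sketch of the DPI propagation handled here (assuming an already
initialized Workspace ``ws``; the call signature is taken from this file,
everything else is made up):

    from PIL import Image

    img = Image.new('L', (200, 100))
    img.info['dpi'] = (300, 300)  # picked up into saveargs on save
    path = ws.save_image_file(img, 'OCR-D-IMG-BIN_0001', 'OCR-D-IMG-BIN',
                              page_id='phys_0001', mimetype='image/png')
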
""" log = getLogger('ocrd.workspace.save_image_file') - saveargs = dict() + saveargs = {} if 'dpi' in image.info: saveargs['dpi'] = image.info['dpi'] image_bytes = io.BytesIO() diff --git a/src/ocrd/workspace_backup.py b/src/ocrd/workspace_backup.py index 6cc3f1530..87ee884bd 100644 --- a/src/ocrd/workspace_backup.py +++ b/src/ocrd/workspace_backup.py @@ -1,6 +1,6 @@ from datetime import datetime from os import makedirs -from os.path import join, basename, getsize, abspath +from os.path import join, basename, getsize from glob import glob from shutil import copy import hashlib diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index c0600e51f..828949fe9 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -72,7 +72,7 @@ def page_from_image(input_file : Union[OcrdFile, ClientSideOcrdFile], **kwargs) ), pcGtsId=input_file.ID ) - mapping = dict() + mapping = {} etree : ET._Element = pcgts.to_etree(mapping_=mapping) revmap = dict(((node, element) for element, node in mapping.items())) return OcrdPage(pcgts, etree, mapping, revmap) diff --git a/src/ocrd_models/constants.py b/src/ocrd_models/constants.py index db6e51e3a..a67bfecc1 100644 --- a/src/ocrd_models/constants.py +++ b/src/ocrd_models/constants.py @@ -44,7 +44,6 @@ 'ocrd': 'https://ocr-d.de', } -# pylint: disable=bad-whitespace TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets'] TAG_METS_DIV = '{%s}div' % NAMESPACES['mets'] TAG_METS_FILE = '{%s}file' % NAMESPACES['mets'] diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 82b8b7e1c..ab050bae5 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -102,7 +102,7 @@ def to_xml(self): Serialize all properties as XML string. """ ret = '' - for k in self.__dict__: - ret += '<%s>%s' % (k, self.__dict__[k], k) + for k, v in self.__dict__.items(): + ret += f'<{k}>{v}' ret += '' return ret diff --git a/src/ocrd_models/ocrd_file.py b/src/ocrd_models/ocrd_file.py index a11634171..91eac8d8e 100644 --- a/src/ocrd_models/ocrd_file.py +++ b/src/ocrd_models/ocrd_file.py @@ -230,12 +230,12 @@ class ClientSideOcrdFile: def __init__( self, - el, + el, # pylint: disable=unused-argument mimetype: str = '', pageId: str = '', loctype: str ='OTHER', local_filename: Optional[str] = None, - mets : Any = None, + mets : Any = None, # pylint: disable=unused-argument url: str = '', ID: str = '', fileGrp: str = '' diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 90d37b37d..c3fb11f60 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -75,7 +75,7 @@ def empty_mets(now : Optional[str] = None, cache_flag : bool = False): def __init__(self, **kwargs) -> None: """ """ - super(OcrdMets, self).__init__(**kwargs) + super().__init__(**kwargs) # XXX If the environment variable OCRD_METS_CACHING is set to "true", # then enable caching, if "false", disable caching, overriding the @@ -488,11 +488,12 @@ def add_file(self, fileGrp : str, mimetype : Optional[str] = None, url : Optiona f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") # To get rid of Python's FutureWarning - checking if v is not None - kwargs = {k: v for k, v in locals().items() if - k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} + kwargs = {k: v for k, v in locals().items() + if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} # This separation is needed to reuse the same el_mets_file element in the 
caching if block el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) # The caching of the physical page is done in the OcrdFile constructor + # (which calls us back with set_physical_page_for_file) mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) if self._cache_flag: @@ -542,9 +543,9 @@ def remove_one_file(self, ID : Union[str, OcrdFile], fileGrp : str = None) -> Oc # Delete the physical page ref fptrs = [] if self._cache_flag: - for page in self._fptr_cache.keys(): - if ID in self._fptr_cache[page]: - fptrs.append(self._fptr_cache[page][ID]) + for pageId, fptrdict in self._fptr_cache.items(): + if ID in fptrdict: + fptrs.append(fptrdict[ID]) else: fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) @@ -700,8 +701,8 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright ret = [None] * len(for_fileIds) if self._cache_flag: - for pageId in self._fptr_cache.keys(): - for fptr in self._fptr_cache[pageId].keys(): + for pageId, fptrdict in self._fptr_cache.items(): + for fptr in fptrdict: if fptr in for_fileIds: index = for_fileIds.index(fptr) if return_divs: @@ -737,10 +738,10 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, # delete any existing page mapping for this file.ID fptrs = [] if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[page_id].keys(): - if self._fptr_cache[page_id][ocrd_file.ID] is not None: - fptrs.append(self._fptr_cache[page_id][ocrd_file.ID]) + for page, fptrdict in self._fptr_cache.items(): + if ocrd_file.ID in fptrdict: + if fptrdict[ocrd_file.ID] is not None: + fptrs.append(fptrdict[ocrd_file.ID]) else: fptrs = self._tree.getroot().findall( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % @@ -791,7 +792,7 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, self._fptr_cache[pageId].update({ocrd_file.ID: el_fptr}) def update_physical_page_attributes(self, page_id : str, **kwargs) -> None: - invalid_keys = list(k for k in kwargs.keys() if k not in METS_PAGE_DIV_ATTRIBUTE.names()) + invalid_keys = list(k for k in kwargs if k not in METS_PAGE_DIV_ATTRIBUTE.names()) if invalid_keys: raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}") @@ -812,8 +813,8 @@ def get_physical_page_for_file(self, ocrd_file : OcrdFile) -> Optional[str]: corresponding to the ``mets:file`` :py:attr:`ocrd_file`. """ if self._cache_flag: - for pageId in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[pageId].keys(): + for pageId, fptrdict in self._fptr_cache.items(): + if ocrd_file.ID in fptrdict: return pageId else: ret = self._tree.getroot().find( @@ -828,7 +829,7 @@ def remove_physical_page(self, ID : str) -> None: """ mets_div = None if self._cache_flag: - if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys(): + if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]: mets_div = [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][ID]] else: mets_div = self._tree.getroot().xpath( @@ -857,9 +858,9 @@ def remove_physical_page_fptr(self, fileId : str) -> List[str]: # If that's the case then we do not need to iterate 2 loops, just one. 
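
The recurring pylint fix in this file (iterating ``items()`` instead of
``keys()`` plus lookup), reduced to a toy example outside the patch:

    fptr_cache = {'phys_0001': {'FILE_0001': '<mets:fptr element>'}}
    # before: for page_id in fptr_cache.keys(): ... fptr_cache[page_id][file_id]
    for page_id, fptrdict in fptr_cache.items():
        if 'FILE_0001' in fptrdict:
            print(page_id, fptrdict['FILE_0001'])
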
mets_fptrs = [] if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if fileId in self._fptr_cache[page_id].keys(): - mets_fptrs.append(self._fptr_cache[page_id][fileId]) + for pageId, fptrdict in self._fptr_cache.items(): + if fileId in fptrdict: + mets_fptrs.append(fptrdict[fileId]) else: mets_fptrs = self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, @@ -919,4 +920,3 @@ def merge(self, other_mets, force : bool = False, # FIXME: merge structMap logical and structLink as well if after_add_cb: after_add_cb(f_dest) - diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 6accb9241..3f0cc690f 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -3,7 +3,6 @@ """ from io import StringIO from typing import Dict, Union -from inspect import getmembers from lxml import etree as ET __all__ = [ diff --git a/src/ocrd_models/ocrd_xml_base.py b/src/ocrd_models/ocrd_xml_base.py index 8579a5b40..ea4798c5b 100644 --- a/src/ocrd_models/ocrd_xml_base.py +++ b/src/ocrd_models/ocrd_xml_base.py @@ -8,8 +8,8 @@ from .utils import xmllint_format -for curie in NAMESPACES: - ET.register_namespace(curie, NAMESPACES[curie]) +for curie, url in NAMESPACES.items(): + ET.register_namespace(curie, url) class OcrdXmlDocument(): """ diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 851fb42a8..f33af0264 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -12,8 +12,12 @@ from tempfile import gettempdir from textwrap import fill, indent -_validator_boolean = lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') -_parser_boolean = lambda val: bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') + +def _validator_boolean(val): + return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') + +def _parser_boolean(val): + return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') class OcrdEnvVariable(): diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index bb771fc0c..5cea55e5b 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -5,9 +5,9 @@ Logging can be overridden either programmatically in code using the library or by creating one or more of -- /etc/ocrd_logging.py -- $HOME/ocrd_logging.py -- $PWD/ocrd_logging.py +- ``/etc/ocrd_logging.py`` +- ``$HOME/ocrd_logging.py`` +- ``$PWD/ocrd_logging.py`` These files will be executed in the context of ocrd/ocrd_logging.py, with `logging` global set. @@ -16,20 +16,18 @@ - Try to be less intrusive with OCR-D specific logging conventions to make it easier and less surprising to define logging behavior when using OCR-D/core as a library - - Change setOverrideLogLevel to only override the log level of the ``ocrd`` + - Change :py:meth:`setOverrideLogLevel` to only override the log level of the ``ocrd`` logger and its descendants - - initLogging will set exactly one handler, for the root logger or for the + - :py:meth:`initLogging` will set exactly one handler, for the root logger or for the ``ocrd`` logger. 
- Child loggers should propagate to the ancestor logging (default - behavior of the logging library - no more PropagationShyLogger) - - disableLogging only removes any handlers from the ``ocrd`` logger + behavior of the logging library - no more ``PropagationShyLogger``) + - :py:meth:`disableLogging` only removes any handlers from the ``ocrd`` logger """ # pylint: disable=no-member from __future__ import absolute_import -from traceback import format_stack - import logging import logging.config from pathlib import Path @@ -81,10 +79,10 @@ def tf_disable_interactive_logs(): try: - from os import environ + from os import environ # pylint: disable=import-outside-toplevel # This env variable must be set before importing from Keras environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - from tensorflow.keras.utils import disable_interactive_logging + from tensorflow.keras.utils import disable_interactive_logging # pylint: disable=import-outside-toplevel # Enabled interactive logging throws an exception # due to a call of sys.stdout.flush() disable_interactive_logging() @@ -143,21 +141,21 @@ def get_logging_config_files(): def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG): """ - Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig + Reset ``ocrd`` logger, read logging configuration if exists, otherwise use :py:meth:`logging.basicConfig` - initLogging is to be called by OCR-D/core once, i.e. + This is to be called by OCR-D/core only once, i.e. - for the ``ocrd`` CLI - for the processor wrapper methods Other processes that use OCR-D/core as a library can, but do not have to, use this functionality. Keyword Args: - - builtin_only (bool, False): Whether to search for logging configuration - on-disk (``False``) or only use the - hard-coded config (``True``). For testing - - force_reinit (bool, False): Whether to ignore the module-level - ``_initialized_flag``. For testing only. - - silent (bool, True): Whether to log logging behavior by printing to stderr + - builtin_only (bool): Whether to search for logging configuration + on-disk (``False``) or only use the hard-coded config (``True``). + For testing + - force_reinit (bool): Whether to ignore the module-level ``_initialized_flag``. 
+ For testing only + - silent (bool): Whether to log logging behavior by printing to stderr """ global _initialized_flag if _initialized_flag and not force_reinit: diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 18463de0c..70721acbe 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -71,9 +71,8 @@ def unzip_file_to_dir(path_to_zip, output_directory): """ Extract a ZIP archive to a directory """ - z = ZipFile(path_to_zip, 'r') - z.extractall(output_directory) - z.close() + with ZipFile(path_to_zip, 'r') as z: + z.extractall(output_directory) @lru_cache() def get_ocrd_tool_json(executable): @@ -87,7 +86,7 @@ def get_ocrd_tool_json(executable): ocrd_tool = ocrd_all_tool[executable] except (JSONDecodeError, OSError, KeyError): try: - ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) + ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE, check=False).stdout) except (JSONDecodeError, OSError) as e: getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') if 'resource_locations' not in ocrd_tool: @@ -102,7 +101,7 @@ def get_moduledir(executable): moduledir = ocrd_all_moduledir[executable] except (JSONDecodeError, OSError, KeyError): try: - moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') + moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE, check=False).stdout.rstrip('\n') except (JSONDecodeError, OSError) as e: getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') return moduledir diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 4f1e08805..6a973fac7 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -4,9 +4,9 @@ import re import json -from typing import List, Union +from typing import List from .constants import REGEX_FILE_ID, SPARKLINE_CHARS -from .deprecate import deprecation_warning +#from .deprecate import deprecation_warning from deprecated import deprecated from warnings import warn from numpy import array_split @@ -273,4 +273,3 @@ def sparkline(values : List[int]) -> str: # normalize to 0..1 and convert to index in SPARKLINE_CHARS mapped = [int(x / max_value * max_mapping) for x in values] return ''.join(SPARKLINE_CHARS[x] for x in mapped) - diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index 4fb84b3fd..f21a23afe 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -2,7 +2,6 @@ Validating JSON-Schema """ import json -from warnings import warn from jsonschema import Draft201909Validator, ValidationError, validators # pylint: disable=import-error @@ -28,8 +27,7 @@ def set_defaults_and_handle_deprecate(validator, properties, instance, schema): if subschema.get('deprecated', False) and instance.get(prop): yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.") - for error in validate_properties(validator, properties, instance, schema): - yield error + yield from validate_properties(validator, properties, instance, schema) return validators.extend(validator_class, {"properties": set_defaults_and_handle_deprecate}) diff --git a/src/ocrd_validators/ocrd_tool_validator.py b/src/ocrd_validators/ocrd_tool_validator.py index 827001ef7..00a402c12 100644 --- a/src/ocrd_validators/ocrd_tool_validator.py +++ b/src/ocrd_validators/ocrd_tool_validator.py @@ -22,5 +22,5 @@ def validate(obj, 
schema=OCRD_TOOL_SCHEMA): """ return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access - def __init__(self, schema, validator_class=...): - super().__init__(schema, DefaultValidatingDraft20199Validator) + def __init__(self, schema): + super().__init__(schema, validator_class=DefaultValidatingDraft20199Validator) diff --git a/src/ocrd_validators/page_validator.py b/src/ocrd_validators/page_validator.py index d6d8a95b5..0459f1781 100644 --- a/src/ocrd_validators/page_validator.py +++ b/src/ocrd_validators/page_validator.py @@ -34,50 +34,50 @@ _HIERARCHY = [ # page can contain different types of regions - (PageType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MapRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_TableRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_TextRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace + (PageType, 'get_AdvertRegion', None), + (PageType, 'get_ChartRegion', None), + (PageType, 'get_ChemRegion', None), + (PageType, 'get_CustomRegion', None), + (PageType, 'get_GraphicRegion', None), + (PageType, 'get_ImageRegion', None), + (PageType, 'get_LineDrawingRegion', None), + (PageType, 'get_MapRegion', None), + (PageType, 'get_MathsRegion', None), + (PageType, 'get_MusicRegion', None), + (PageType, 'get_NoiseRegion', None), + (PageType, 'get_SeparatorRegion', None), + (PageType, 'get_TableRegion', None), + (PageType, 'get_TextRegion', None), + (PageType, 'get_UnknownRegion', None), # all regions can be recursive - (RegionType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace - #(RegionType, 'get_MapRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_TableRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_TextRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace + (RegionType, 'get_AdvertRegion', None), + (RegionType, 'get_ChartRegion', 
None), + (RegionType, 'get_ChemRegion', None), + (RegionType, 'get_CustomRegion', None), + (RegionType, 'get_GraphicRegion', None), + (RegionType, 'get_ImageRegion', None), + (RegionType, 'get_LineDrawingRegion', None), + #(RegionType, 'get_MapRegion', None), + (RegionType, 'get_MathsRegion', None), + (RegionType, 'get_MusicRegion', None), + (RegionType, 'get_NoiseRegion', None), + (RegionType, 'get_SeparatorRegion', None), + (RegionType, 'get_TableRegion', None), + (RegionType, 'get_TextRegion', None), + (RegionType, 'get_UnknownRegion', None), # only TextRegion can contain TextLine - (TextRegionType, 'get_TextLine', '\n'), # pylint: disable=bad-whitespace - (TextLineType, 'get_Word', ' '), # pylint: disable=bad-whitespace - (WordType, 'get_Glyph', ''), # pylint: disable=bad-whitespace - (GlyphType, None, None), # pylint: disable=bad-whitespace + (TextRegionType, 'get_TextLine', '\n'), + (TextLineType, 'get_Word', ' '), + (WordType, 'get_Glyph', ''), + (GlyphType, None, None), ] _ORDER = [ (None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT), - (PageType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace - (TextRegionType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace - (TextLineType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace - (WordType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace + (PageType, 'get_textLineOrder', 'get_readingDirection'), + (TextRegionType, 'get_textLineOrder', 'get_readingDirection'), + (TextLineType, None, 'get_readingDirection'), + (WordType, None, 'get_readingDirection'), ] # The following parameters control how tolerant we are with respect to @@ -115,9 +115,9 @@ def __init__(self, tag, ID, file_id, actual, expected): self.file_id = file_id self.actual = actual self.expected = expected - super(ConsistencyError, self).__init__( - "INCONSISTENCY in %s ID '%s' of file '%s': text results '%s' != concatenated '%s'" % ( - tag, ID, file_id, actual, expected)) + super().__init__( + f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': " + f"text results '{actual}' != concatenated '{expected}'") class CoordinateConsistencyError(Exception): """ @@ -141,9 +141,9 @@ def __init__(self, tag, ID, file_id, outer, inner): self.file_id = file_id self.outer = outer self.inner = inner - super(CoordinateConsistencyError, self).__init__( - "INCONSISTENCY in %s ID '%s' of '%s': coords '%s' not within parent coords '%s'" % ( - tag, ID, file_id, inner, outer)) + super().__init__( + f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': " + f"coords '{inner}' not within parent coords '{outer}'") class CoordinateValidityError(Exception): """ @@ -166,9 +166,8 @@ def __init__(self, tag, ID, file_id, points, reason='unknown'): self.ID = ID self.file_id = file_id self.points = points - super(CoordinateValidityError, self).__init__( - "INVALIDITY in %s ID '%s' of '%s': coords '%s' - %s" % ( - tag, ID, file_id, points, reason)) + super().__init__( + f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}") def compare_without_whitespace(a, b): """ @@ -177,13 +176,14 @@ def compare_without_whitespace(a, b): return re.sub('\\s+', '', a) == re.sub('\\s+', '', b) def page_get_reading_order(ro, rogroup): - """Add all elements from the given reading order group to the given dictionary. - + """ + Add all elements from the given reading order group to the given dictionary. 
+ Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. """ - regionrefs = list() + regionrefs = [] if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): regionrefs = (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + @@ -241,12 +241,12 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate node_id = node.get_pcGtsId() node = node.get_Page() # has no .id if not readingOrder: - readingOrder = dict() + readingOrder = {} ro = node.get_ReadingOrder() if ro: page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup()) if not joinRelations: - joinRelations = list() + joinRelations = [] relations = node.get_Relations() # get RelationsType if relations: relations = relations.get_Relation() # get list of RelationType @@ -358,7 +358,7 @@ def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None): if not nodes: return '' if not joins: - joins = list() + joins = [] result = get_text(nodes[0], page_textequiv_strategy) for node, next_node in zip(nodes, nodes[1:]): if (node.id, next_node.id) not in joins: @@ -470,11 +470,11 @@ def validate(filename=None, ocrd_page=None, ocrd_file=None, page = parse(filename, silence=True) file_id = filename else: - raise Exception("At least one of ocrd_page, ocrd_file or filename must be set") + raise ValueError("At least one of ocrd_page, ocrd_file or filename must be set") if page_textequiv_strategy not in ('first'): - raise Exception("page_textequiv_strategy %s not implemented" % page_textequiv_strategy) + raise ValueError("page_textequiv_strategy %s not implemented" % page_textequiv_strategy) if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'): - raise Exception("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency) + raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency) report = ValidationReport() log.info("Validating input file '%s'", file_id) validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, report, file_id) diff --git a/src/ocrd_validators/parameter_validator.py b/src/ocrd_validators/parameter_validator.py index 26364f70f..ca2a7ed8e 100644 --- a/src/ocrd_validators/parameter_validator.py +++ b/src/ocrd_validators/parameter_validator.py @@ -20,7 +20,7 @@ def validate(self, *args, **kwargs): # pylint: disable=arguments-differ obj (dict): schema (dict): """ - return super(ParameterValidator, self)._validate(*args, **kwargs) + return super()._validate(*args, **kwargs) def __init__(self, ocrd_tool): """ @@ -40,7 +40,7 @@ def __init__(self, ocrd_tool): if p[n]['required']: required.append(n) del(p[n]['required']) - super(ParameterValidator, self).__init__({ + super().__init__({ "type": "object", "required": required, "additionalProperties": False, diff --git a/src/ocrd_validators/resource_list_validator.py b/src/ocrd_validators/resource_list_validator.py index d1a77b59b..47f3c81a9 100644 --- a/src/ocrd_validators/resource_list_validator.py +++ b/src/ocrd_validators/resource_list_validator.py @@ -16,9 +16,10 @@ class OcrdResourceListValidator(JsonValidator): """ @staticmethod - def validate(obj, schema=RESOURCE_LIST_SCHEMA): + def validate(obj, schema=None): """ Validate against ``resource_list.schema.yml`` schema. 
""" - return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) - + if schema is None: + schema = RESOURCE_LIST_SCHEMA + return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) # pylint: disable=protected-access diff --git a/src/ocrd_validators/workspace_validator.py b/src/ocrd_validators/workspace_validator.py index d5be46099..28d45495e 100644 --- a/src/ocrd_validators/workspace_validator.py +++ b/src/ocrd_validators/workspace_validator.py @@ -103,7 +103,7 @@ def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False, 'page_xsd'] if check not in self.skip] - self.find_kwargs = dict(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp) + self.find_kwargs = {"include_fileGrp": include_fileGrp, "exclude_fileGrp": exclude_fileGrp} self.src_dir = src_dir self.workspace = None self.mets = None @@ -139,7 +139,7 @@ def _validate(self): self._resolve_workspace() except Exception as e: # pylint: disable=broad-except self.log.warning("Failed to instantiate workspace: %s", e) - self.report.add_error("Failed to instantiate workspace: %s" % e) + self.report.add_error(f"Failed to instantiate workspace: {e}") return self.report with pushd_popd(self.workspace.directory): try: @@ -158,7 +158,7 @@ def _validate(self): if self.page_checks: self._validate_page() except Exception: # pylint: disable=broad-except - self.report.add_error("Validation aborted with exception: %s" % format_exc()) + self.report.add_error(f"Validation aborted with exception: {format_exc()}") return self.report def _resolve_workspace(self): @@ -193,9 +193,9 @@ def _validate_imagefilename(self): page = page_from_file(f).get_Page() imageFilename = page.imageFilename if not self.mets.find_files(url=imageFilename, **self.find_kwargs): - self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename)) + self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS") if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename)) + self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file") def _validate_dimension(self): """ @@ -210,9 +210,9 @@ def _validate_dimension(self): page = page_from_file(f).get_Page() _, _, exif = self.workspace.image_from_page(page, f.pageId) if page.imageHeight != exif.height: - self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height)) + self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})") if page.imageWidth != exif.width: - self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width)) + self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})") def _validate_multipage(self): """ @@ -229,9 +229,9 @@ def _validate_multipage(self): try: exif = self.workspace.resolve_image_exif(f.local_filename) if exif.n_frames > 1: - self.report.add_error("Image %s: More than 1 frame: %s" % (f.ID, exif.n_frames)) + self.report.add_error(f"Image '{f.ID}': More than 1 frame: {exif.n_frames}") except FileNotFoundError: - self.report.add_error("Image %s: Could not retrieve %s (local_filename=%s, url=%s)" % (f.ID, 
f.local_filename, f.url)) + self.report.add_error(f"Image '{f.ID}': Could not retrieve (local_filename='{f.local_filename}', url='{f.url}')") return def _validate_pixel_density(self): @@ -250,7 +250,7 @@ def _validate_pixel_density(self): for k in ['xResolution', 'yResolution']: v = exif.__dict__.get(k) if v is None or v <= 72: - self.report.add_notice("Image %s: %s (%s pixels per %s) is suspiciously low" % (f.ID, k, v, exif.resolutionUnit)) + self.report.add_notice(f"Image '{f.ID}': {k} ({v} pixels per {exif.resolutionUnit}) is suspiciously low") def _validate_mets_file_group_names(self): """ @@ -261,7 +261,7 @@ def _validate_mets_file_group_names(self): self.log.debug('_validate_mets_file_group_names') for fileGrp in self.mets.file_groups: if not fileGrp.startswith(FILE_GROUP_PREFIX): - self.report.add_notice("fileGrp USE does not begin with '%s': %s" % (FILE_GROUP_PREFIX, fileGrp)) + self.report.add_notice(f"fileGrp USE '{fileGrp}' does not begin with '{FILE_GROUP_PREFIX}'") else: # OCR-D-FOO-BAR -> ('FOO', 'BAR') # \____/\_/ \_/ @@ -273,9 +273,9 @@ def _validate_mets_file_group_names(self): if '-' in category: category, name = category.split('-', 1) if category not in FILE_GROUP_CATEGORIES: - self.report.add_notice("Unspecified USE category '%s' in fileGrp '%s'" % (category, fileGrp)) + self.report.add_notice(f"Unspecified USE category '{category}' in fileGrp '{fileGrp}'") if name is not None and not re.match(r'^[A-Z0-9-]{3,}$', name): - self.report.add_notice("Invalid USE name '%s' in fileGrp '%s'" % (name, fileGrp)) + self.report.add_notice(f"Invalid USE name '{name}' in fileGrp '{fileGrp}'") def _validate_mets_files(self): """ @@ -288,16 +288,16 @@ def _validate_mets_files(self): self.report.add_error("No files") for f in self.mets.find_files(**self.find_kwargs): if f._el.get('GROUPID'): # pylint: disable=protected-access - self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID) + self.report.add_notice(f"File '{f.ID}' has GROUPID attribute - document might need an update") if not (f.url or f.local_filename): - self.report.add_error("File '%s' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href" % f.ID) + self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href") continue if f.url and 'url' not in self.skip: if re.match(r'^file:/[^/]', f.url): - self.report.add_error("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url)) + self.report.add_error(f"File '{f.ID}' has an invalid (Java-specific) file URL '{f.url}'") scheme = f.url[0:f.url.index(':')] if scheme not in ('http', 'https', 'file'): - self.report.add_warning("File '%s' has non-HTTP, non-file URL '%s'" % (f.ID, f.url)) + self.report.add_warning(f"File '{f.ID}' has non-HTTP, non-file URL '{f.url}'") def _validate_page(self): """ @@ -323,15 +323,15 @@ def _validate_page(self): if 'dimension' in self.page_checks: _, _, exif = self.workspace.image_from_page(page, f.pageId) if page.imageHeight != exif.height: - self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height)) + self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})") if page.imageWidth != exif.width: - self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width)) + 
self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})") if 'imagefilename' in self.page_checks: imageFilename = page.imageFilename if not self.mets.find_files(url=imageFilename): - self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.url, imageFilename)) + self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS") if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.url, imageFilename)) + self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file") if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID: self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', f.ID or '')) diff --git a/src/ocrd_validators/xsd_validator.py b/src/ocrd_validators/xsd_validator.py index 81b945756..92e450212 100644 --- a/src/ocrd_validators/xsd_validator.py +++ b/src/ocrd_validators/xsd_validator.py @@ -45,7 +45,7 @@ def __init__(self, schema_url): schema_url (str): URI of XML schema to validate against. """ if schema_url not in XSD_PATHS: - raise Exception('XML schema not bundled with OCR-D: %s' % schema_url) + raise ValueError('XML schema not bundled with OCR-D: %s' % schema_url) with open(XSD_PATHS[schema_url], 'r') as f: xmlschema_doc = ET.parse(f) self._xmlschema = ET.XMLSchema(xmlschema_doc) diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py index bc516d5a5..2e63bb549 100644 --- a/tests/validator/test_workspace_validator.py +++ b/tests/validator/test_workspace_validator.py @@ -90,7 +90,7 @@ def test_validate_file_groups_non_ocrd(self): self.assertEqual(len(report.errors), 1) self.assertIn('No files', report.errors[0]) self.assertEqual(len(report.notices), 1) - self.assertIn("USE does not begin with 'OCR-D-'", report.notices[0]) + self.assertIn("fileGrp USE 'FOO' does not begin with 'OCR-D-'", report.notices[0]) def test_validate_file_groups_unspecified(self): with TemporaryDirectory() as tempdir: From a95f269d83695e38f502084523868a3d365ec810 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:15:13 +0200 Subject: [PATCH 136/249] update pylintrc --- .pylintrc | 18 ++++++++---------- src/ocrd/resource_manager.py | 4 ++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.pylintrc b/.pylintrc index b2125d824..a4106a1bb 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,19 +1,21 @@ [MASTER] -extension-pkg-whitelist=lxml -ignored-modules=cv2,tesserocr,ocrd.model +extension-pkg-whitelist=lxml,pydantic +ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-patterns=.*generateds.* [MESSAGES CONTROL] -ignore-patterns='.*generateds.*' disable = fixme, - E501, + line-too-long, + consider-using-f-string, + logging-fstring-interpolation, trailing-whitespace, logging-not-lazy, inconsistent-return-statements, + disallowed-name, invalid-name, line-too-long, missing-docstring, - no-self-use, wrong-import-order, too-many-nested-blocks, superfluous-parens, @@ -25,13 +27,9 @@ disable = ungrouped-imports, useless-object-inheritance, useless-import-alias, - bad-continuation, no-else-return, logging-not-lazy -[FORMAT] -no-space-check=empty-line - [DESIGN] # Maximum number of arguments for function / method max-args=12 @@ -40,7 +38,7 @@ max-locals=30 # Maximum number of return / 
yield for function / method body max-returns=12 # Maximum number of branch for function / method body -max-branchs=30 +max-branches=30 # Maximum number of statements in function / method body max-statements=60 # Maximum number of parents for a class (see R0901). diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index da1ee4833..3c4c60306 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -13,12 +13,16 @@ from gdown.download import get_url_from_gdrive_confirmation from yaml import safe_load, safe_dump +# pylint: disable=wrong-import-position + # https://github.com/OCR-D/core/issues/867 # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml import yaml.constructor yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str'] +# pylint: enable=wrong-import-position + from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json From 50c088ecdf0d889e7095f71f99ec93bc08be5dcc Mon Sep 17 00:00:00 2001 From: kba Date: Sat, 24 Aug 2024 16:40:58 +0200 Subject: [PATCH 137/249] processor.metadata_location: use self.__module__ not __package__ --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index dacf9b072..c0b66b269 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -126,7 +126,7 @@ def metadata_filename(self) -> str: return 'ocrd-tool.json' @cached_property - def metadata_location(self) -> str: + def metadata_location(self) -> Path: """ Absolute path of the ``ocrd-tool.json`` file as distributed with the package. @@ -134,7 +134,7 @@ def metadata_location(self) -> str: (Override if ``ocrd-tool.json`` is not distributed with the Python package.) """ - return resource_filename(__package__.split('.')[0], self.metadata_filename) + return resource_filename(self.__module__.split('.')[0], self.metadata_filename) @cached_property def metadata_rawdict(self) -> dict: From 821123765f7b9854c569f90e88e18ade025b68e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 02:18:53 +0200 Subject: [PATCH 138/249] pylint: try ignoring generateds (again) --- .pylintrc | 1 + src/ocrd/cli/ocrd_tool.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.pylintrc b/.pylintrc index a4106a1bb..2e3af4288 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,7 @@ [MASTER] extension-pkg-whitelist=lxml,pydantic ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-paths=ocrd_page_generateds.py ignore-patterns=.*generateds.* [MESSAGES CONTROL] diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index f63a7235a..fa815daeb 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -28,6 +28,8 @@ def __init__(self, filename): self.filename = filename with codecs.open(filename, encoding='utf-8') as f: self.content = f.read() + # perhaps the validator should _always_ run (for default expansion) + # so validate command only for the report? 
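# [Editor's aside - not part of the patch] The comment above hints at always
# running the validator so that parameter defaults get expanded, even when no
# explicit validation was requested. Assuming the default-setting behaviour of
# DefaultValidatingDraft20199Validator shown in the json_validator.py hunk
# earlier, a minimal sketch could look like:
#
#     from ocrd_validators import OcrdToolValidator
#     report = OcrdToolValidator.validate(self.json)  # also fills in defaults
#     # surface report.errors only when the user actually asked for validation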
self.json = loads(self.content) self.tool_name = '' From 3e2700cddcfa51af9cc73427fd4ebcfc53458282 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 11:32:14 +0200 Subject: [PATCH 139/249] :memo: update changelog --- CHANGELOG.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43bf85764..244b168e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,13 +5,37 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + - actuall apply CLI `--log-filename` + - adapt to Pillow changes + - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) + Changed: - - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published as separate packages anymore, everything is contained in `ocrd` and you should adapt your `requirements.txt` accordingly. + - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published + as separate packages anymore, everything is contained in `ocrd` - you should adapt your `requirements.txt` accordingly + - :fire: `Processor.parameter` now a property (attribute always exists, but `None` for non-processing contexts) + - :fire: `Processor.parameter` is now a `frozendict` (contents immutable) + - :fire: `Processor.parameter` validate when(ever) set instead of (just) the constructor + - setting `Processor.parameter` will also trigger (`Processor.shutdown() and) `Processor.setup()` + - `get_processor(... instance_caching=True)`: use `min(max_instances, OCRD_MAX_PROCESSOR_CACHE)` + - :fire: `Processor.verify` always validates fileGrp cardinalities (because we have `ocrd-tool.json` defaults now) + - :fire: `OcrdMets.add_agent` without positional arguments + - `ocrd bashlib input-files` now uses normal Processor decorator, and gets passed actual `ocrd-tool.json` and tool name + from bashlib's `ocrd__wrap` + +Added: + - `Processor.metadata_filename`: expose to make local path of `ocrd-tool.json` in Python distribution reusable+overridable + - `Processor.metadata_location`: expose to make absolute path of `ocrd-tool.json` reusable+overridable + - `Processor.metadata_rawdict`: expose to make in-memory contents of `ocrd-tool.json` reusable+overridable + - `Processor.metadata`: expose to make validated and default-expanded contents of `ocrd-tool.json` reusable+overridable + - `Processor.shutdown`: to shut down processor after processing, optional + - `Processor.max_instances`: class attribute to control instance caching of this implementation ## [3.0.0a2] - 2024-08-22 Changed: - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now + - :fire: `page_from_file`: removed kwarg `with_tree` - use `OcrdPage.etree` and `OcrdPage.mapping` instead - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile` (the latter only if `OCRD_MISSING_INPUT=ABORT`) - :fire: `Processor.zip_input_files` does not by default use `require_first` anymore From 342df5825522c8fa28c570a0cc7fd8f348730c0c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 20:05:52 +0200 Subject: [PATCH 140/249] test_bashlib: allow testing prereleases successfully --- tests/cli/test_bashlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index b1ab68c7f..ba7c283e4 100644 --- a/tests/cli/test_bashlib.py +++ 
b/tests/cli/test_bashlib.py @@ -121,7 +121,7 @@ def test_bashlib_minversion(self): assert f"ERROR: ocrd/core is too old ({VERSION} < {version})" in err # test non-matching prerelease (the 99th alpha pre-release here) - version = "%d.%d.%da99" % (major, minor, patch) + version = "%d.%d.%dz99" % (major, minor, patch) assert VERSION != version # assuming we will never have 99 alpha prereleases ^^ exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion " + version) assert exit_code > 0 From 11ed8c568274779f4c6cc25cad36fd9424c40b1c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 20:07:06 +0200 Subject: [PATCH 141/249] Processor.process_page_file / OcrdPageResultImage: allow PageType instead of AlternativeImageType --- src/ocrd/processor/base.py | 20 ++++++++++++++++++-- src/ocrd/processor/ocrd_page_result.py | 6 +++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index c0b66b269..df6574818 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -45,7 +45,15 @@ deprecation_warning ) from ocrd_validators import ParameterValidator -from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml +from ocrd_models.ocrd_page import ( + PageType, + AlternativeImageType, + MetadataItemType, + LabelType, + LabelsType, + OcrdPage, + to_xml, +) from ocrd_modelfactory import page_from_file from ocrd_validators.ocrd_tool_validator import OcrdToolValidator @@ -523,7 +531,15 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: for image_result in result.images: image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' image_file_path = join(self.output_file_grp, f'{image_file_id}.png') - image_result.alternative_image.set_filename(image_file_path) + if isinstance(image_result.alternative_image, PageType): + image_result.alternative_image.set_imageFilename(image_file_path) + image_result.alternative_image.set_imageWidth(image_result.pil.width) + image_result.alternative_image.set_imageHeight(image_result.pil.height) + elif isinstance(image_result.alternative_image, AlternativeImageType): + image_result.alternative_image.set_filename(image_file_path) + else: + raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type " + f"{type(image_result.alternative_image)}") self.workspace.save_image_file( image_result.pil, image_file_id, diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py index c63330c73..dcd8ccd44 100644 --- a/src/ocrd/processor/ocrd_page_result.py +++ b/src/ocrd/processor/ocrd_page_result.py @@ -1,15 +1,15 @@ from dataclasses import dataclass, field -from typing import List +from typing import List, Union from ocrd_models.ocrd_page import OcrdPage from PIL.Image import Image -from ocrd_models.ocrd_page_generateds import AlternativeImageType +from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType @dataclass class OcrdPageResultImage(): pil : Image file_id_suffix : str - alternative_image : AlternativeImageType + alternative_image : Union[AlternativeImageType, PageType] @dataclass class OcrdPageResult(): From 77e31f26a5a3beebfb10895d0383790dc69ca23a Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 26 Aug 2024 11:29:04 +0200 Subject: [PATCH 142/249] :package: v3.0.0b1 --- CHANGELOG.md | 2 ++ VERSION | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
e3214ab8b..063e7feb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b1] - 2024-08-26 + Fixed: - actuall apply CLI `--log-filename` - adapt to Pillow changes diff --git a/VERSION b/VERSION index 3a5b5bc9d..2daa89b06 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0a2 +3.0.0b1 From d3ee57c271b9144f26b3e1f357dbddc50abe5f24 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 26 Aug 2024 17:58:40 +0200 Subject: [PATCH 143/249] :fire: bad no good terrible hack to fix integration_test --- src/ocrd/processor/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index df6574818..1f05e6a67 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -142,7 +142,11 @@ def metadata_location(self) -> Path: (Override if ``ocrd-tool.json`` is not distributed with the Python package.) """ - return resource_filename(self.__module__.split('.')[0], self.metadata_filename) + # XXX HACK + module_tokens = self.__module__.split('.') + if module_tokens[0] == 'src': + module_tokens.pop(0) + return resource_filename(module_tokens[0], self.metadata_filename) @cached_property def metadata_rawdict(self) -> dict: From 0245f4ba9fe525c37744e717d312628b66955c5a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 03:37:08 +0200 Subject: [PATCH 144/249] generate_processor_help: avoid repeating docstrings from superclass --- CHANGELOG.md | 2 +- Dockerfile | 2 +- src/ocrd/processor/__init__.py | 2 +- src/ocrd/processor/base.py | 151 ++++++++++++++++++++++++++++++++- src/ocrd/processor/helpers.py | 145 +------------------------------ 5 files changed, 154 insertions(+), 148 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 063e7feb6..c4e38bc42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## [3.0.0b1] - 2024-08-26 Fixed: - - actuall apply CLI `--log-filename` + - actually apply CLI `--log-filename` - adapt to Pillow changes - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) diff --git a/Dockerfile b/Dockerfile index 144ae774d..77c24bf77 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,9 +50,9 @@ FROM ocrd_core_base as ocrd_core_test ARG SKIP_ASSETS WORKDIR /build/core COPY Makefile . +COPY .gitmodules . RUN if test -z "$SKIP_ASSETS" || test $SKIP_ASSETS -eq 0 ; then make assets ; fi COPY tests ./tests -COPY .gitmodules . COPY requirements_test.txt . 
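# [Editor's aside - not part of the patch] The metadata_location hack in the
# commit above targets processors laid out under a src/ directory, where
# self.__module__ can resolve to e.g. "src.ocrd_dummy.cli" (module name here
# purely illustrative), so resource_filename() would be queried for a
# distribution literally named "src". Stripping that token restores the
# actual package name:
#
#     module_tokens = "src.ocrd_dummy.cli".split('.')
#     if module_tokens[0] == 'src':
#         module_tokens.pop(0)
#     assert module_tokens[0] == 'ocrd_dummy'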
RUN pip install -r requirements_test.txt RUN mkdir /ocrd-data && chmod 777 /ocrd-data diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index b6c1188de..7cbcb851d 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -3,6 +3,7 @@ ResourceNotFoundError, NonUniqueInputFile, MissingInputFile, + generate_processor_help, ) from .ocrd_page_result import ( OcrdPageResult, @@ -11,5 +12,4 @@ from .helpers import ( run_cli, run_processor, - generate_processor_help ) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index df6574818..61a25e527 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,6 +23,8 @@ import io import weakref from frozendict import frozendict + +from click import wrap_text from deprecated import deprecated from requests import HTTPError @@ -58,7 +60,7 @@ from ocrd_validators.ocrd_tool_validator import OcrdToolValidator # XXX imports must remain for backwards-compatibility -from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import +from .helpers import run_cli, run_processor # pylint: disable=unused-import class ResourceNotFoundError(FileNotFoundError): @@ -838,3 +840,150 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts + +def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): + """Generate a string describing the full CLI of this processor including params. + + Args: + ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json`` + processor_instance (object, optional): the processor implementation + (for adding any module/class/function docstrings) + subcommand (string): 'worker' or 'server' + """ + doc_help = '' + if processor_instance: + module = inspect.getmodule(processor_instance) + if module and module.__doc__: + doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n' + if processor_instance.__doc__: + doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n' + # Try to find the most concrete docstring among the various methods that an implementation + # could overload, first serving. + # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings. + # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.) + for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']: + instance_method = getattr(processor_instance, method) + superclass_method = getattr(Processor, method) + if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__: + doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n' + break + if doc_help: + doc_help = '\n\n' + wrap_text(doc_help, width=72, + initial_indent=' > ', + subsequent_indent=' > ', + preserve_paragraphs=True) + subcommands = '''\ + worker Start a processing worker rather than do local processing + server Start a processor server rather than do local processing +''' + + processing_worker_options = '''\ + --queue The RabbitMQ server address in format + "amqp://{user}:{pass}@{host}:{port}/{vhost}" + [amqp://admin:admin@localhost:5672] + --database The MongoDB server address in format + "mongodb://{host}:{port}" + [mongodb://localhost:27018] + --log-filename Filename to redirect STDOUT/STDERR to, + if specified. 
+'''
+
+    processing_server_options = '''\
+  --address                       The Processor server address in format
+                                  "{host}:{port}"
+  --database                      The MongoDB server address in format
+                                  "mongodb://{host}:{port}"
+                                  [mongodb://localhost:27018]
+'''
+
+    processing_options = '''\
+  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
+  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
+  -I, --input-file-grp USE        File group(s) used as input
+  -O, --output-file-grp USE       File group(s) used as output
+  -g, --page-id ID                Physical page ID(s) to process instead of full document []
+  --overwrite                     Remove existing output pages/images
+                                  (with "--page-id", remove only those).
+                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
+  --debug                         Abort on any errors with full stack trace.
+                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
+  --profile                       Enable profiling
+  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
+  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
+                                  or JSON file path
+  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
+                                  taking precedence over --parameter
+  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
+                                  If URL starts with http:// start an HTTP server there,
+                                  otherwise URL is a path to an on-demand-created unix socket
+  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
+                                  Override log level globally [INFO]
+'''
+
+    information_options = '''\
+  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
+  -L, --list-resources            List names of processor resources
+  -J, --dump-json                 Dump tool description as JSON
+  -D, --dump-module-dir           Show the 'module' resource location path for this processor
+  -h, --help                      Show this message
+  -V, --version                   Show version
+'''
+
+    parameter_help = ''
+    if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
+        parameter_help = '  NONE\n'
+    else:
+        def wrap(s):
+            return wrap_text(s, initial_indent=' '*3,
+                             subsequent_indent=' '*4,
+                             width=72, preserve_paragraphs=True)
+        for param_name, param in ocrd_tool['parameters'].items():
+            parameter_help += wrap('"%s" [%s%s]' % (
+                param_name,
+                param['type'],
+                ' - REQUIRED' if 'required' in param and param['required'] else
+                ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
+            parameter_help += '\n ' + wrap(param['description'])
+            if 'enum' in param:
+                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
+            parameter_help += "\n"
+
+    if not subcommand:
+        return f'''\
+Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]
+
+  {ocrd_tool['description']}{doc_help}
+
+Subcommands:
+{subcommands}
+Options for processing:
+{processing_options}
+Options for information:
+{information_options}
+Parameters:
+{parameter_help}
+'''
+    elif subcommand == 'worker':
+        return f'''\
+Usage: {ocrd_tool['executable']} worker [OPTIONS]
+
+  Run {ocrd_tool['executable']} as a processing worker.
+
+  {ocrd_tool['description']}{doc_help}
+
+Options:
+{processing_worker_options}
+'''
+    elif subcommand == 'server':
+        return f'''\
+Usage: {ocrd_tool['executable']} server [OPTIONS]
+
+  Run {ocrd_tool['executable']} as a processor server.
+ + {ocrd_tool['description']}{doc_help} + +Options: +{processing_server_options} +''' + else: + pass diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 6483790bd..a675ff129 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -8,13 +8,11 @@ from subprocess import run from typing import List, Optional -from click import wrap_text -from ocrd.workspace import Workspace +from ..workspace import Workspace from ocrd_utils import freeze_args, getLogger, config, setOverrideLogLevel, getLevelName, sparkline __all__ = [ - 'generate_processor_help', 'run_cli', 'run_processor' ] @@ -213,147 +211,6 @@ def run_cli( return result.returncode -def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): - """Generate a string describing the full CLI of this processor including params. - - Args: - ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json`` - processor_instance (object, optional): the processor implementation - (for adding any module/class/function docstrings) - subcommand (string): 'worker' or 'server' - """ - doc_help = '' - if processor_instance: - module = inspect.getmodule(processor_instance) - if module and module.__doc__: - doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n' - if processor_instance.__doc__: - doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n' - if processor_instance.process_workspace.__doc__: - doc_help += '\n' + inspect.cleandoc(processor_instance.process_workspace.__doc__) + '\n' - if processor_instance.process.__doc__: - doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n' - if doc_help: - doc_help = '\n\n' + wrap_text(doc_help, width=72, - initial_indent=' > ', - subsequent_indent=' > ', - preserve_paragraphs=True) - subcommands = '''\ - worker Start a processing worker rather than do local processing - server Start a processor server rather than do local processing -''' - - processing_worker_options = '''\ - --queue The RabbitMQ server address in format - "amqp://{user}:{pass}@{host}:{port}/{vhost}" - [amqp://admin:admin@localhost:5672] - --database The MongoDB server address in format - "mongodb://{host}:{port}" - [mongodb://localhost:27018] - --log-filename Filename to redirect STDOUT/STDERR to, - if specified. -''' - - processing_server_options = '''\ - --address The Processor server address in format - "{host}:{port}" - --database The MongoDB server address in format - "mongodb://{host}:{port}" - [mongodb://localhost:27018] -''' - - processing_options = '''\ - -m, --mets URL-PATH URL or file path of METS to process [./mets.xml] - -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)] - -I, --input-file-grp USE File group(s) used as input - -O, --output-file-grp USE File group(s) used as output - -g, --page-id ID Physical page ID(s) to process instead of full document [] - --overwrite Remove existing output pages/images - (with "--page-id", remove only those). - Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE - --debug Abort on any errors with full stack trace. - Short-hand for OCRD_MISSING_OUTPUT=ABORT - --profile Enable profiling - --profile-file PROF-PATH Write cProfile stats to PROF-PATH. 
Implies "--profile" - -p, --parameter JSON-PATH Parameters, either verbatim JSON string - or JSON file path - -P, --param-override KEY VAL Override a single JSON object key-value pair, - taking precedence over --parameter - -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS - If URL starts with http:// start an HTTP server there, - otherwise URL is a path to an on-demand-created unix socket - -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] - Override log level globally [INFO] -''' - - information_options = '''\ - -C, --show-resource RESNAME Dump the content of processor resource RESNAME - -L, --list-resources List names of processor resources - -J, --dump-json Dump tool description as JSON - -D, --dump-module-dir Show the 'module' resource location path for this processor - -h, --help Show this message - -V, --version Show version -''' - - parameter_help = '' - if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']: - parameter_help = ' NONE\n' - else: - def wrap(s): - return wrap_text(s, initial_indent=' '*3, - subsequent_indent=' '*4, - width=72, preserve_paragraphs=True) - for param_name, param in ocrd_tool['parameters'].items(): - parameter_help += wrap('"%s" [%s%s]' % ( - param_name, - param['type'], - ' - REQUIRED' if 'required' in param and param['required'] else - ' - %s' % json.dumps(param['default']) if 'default' in param else '')) - parameter_help += '\n ' + wrap(param['description']) - if 'enum' in param: - parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])) - parameter_help += "\n" - - if not subcommand: - return f'''\ -Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS] - - {ocrd_tool['description']}{doc_help} - -Subcommands: -{subcommands} -Options for processing: -{processing_options} -Options for information: -{information_options} -Parameters: -{parameter_help} -''' - elif subcommand == 'worker': - return f'''\ -Usage: {ocrd_tool['executable']} worker [OPTIONS] - - Run {ocrd_tool['executable']} as a processing worker. - - {ocrd_tool['description']}{doc_help} - -Options: -{processing_worker_options} -''' - elif subcommand == 'server': - return f'''\ -Usage: {ocrd_tool['executable']} server [OPTIONS] - - Run {ocrd_tool['executable']} as a processor sever. 
-
-  {ocrd_tool['description']}{doc_help}
-
-Options:
-{processing_server_options}
-'''
-    else:
-        pass
-
 # not decorated here but at runtime (on first use)

 #@freeze_args

From efe420138141a8c15b3967987c103753274edeb6 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Tue, 27 Aug 2024 04:07:38 +0200
Subject: [PATCH 145/249] Processor.process_workspace: abort anyway if too many
 failures (OCRD_MAX_MISSING_OUTPUTS)

---
 src/ocrd/processor/base.py | 39 ++++++++++++++++++++++++++------------
 src/ocrd_utils/config.py   |  5 +++++
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
index 61a25e527..cbd819a44 100644
--- a/src/ocrd/processor/base.py
+++ b/src/ocrd/processor/base.py
@@ -424,6 +424,9 @@ def process_workspace(self, workspace: Workspace) -> None:
         self.workspace = workspace
         self.verify()
         try:
+            nr_succeeded = 0
+            nr_skipped = 0
+            nr_copied = 0
             # FIXME: add page parallelization by running multiprocessing.Pool (#322)
             for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                 input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
@@ -449,29 +452,38 @@ def process_workspace(self, workspace: Workspace) -> None:
                 # - persistent (data) error → skip / dummy / raise
                 try:
                     self.process_page_file(*input_files)
-                except Exception as err:
-                    # we have to be broad here, but want to exclude NotImplementedError
-                    if isinstance(err, NotImplementedError):
+                    nr_succeeded += 1
+                # exclude NotImplementedError, so we can try process() below
+                except NotImplementedError:
+                    raise
+                # handle input failures separately
+                except FileExistsError as err:
+                    if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                         raise err
-                    if isinstance(err, FileExistsError):
-                        if config.OCRD_EXISTING_OUTPUT == 'ABORT':
-                            raise err
-                        if config.OCRD_EXISTING_OUTPUT == 'SKIP':
-                            continue
-                        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
-                            # too late here, must not happen
-                            raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
-                    # FIXME: re-usable/actionable logging
+                    if config.OCRD_EXISTING_OUTPUT == 'SKIP':
+                        continue
+                    if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
+                        # too late here, must not happen
+                        raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
+                # broad coverage of output failures
+                except Exception as err:
+                    # FIXME: add re-usable/actionable logging
                     self._base_logger.exception(f"Failure on page {page_id}: {err}")
                     if config.OCRD_MISSING_OUTPUT == 'ABORT':
                         raise err
                     if config.OCRD_MISSING_OUTPUT == 'SKIP':
+                        nr_skipped += 1
                         continue
                     if config.OCRD_MISSING_OUTPUT == 'COPY':
                         self._copy_page_file(input_files[0])
+                        nr_copied += 1
                     else:
                         desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                         raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
+            if nr_skipped > 0 and nr_skipped / (nr_succeeded + nr_skipped) > config.OCRD_MAX_MISSING_OUTPUTS > 0:
+                raise Exception(f"too many failures with skipped output ({nr_skipped})")
+            if nr_copied > 0 and nr_copied / (nr_succeeded + nr_copied) > config.OCRD_MAX_MISSING_OUTPUTS > 0:
+                raise Exception(f"too many failures with fallback output ({nr_copied})")
         except NotImplementedError:
             # fall back to deprecated method
             self.process()
@@ -534,6 +546,9 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
             image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
             image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
             if isinstance(image_result.alternative_image, PageType):
+                # special case: not an
alternative image, but replacing the original image + # (this is needed by certain processors when the original's coordinate system + # cannot or must not be kept) image_result.alternative_image.set_imageFilename(image_file_path) image_result.alternative_image.set_imageWidth(image_result.pil.width) image_result.alternative_image.set_imageHeight(image_result.pil.height) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 3425bc920..9f9d924f6 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -184,6 +184,11 @@ def _ocrd_download_timeout_parser(val): validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], parser=str) +config.add("OCRD_MAX_MISSING_OUTPUTS", + description="Maximal rate of skipped/fallback pages among all processed pages before aborting.", + default=(True, 0.1), + parser=float) + config.add("OCRD_EXISTING_OUTPUT", description="""\ How to deal with already existing output files (for some fileGrp/pageId) during processing: From fce7627a3963924ef9f3500360f0423fa0b2c6ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 11:09:08 +0200 Subject: [PATCH 146/249] adapt tests for OCRD_MAX_MISSING_OUTPUTS --- src/ocrd_utils/config.py | 2 +- tests/processor/test_processor.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 9f9d924f6..29632f8cc 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -185,7 +185,7 @@ def _ocrd_download_timeout_parser(val): parser=str) config.add("OCRD_MAX_MISSING_OUTPUTS", - description="Maximal rate of skipped/fallback pages among all processed pages before aborting.", + description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).", default=(True, 0.1), parser=float) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 19ff1087f..4e6114763 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -219,6 +219,8 @@ def test_run_output0(self): def test_run_output_missing(self): ws = self.workspace from ocrd_utils import config + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MISSING_OUTPUT = 'SKIP' run_processor(DummyProcessorWithOutputFailures, workspace=ws, input_file_grp="OCR-D-IMG", @@ -237,6 +239,14 @@ def test_run_output_missing(self): input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT") assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + # do raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = 0.4 + config.OCRD_MISSING_OUTPUT = 'SKIP' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert "too many failures" in str(exc.value) def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: From c08166e177e3080db4b2a9b5be4ea6b218939f05 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 22:47:40 +0200 Subject: [PATCH 147/249] =?UTF-8?q?Processor:=20add=20per-page=20timeouts?= =?UTF-8?q?=20and=20parallelism=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ocrd_utils: introduce `config.OCRD_MAX_PARALLEL_PAGES` - Processor: introduce `max_workers` class attribute (as per-implementation limit of `OCRD_MAX_PARALLEL_PAGES`) - ocrd_utils: 
introduce `config.OCRD_PROCESSING_PAGE_TIMEOUT`
- Processor: introduce `max_page_seconds` class attribute (as
  per-implementation limit of `OCRD_PROCESSING_PAGE_TIMEOUT`)
- Processor.process_workspace: instead of calling `process_page_file`
  directly for each input file in a loop, submit these invocations as
  tasks to an internal `ThreadPoolExecutor`, which will run with
  `OCRD_MAX_PARALLEL_PAGES` workers, and loop over retrieving results
  from it (with normal error handling)
- for each (per-page) task, add a `timeout` limit
  `OCRD_PROCESSING_PAGE_TIMEOUT` (with TimeoutError an additional error
  case to be handled in accordance with `OCRD_MISSING_OUTPUT` setting)
---
 src/ocrd/processor/base.py | 98 +++++++++++++++++++++++++++++---------
 src/ocrd_utils/config.py   | 10 ++++
 2 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
index 4e6002470..269f27b70 100644
--- a/src/ocrd/processor/base.py
+++ b/src/ocrd/processor/base.py
@@ -23,14 +23,16 @@
 import io
 import weakref
 from frozendict import frozendict
+from concurrent.futures import ThreadPoolExecutor, TimeoutError

 from click import wrap_text
 from deprecated import deprecated
 from requests import HTTPError

-from ocrd.workspace import Workspace
+from ..workspace import Workspace
+from ..mets_server import ClientSideOcrdMets
 from ocrd_models.ocrd_file import OcrdFileType
-from ocrd.processor.ocrd_page_result import OcrdPageResult
+from .ocrd_page_result import OcrdPageResult
 from ocrd_utils import (
     VERSION as OCRD_VERSION,
     MIMETYPE_PAGE,
@@ -120,7 +122,27 @@ class Processor():
    maximum number of cached instances (ignored if negative), to be applied on top of
    :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).

-    (Override this if you know how many instances fit into memory at once.)
+    (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
+    """
+
+    max_workers : int = -1
+    """
+    maximum number of processor threads for page-parallel processing (ignored if negative),
+    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
+    whatever is smaller).
+
+    (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
+    - at once, or if your class is not thread-safe.)
+    """
+
+    max_page_seconds : int = -1
+    """
+    maximum number of seconds that may be spent processing a single page (ignored if negative),
+    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
+    (i.e. whatever is smaller).
+
+    (Override this if you know how costly this processor may be, irrespective of image size
+    or complexity of the page.)
""" @property @@ -431,7 +453,26 @@ def process_workspace(self, workspace: Workspace) -> None: nr_succeeded = 0 nr_skipped = 0 nr_copied = 0 - # FIXME: add page parallelization by running multiprocessing.Pool (#322) + + # set up multithreading + if self.max_workers < 0: + max_workers = config.OCRD_MAX_PARALLEL_PAGES + else: + max_workers = min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers) + if max_workers > 1: + assert isinstance(workspace.mets, ClientSideOcrdMets), \ + "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url" + if self.max_page_seconds < 0: + max_seconds = config.OCRD_PROCESSING_PAGE_TIMEOUT + else: + max_seconds = min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds) + executor = ThreadPoolExecutor( + max_workers=max_workers, + thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" + ) + self._base_logger.debug("started executor %s", str(executor)) + tasks = {} + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) page_id = next(input_file.pageId @@ -450,12 +491,20 @@ def process_workspace(self, workspace: Workspace) -> None: except (ValueError, FileNotFoundError, HTTPError) as e: self._base_logger.error(repr(e)) self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) + self._base_logger.debug("submitted %d processing tasks", len(tasks)) + + for task in tasks: + # wait for results, handle errors + page_id, input_files = tasks[task] # FIXME: differentiate error cases in various ways: # - ResourceNotFoundError → use ResourceManager to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise try: - self.process_page_file(*input_files) + self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) + task.result(timeout=max_seconds) nr_succeeded += 1 # exclude NotImplementedError, so we can try process() below except NotImplementedError: @@ -469,10 +518,10 @@ def process_workspace(self, workspace: Workspace) -> None: if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': # too late here, must not happen raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") - # broad coverage of output failures - except Exception as err: + # broad coverage of output failures (including TimeoutError) + except (Exception, TimeoutError) as err: # FIXME: add re-usable/actionable logging - self._base_logger.exception(f"Failure on page {page_id}: {err}") + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'ABORT': raise err if config.OCRD_MISSING_OUTPUT == 'SKIP': @@ -484,10 +533,13 @@ def process_workspace(self, workspace: Workspace) -> None: else: desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with skipped output ({nr_skipped})") if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with fallback output ({nr_skipped})") + executor.shutdown() + except NotImplementedError: # fall back to deprecated method self.process() @@ 
-511,13 +563,14 @@ def _copy_page_file(self, input_file : OcrdFileType) -> None: output_file_id = make_file_id(input_file, self.output_file_grp) input_pcgts.set_pcGtsId(output_file_id) self.add_metadata(input_pcgts) - self.workspace.add_file(file_id=output_file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(input_pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(input_pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: @@ -571,13 +624,14 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: ) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) - self.workspace.add_file(file_id=output_file_id, - file_grp=self.output_file_grp, - page_id=page_id, - local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(result.pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=page_id, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(result.pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 29632f8cc..0186b8539 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -120,6 +120,16 @@ def raw_value(self, name): parser=int, default=(True, 128)) +config.add('OCRD_MAX_PARALLEL_PAGES', + description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.", + parser=int, + default=(True, 1)) + +config.add('OCRD_PROCESSING_PAGE_TIMEOUT', + description="Timeout in seconds for processing a single page. 
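[Editor's note: stripped of OCR-D specifics, the control flow introduced by this patch is the standard `concurrent.futures` submit-then-collect pattern. A self-contained sketch (the page ids, `work()` and the 3-second limit are invented for illustration):

```python
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from time import sleep

def work(page_id):
    sleep(0.1)  # stand-in for actual per-page processing
    return page_id

executor = ThreadPoolExecutor(max_workers=4)
# first submit all page tasks ...
tasks = {executor.submit(work, page_id): page_id
         for page_id in ('phys_0001', 'phys_0002', 'phys_0003')}
# ... then collect the results, bounding the wait for each task
for task, page_id in tasks.items():
    try:
        print(f"page {page_id} done: {task.result(timeout=3)}")
    except TimeoutError:
        # treated like any other per-page failure (cf. OCRD_MISSING_OUTPUT)
        print(f"page {page_id} timed out")
executor.shutdown()
```

Note that `Future.result(timeout=...)` only bounds the wait - the worker thread itself keeps running - which is why the timeout feeds into the per-page error accounting rather than cancelling the task.]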
If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.", + parser=int, + default=(True, 0)) + config.add("OCRD_PROFILE", description="""\ Whether to enable gathering runtime statistics From c3a83800da2d56262234c10d54bb0946087a4994 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 22:59:58 +0200 Subject: [PATCH 148/249] add tests for processor per-page timeout and parallelism --- requirements_test.txt | 1 + tests/data/__init__.py | 26 ++++++++++++++++- tests/processor/test_processor.py | 46 +++++++++++++++++++++++++++++-- tests/test_workspace.py | 6 ++-- 4 files changed, 72 insertions(+), 7 deletions(-) diff --git a/requirements_test.txt b/requirements_test.txt index d8cef1dae..a6a87918f 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -3,6 +3,7 @@ cryptography < 43.0.0 pytest >= 4.0.0 generateDS == 2.35.20 pytest-benchmark >= 3.2.3 +pytest-timeout coverage >= 4.5.2 sphinx sphinx_click diff --git a/tests/data/__init__.py b/tests/data/__init__.py index c706546c5..c24a6979b 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,8 +1,9 @@ import json import os import re +from time import sleep from pytest import warns -from ocrd import Processor +from ocrd import Processor, OcrdPageResult from ocrd_utils import make_file_id, config DUMMY_TOOL = { @@ -103,6 +104,29 @@ def process(self): force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) +class DummyProcessorWithOutputPagewise(Processor): + @property + def ocrd_tool(self): + dummy_tool = dict(DUMMY_TOOL) + dummy_tool['parameters']['sleep'] = {'type': 'number'} + return dummy_tool + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, **kwargs) + + def process_page_pcgts(self, pcgts, page_id=None): + sleep(self.parameter['sleep']) + return OcrdPageResult(pcgts) + class DummyProcessorWithOutputFailures(Processor): @property def ocrd_tool(self): diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 4e6114763..e0b74fb00 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -9,9 +9,11 @@ DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, + DummyProcessorWithOutputPagewise, DummyProcessorWithOutputFailures, IncompleteProcessor ) +from tests.test_mets_server import fixture_start_mets_server from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver @@ -232,7 +234,7 @@ def test_run_output_missing(self): run_processor(DummyProcessorWithOutputFailures, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT") - assert "intermittent" in str(exc.value) + assert "intermittent" in str(exc.value) config.OCRD_MISSING_OUTPUT = 'COPY' config.OCRD_EXISTING_OUTPUT = 'SKIP' run_processor(DummyProcessorWithOutputFailures, workspace=ws, @@ -246,7 +248,28 @@ def test_run_output_missing(self): run_processor(DummyProcessorWithOutputFailures, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT") - assert "too many failures" in str(exc.value) + assert "too many failures" in str(exc.value) + + def test_run_output_timeout(self): + ws = self.workspace + from ocrd_utils import config + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_PROCESSING_PAGE_TIMEOUT = 3 + 
run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 1}) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 + from concurrent.futures import TimeoutError + with pytest.raises(TimeoutError) as exc: + run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 3}) def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: @@ -261,7 +284,7 @@ def test_run_output_overwrite(self): run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") - assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" + assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", @@ -387,5 +410,22 @@ def ocrd_tool(self): r = self.capture_out_err() assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err +# 2s (+ 2s tolerance) instead of 3*3s (+ 2s tolerance) +@pytest.mark.timeout(4) +def test_run_output_parallel(start_mets_server): + mets_server_url, ws = start_mets_server + from ocrd_utils import config + # do not raise for single-page timeout + config.OCRD_PROCESSING_PAGE_TIMEOUT = -1 + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + config.OCRD_MAX_PARALLEL_PAGES = 3 + run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 2}, + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + if __name__ == "__main__": main(__file__) diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 02cb72d34..9d6b64b1e 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -87,10 +87,10 @@ def test_workspace_add_file_overwrite(plain_workspace): plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath) with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath) - assert str(fn_exc.value) == "File with file_id='ID1' already exists" + assert str(fn_exc.value) == "File with file_id='ID1' already exists" with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True) - assert 'cannot mitigate' in str(fn_exc.value) + assert 'cannot mitigate' in str(fn_exc.value) plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT2', page_id='phys1', local_filename=fpath, force=True) f = plain_workspace.mets.find_all_files()[0] @@ -684,7 +684,7 @@ def test_merge_overwrite(tmp_path): ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1') ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2') ws1.merge(ws2) - assert "would overwrite" == str(exc.value) + assert "would overwrite" == str(exc.value) def 
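[Editor's note: the tests above exercise the `ABORT` case; combined with `OCRD_MISSING_OUTPUT=SKIP`, the same timeout silently drops slow pages instead. A sketch under the assumption of an already prepared workspace `ws`, reusing the dummy processor defined above:

```python
from ocrd import run_processor
from ocrd_utils import config
from tests.data import DummyProcessorWithOutputPagewise

config.OCRD_PROCESSING_PAGE_TIMEOUT = 1  # allow each page 1s only
config.OCRD_MISSING_OUTPUT = 'SKIP'      # drop timed-out pages, do not raise
config.OCRD_MAX_MISSING_OUTPUTS = -1     # never escalate to an overall failure

run_processor(DummyProcessorWithOutputPagewise, workspace=ws,  # ws: assumed workspace
              input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT",
              parameter={"sleep": 3})    # every page exceeds the limit
# OCR-D-OUT stays empty, but no exception is raised
```
]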
test_merge_with_filter(plain_workspace, tmp_path): # arrange From b1b7a491d41c00c2f803c9b4f5386ec33c9e7b8e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 23:06:28 +0200 Subject: [PATCH 149/249] :memo: update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4e38bc42..2a3807c0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Added: + - `Processor.max_workers`: class attribute to control per-page parallelism of this implementation + - `Processor.max_page_seconds`: class attribute to control per-page timeout of this implementation + - `OCRD_MAX_PARALLEL_PAGES` for whether and how many workers should process pages in parallel + - `OCRD_PROCESSING_PAGE_TIMEOUT` for whether and how long processors should wait for single pages + - `OCRD_MAX_MISSING_OUTPUTS` for maximum rate (fraction) of pages before making `OCRD_MISSING_OUTPUT=abort` + ## [3.0.0b1] - 2024-08-26 Fixed: From 9b80ae17ef04dba41bc1f09d5c7be88e7ec8f22c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:11:49 +0200 Subject: [PATCH 150/249] ClientSideOcrdMets: use same logger name prefix as server --- src/ocrd/mets_server.py | 51 +++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 81f9e15d0..41b1f23ec 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -120,7 +120,7 @@ class ClientSideOcrdMets: def __init__(self, url, workspace_path: Optional[str] = None): self.protocol = "tcp" if url.startswith("http://") else "uds" - self.log = getLogger(f"ocrd.mets_client[{url}]") + self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}") self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}' self.ws_dir_path = workspace_path if workspace_path else None @@ -147,10 +147,9 @@ def save(self): Request writing the changes to the file system """ if not self.multiplexing_mode: - self.session.request("PUT", url=self.url) + self.session.put(url=self.url) else: - self.session.request( - "POST", + self.session.post( self.url, json=MpxReq.save(self.ws_dir_path) ) @@ -161,11 +160,10 @@ def stop(self): """ try: if not self.multiplexing_mode: - self.session.request("DELETE", self.url) + self.session.delete(self.url) return else: - self.session.request( - "POST", + self.session.post( self.url, json=MpxReq.stop(self.ws_dir_path) ) @@ -178,10 +176,9 @@ def reload(self): Request reloading of the mets file from the file system """ if not self.multiplexing_mode: - return self.session.request("POST", f"{self.url}/reload").text + return self.session.post(f"{self.url}/reload").text else: - return self.session.request( - "POST", + return self.session.post( self.url, json=MpxReq.reload(self.ws_dir_path) ).json()["text"] @@ -189,10 +186,9 @@ def reload(self): @property def unique_identifier(self): if not self.multiplexing_mode: - return self.session.request("GET", f"{self.url}/unique_identifier").text + return self.session.get(f"{self.url}/unique_identifier").text else: - return self.session.request( - "POST", + return self.session.post( self.url, json=MpxReq.unique_identifier(self.ws_dir_path) ).json()["text"] @@ -200,11 +196,10 @@ def unique_identifier(self): @property def workspace_path(self): if not self.multiplexing_mode: - self.ws_dir_path = self.session.request("GET", f"{self.url}/workspace_path").text + 
self.ws_dir_path = self.session.get(f"{self.url}/workspace_path").text return self.ws_dir_path else: - self.ws_dir_path = self.session.request( - "POST", + self.ws_dir_path = self.session.post( self.url, json=MpxReq.workspace_path(self.ws_dir_path) ).json()["text"] @@ -213,10 +208,9 @@ def workspace_path(self): @property def file_groups(self): if not self.multiplexing_mode: - return self.session.request("GET", f"{self.url}/file_groups").json()["file_groups"] + return self.session.get(f"{self.url}/file_groups").json()["file_groups"] else: - return self.session.request( - "POST", + return self.session.post( self.url, json=MpxReq.file_groups(self.ws_dir_path) ).json()["file_groups"] @@ -224,10 +218,9 @@ def file_groups(self): @property def agents(self): if not self.multiplexing_mode: - agent_dicts = self.session.request("GET", f"{self.url}/agent").json()["agents"] + agent_dicts = self.session.get(f"{self.url}/agent").json()["agents"] else: - agent_dicts = self.session.request( - "POST", + agent_dicts = self.session.post( self.url, json=MpxReq.agents(self.ws_dir_path) ).json()["agents"] @@ -238,10 +231,9 @@ def agents(self): def add_agent(self, **kwargs): if not self.multiplexing_mode: - return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) + return self.session.post(f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: - self.session.request( - "POST", + self.session.post( self.url, json=MpxReq.add_agent(self.ws_dir_path, OcrdAgentModel.create(**kwargs).dict()) ).json() @@ -258,10 +250,9 @@ def find_files(self, **kwargs): kwargs["file_grp"] = kwargs.pop("fileGrp") if not self.multiplexing_mode: - r = self.session.request(method="GET", url=f"{self.url}/file", params={**kwargs}) + r = self.session.get(url=f"{self.url}/file", params={**kwargs}) else: - r = self.session.request( - "POST", + r = self.session.post( self.url, json=MpxReq.find_files(self.ws_dir_path, {**kwargs}) ) @@ -286,11 +277,11 @@ def add_file( ) if not self.multiplexing_mode: - r = self.session.request("POST", f"{self.url}/file", data=data.dict()) + r = self.session.post(f"{self.url}/file", data=data.dict()) if not r: raise RuntimeError("Add file failed. 
Please check provided parameters") else: - r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) + r = self.session.post(self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) if "error" in r: raise RuntimeError(f"Add file failed: Msg: {r['error']}") From be6b59d03903e3da153e4d64080f35a07fb5082a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:14:00 +0200 Subject: [PATCH 151/249] Processor: fix ignore (negative/zero) cases for max_workers / max_page_seconds --- src/ocrd/processor/base.py | 16 ++++++++-------- src/ocrd/workspace.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 269f27b70..5f8eabbb0 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -455,19 +455,19 @@ def process_workspace(self, workspace: Workspace) -> None: nr_copied = 0 # set up multithreading - if self.max_workers < 0: - max_workers = config.OCRD_MAX_PARALLEL_PAGES + if self.max_workers <= 0: + max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) else: - max_workers = min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers) + max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers)) if max_workers > 1: assert isinstance(workspace.mets, ClientSideOcrdMets), \ "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url" - if self.max_page_seconds < 0: - max_seconds = config.OCRD_PROCESSING_PAGE_TIMEOUT + if self.max_page_seconds <= 0: + max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT) else: - max_seconds = min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds) + max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds)) executor = ThreadPoolExecutor( - max_workers=max_workers, + max_workers=max_workers or 1, thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" ) self._base_logger.debug("started executor %s", str(executor)) @@ -504,7 +504,7 @@ def process_workspace(self, workspace: Workspace) -> None: # - persistent (data) error → skip / dummy / raise try: self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - task.result(timeout=max_seconds) + task.result(timeout=max_seconds or None) nr_succeeded += 1 # exclude NotImplementedError, so we can try process() below except NotImplementedError: diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 27c56f048..270414ec4 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -95,8 +95,8 @@ def __init__( if self.is_remote: mets = ClientSideOcrdMets(mets_server_url, self.directory) if mets.workspace_path != self.directory: - raise ValueError(f"METS server {mets_server_url} workspace directory {mets.workspace_path} differs " - f"from local workspace directory {self.directory}. These are not the same workspaces.") + raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs " + f"from local workspace directory '{self.directory}'. 
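[Editor's note: with the clamping semantics just fixed above, a downstream processor can declare its own limits via the new class attributes. A minimal sketch, assuming only the API from these patches - `MyOcrProcessor`, its limits and its executable name are invented:

```python
from ocrd import Processor, OcrdPageResult

class MyOcrProcessor(Processor):
    # hypothetical implementation-side limits (both new in this patch series):
    max_workers = 2         # e.g. only 2 pages fit into GPU memory at once
    max_page_seconds = 300  # e.g. no page should ever need more than 5 min

    @property
    def executable(self):
        return 'ocrd-my-ocr'

    def process_page_pcgts(self, *input_pcgts, page_id=None):
        # stand-in: pass the first input PAGE through unchanged
        return OcrdPageResult(input_pcgts[0])
```

At runtime, the effective limits are the minimum of these attributes and the `OCRD_MAX_PARALLEL_PAGES` / `OCRD_PROCESSING_PAGE_TIMEOUT` configuration values.]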
These are not the same workspaces.") else: mets = OcrdMets(filename=self.mets_target) self.mets = mets From 0b5286f75d5f70c151b0ff769f9da380b17a9592 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:15:03 +0200 Subject: [PATCH 152/249] test_mets_server: use tmpdir to avoid side effects between suites --- tests/test_mets_server.py | 48 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 1487617a7..8f94b9564 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,13 +22,16 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel -WORKSPACE_DIR = '/tmp/ocrd-mets-server' TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] +initLogging() +setOverrideLogLevel(10) + @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) -def fixture_start_mets_server(request) -> Iterable[Tuple[str, Workspace]]: +def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: + tmpdir = str(tmpdir) def _start_mets_server(*args, **kwargs): mets_server = OcrdMetsServer(*args, **kwargs) mets_server.startup() @@ -39,21 +42,22 @@ def _start_mets_server(*args, **kwargs): if exists(mets_server_url): remove(mets_server_url) - if exists(WORKSPACE_DIR): - rmtree(WORKSPACE_DIR, ignore_errors=True) + if exists(tmpdir): + rmtree(tmpdir, ignore_errors=True) - copytree(assets.path_to('SBB0000F29300010000/data'), WORKSPACE_DIR) - workspace = Workspace(Resolver(), WORKSPACE_DIR) + copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) + workspace = Workspace(Resolver(), tmpdir) p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) p.start() sleep(1) # sleep to start up server - yield mets_server_url, Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) + yield mets_server_url, workspace_server p.terminate() - rmtree(WORKSPACE_DIR, ignore_errors=True) + rmtree(tmpdir, ignore_errors=True) def add_file_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( 'FOO', local_filename=f'local_filename{i}', @@ -64,8 +68,8 @@ def add_file_server(x): ) def add_agent_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.mets.add_agent( name=f'proc{i}', _type='baz', @@ -82,7 +86,10 @@ def test_mets_server_add_file(start_mets_server): # add NO_FILES files in parallel with Pool() as pool: - pool.map(add_file_server, zip(repeat(mets_server_url), range(NO_FILES))) + pool.map(add_file_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + range(NO_FILES))) assert set(workspace_server.mets.file_groups) == set( [ 'OCR-D-IMG', @@ -107,7 +114,7 @@ def test_mets_server_add_file(start_mets_server): assert 
len(workspace_server.mets.find_all_files(fileGrp='FOO')) == NO_FILES
 
     # not yet synced
-    workspace_file = Workspace(Resolver(), WORKSPACE_DIR)
+    workspace_file = Workspace(Resolver(), workspace_server.directory)
     assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == 0
 
     # sync
@@ -125,13 +132,16 @@ def test_mets_server_add_agents(start_mets_server):
 
     # add NO_AGENTS agents in parallel
     with Pool() as pool:
-        pool.map(add_agent_server, zip(repeat(mets_server_url), list(range(NO_AGENTS))))
+        pool.map(add_agent_server, zip(
+            repeat(mets_server_url),
+            repeat(workspace_server.directory),
+            list(range(NO_AGENTS))))
 
     assert len(workspace_server.mets.agents) == NO_AGENTS + no_agents_before
     # XXX not a tuple
     assert workspace_server.mets.agents[-1].notes[0][0] == {'{https://ocr-d.de}foo': 'bar'}
 
-    workspace_file = Workspace(Resolver(), WORKSPACE_DIR)
+    workspace_file = Workspace(Resolver(), workspace_server.directory)
     assert len(workspace_file.mets.agents) == no_agents_before
 
     # sync
@@ -142,7 +152,7 @@ def test_mets_server_str(start_mets_server):
     mets_server_url, workspace_server = start_mets_server
-    workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url)
+    workspace_server = Workspace(Resolver(), workspace_server.directory, mets_server_url=mets_server_url)
     f = next(workspace_server.find_files())
     assert str(f) == ''
     a = workspace_server.mets.agents[0]
@@ -182,7 +192,7 @@ def test_mets_server_socket_stop(start_mets_server):
         assert True, 'No stop conditions to test for TCP server'
     else:
         assert Path(mets_server_url).exists()
-        assert workspace_server.mets.workspace_path == WORKSPACE_DIR
+        assert workspace_server.mets.workspace_path == workspace_server.directory
         workspace_server.mets.stop()
         with raises(ConnectionError):
             workspace_server.mets.file_groups

From 61e1042303cad08d51761f5a5880cfd7ab73f7d8 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 28 Aug 2024 15:16:04 +0200
Subject: [PATCH 153/249] test processor timeout/parallel: avoid side effects
 to dummy tool json

---
 tests/data/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/data/__init__.py b/tests/data/__init__.py
index c24a6979b..2bf564d39 100644
--- a/tests/data/__init__.py
+++ b/tests/data/__init__.py
@@ -107,7 +107,8 @@ def process(self):
 class DummyProcessorWithOutputPagewise(Processor):
     @property
     def ocrd_tool(self):
-        dummy_tool = dict(DUMMY_TOOL)
+        # make deep copy
+        dummy_tool = json.loads(json.dumps(DUMMY_TOOL))
         dummy_tool['parameters']['sleep'] = {'type': 'number'}
         return dummy_tool

From e395b562b9210d75ede7a76c692fe0b74d531434 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 28 Aug 2024 15:16:55 +0200
Subject: [PATCH 154/249] tests: adapt to wording of exceptions

---
 tests/processor/test_processor.py | 2 +-
 tests/test_workspace.py           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index e0b74fb00..0f5d4fbba 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -284,7 +284,7 @@ def test_run_output_overwrite(self):
             run_processor(DummyProcessorWithOutput, workspace=ws,
                           input_file_grp="GRP1",
                           output_file_grp="OCR-D-OUT")
-        assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists"
+        assert "already exists" in str(exc.value)
         config.OCRD_EXISTING_OUTPUT = 'OVERWRITE'
         run_processor(DummyProcessorWithOutput, workspace=ws,
                       input_file_grp="GRP1",
diff --git
a/tests/test_workspace.py b/tests/test_workspace.py index 9d6b64b1e..ad9cd1557 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -87,7 +87,7 @@ def test_workspace_add_file_overwrite(plain_workspace): plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath) with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath) - assert str(fn_exc.value) == "File with file_id='ID1' already exists" + assert "already exists" in str(fn_exc.value) with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True) assert 'cannot mitigate' in str(fn_exc.value) @@ -684,7 +684,7 @@ def test_merge_overwrite(tmp_path): ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1') ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2') ws1.merge(ws2) - assert "would overwrite" == str(exc.value) + assert "would overwrite" in str(exc.value) def test_merge_with_filter(plain_workspace, tmp_path): # arrange From a59ba6acb03f36db5e8e930d9ea5aba3d5329a29 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:31:38 +0200 Subject: [PATCH 155/249] ClientSideOcrdMets: partial revert of 9b80ae17ef --- src/ocrd/mets_server.py | 49 ++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 41b1f23ec..0aa4174d3 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -147,9 +147,10 @@ def save(self): Request writing the changes to the file system """ if not self.multiplexing_mode: - self.session.put(url=self.url) + self.session.request("PUT", url=self.url) else: - self.session.post( + self.session.request( + "POST", self.url, json=MpxReq.save(self.ws_dir_path) ) @@ -160,10 +161,11 @@ def stop(self): """ try: if not self.multiplexing_mode: - self.session.delete(self.url) + self.session.request("DELETE", self.url) return else: - self.session.post( + self.session.request( + "POST", self.url, json=MpxReq.stop(self.ws_dir_path) ) @@ -176,9 +178,10 @@ def reload(self): Request reloading of the mets file from the file system """ if not self.multiplexing_mode: - return self.session.post(f"{self.url}/reload").text + return self.session.request("POST", f"{self.url}/reload").text else: - return self.session.post( + return self.session.request( + "POST", self.url, json=MpxReq.reload(self.ws_dir_path) ).json()["text"] @@ -186,9 +189,10 @@ def reload(self): @property def unique_identifier(self): if not self.multiplexing_mode: - return self.session.get(f"{self.url}/unique_identifier").text + return self.session.request("GET", f"{self.url}/unique_identifier").text else: - return self.session.post( + return self.session.request( + "POST", self.url, json=MpxReq.unique_identifier(self.ws_dir_path) ).json()["text"] @@ -196,10 +200,11 @@ def unique_identifier(self): @property def workspace_path(self): if not self.multiplexing_mode: - self.ws_dir_path = self.session.get(f"{self.url}/workspace_path").text + self.ws_dir_path = self.session.request("GET", f"{self.url}/workspace_path").text return self.ws_dir_path else: - self.ws_dir_path = self.session.post( + self.ws_dir_path = self.session.request( + "POST", self.url, 
json=MpxReq.workspace_path(self.ws_dir_path) ).json()["text"] @@ -208,9 +213,10 @@ def workspace_path(self): @property def file_groups(self): if not self.multiplexing_mode: - return self.session.get(f"{self.url}/file_groups").json()["file_groups"] + return self.session.request("GET", f"{self.url}/file_groups").json()["file_groups"] else: - return self.session.post( + return self.session.request( + "POST", self.url, json=MpxReq.file_groups(self.ws_dir_path) ).json()["file_groups"] @@ -218,9 +224,10 @@ def file_groups(self): @property def agents(self): if not self.multiplexing_mode: - agent_dicts = self.session.get(f"{self.url}/agent").json()["agents"] + agent_dicts = self.session.request("GET", f"{self.url}/agent").json()["agents"] else: - agent_dicts = self.session.post( + agent_dicts = self.session.request( + "POST", self.url, json=MpxReq.agents(self.ws_dir_path) ).json()["agents"] @@ -231,9 +238,10 @@ def agents(self): def add_agent(self, **kwargs): if not self.multiplexing_mode: - return self.session.post(f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) + return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: - self.session.post( + self.session.request( + "POST", self.url, json=MpxReq.add_agent(self.ws_dir_path, OcrdAgentModel.create(**kwargs).dict()) ).json() @@ -250,9 +258,10 @@ def find_files(self, **kwargs): kwargs["file_grp"] = kwargs.pop("fileGrp") if not self.multiplexing_mode: - r = self.session.get(url=f"{self.url}/file", params={**kwargs}) + r = self.session.request(method="GET", url=f"{self.url}/file", params={**kwargs}) else: - r = self.session.post( + r = self.session.request( + "POST", self.url, json=MpxReq.find_files(self.ws_dir_path, {**kwargs}) ) @@ -277,11 +286,11 @@ def add_file( ) if not self.multiplexing_mode: - r = self.session.post(f"{self.url}/file", data=data.dict()) + r = self.session.request("POST", f"{self.url}/file", data=data.dict()) if not r: raise RuntimeError("Add file failed. 
Please check provided parameters")
         else:
-            r = self.session.post(self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict()))
+            r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict()))
 
             if "error" in r:
                 raise RuntimeError(f"Add file failed: Msg: {r['error']}")

From 554a67d016f8ae5b747826582bd486c3faa396bf Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 28 Aug 2024 17:04:04 +0200
Subject: [PATCH 156/249] disableLogging: re-instate root logger, to Python
 defaults

---
 src/ocrd_utils/logging.py | 4 +++-
 tests/test_decorators.py  | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py
index 5cea55e5b..181805118 100644
--- a/src/ocrd_utils/logging.py
+++ b/src/ocrd_utils/logging.py
@@ -210,11 +210,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG):
     # logging.basicConfig(level=logging.CRITICAL)
     # logging.disable(logging.ERROR)
     # remove all handlers for the ocrd logger
-    for logger_name in ROOT_OCRD_LOGGERS:
+    for logger_name in ROOT_OCRD_LOGGERS + ['']:
         for handler in logging.getLogger(logger_name).handlers[:]:
             logging.getLogger(logger_name).removeHandler(handler)
     for logger_name in LOGGING_DEFAULTS:
         logging.getLogger(logger_name).setLevel(logging.NOTSET)
+    # Python default log level is WARNING
+    logging.root.setLevel(logging.WARNING)
 
 # Initializing stream handlers at module level
 # would cause message output in all runtime contexts,
diff --git a/tests/test_decorators.py b/tests/test_decorators.py
index 5ab288005..df8d6422b 100644
--- a/tests/test_decorators.py
+++ b/tests/test_decorators.py
@@ -64,6 +64,7 @@ def test_loglevel_override(self):
             pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test")
         import logging
         disableLogging()
+        assert logging.getLogger('').getEffectiveLevel() == logging.WARNING
         assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING
         initLogging()
         assert logging.getLogger('ocrd').getEffectiveLevel() == logging.INFO

From 1114cd9165844aa5bbffa7439177fb00d78fee4e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 28 Aug 2024 17:15:56 +0200
Subject: [PATCH 157/249] test-logging: also remove ocrd.log from tempdir

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index e44919d3f..1a4a6bbdb 100644
--- a/Makefile
+++ b/Makefile
@@ -273,7 +273,7 @@ test-logging: assets
 	cp src/ocrd_utils/ocrd_logging.conf $$tempdir; \
 	cd $$tempdir; \
 	$(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging -k TestDecorators $(TESTDIR); \
-	rm -r $$tempdir/ocrd_logging.conf $$tempdir/.benchmarks; \
+	rm -r $$tempdir/ocrd_logging.conf $$tempdir/ocrd.log $$tempdir/.benchmarks; \
 	rm -rf $$tempdir/.coverage; \
 	rmdir $$tempdir

From ce6d23937c13eae2b5c884e77f2243f94770c737 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 29 Aug 2024 00:27:32 +0200
Subject: [PATCH 158/249] Processor: fix 7966057f (deprecated passing of
 ocrd_tool or version via init)

---
 src/ocrd/processor/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
index 5f8eabbb0..614d5cb44 100644
--- a/src/ocrd/processor/base.py
+++ b/src/ocrd/processor/base.py
@@ -301,12 +301,12 @@ def __init__(
         if ocrd_tool is not None:
             deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                 "use or override metadata/executable/ocrd-tool properties instead")
-            self._ocrd_tool = ocrd_tool
-            self._executable =
ocrd_tool['executable'] + self.ocrd_tool = ocrd_tool + self.executable = ocrd_tool['executable'] if version is not None: deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - " "use or override metadata/version properties instead") - self._version = version + self.version = version if workspace is not None: deprecation_warning("Passing a workspace argument other than 'None' to Processor " "is deprecated - pass as argument to process_workspace instead") From df9916074b77ebbed3fcad6e84c3bba66e65b0d9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 00:50:08 +0200 Subject: [PATCH 159/249] Processor.generate_processor_help: forgot to include --log-filename --- src/ocrd/decorators/ocrd_cli_options.py | 2 +- src/ocrd/processor/base.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e069b3ea8..944f60645 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -39,6 +39,7 @@ def cli(mets_url): parameter_option, parameter_override_option, loglevel_option, + option('--log-filename', default=None), option('--address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), @@ -48,7 +49,6 @@ def cli(mets_url): option('-D', '--dump-module-dir', is_flag=True, default=False), option('-h', '--help', is_flag=True, default=False), option('-V', '--version', is_flag=True, default=False), - option('--log-filename', default=None), # Subcommand, only used for 'worker'/'server'. Cannot be handled in # click because processors use the @command decorator and even if they # were using `group`, you cannot combine have a command with diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 614d5cb44..859b5d4f7 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -991,6 +991,7 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) otherwise URL is a path to an on-demand-created unix socket -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] Override log level globally [INFO] + --log-filename LOG-PATH File to redirect stderr logging to (overriding ocrd_logging.conf). 
''' information_options = '''\ From eb74fab45d1e8fe713801dab92a42710f5ef904d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 00:50:34 +0200 Subject: [PATCH 160/249] bashlib: re-add --log-filename, implement as stderr redirect --- src/ocrd/lib.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 6b08f669d..476b41096 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -156,6 +156,7 @@ ocrd__parse_argv () { while [[ "${1:-}" = -* ]];do case "$1" in -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;; + --log-filename) exec 2> "$2" ; shift ;; -h|--help|--usage) ocrd__usage; exit ;; -J|--dump-json) ocrd__dumpjson; exit ;; -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;; From 8565a8f4187df52ac30c40a5d5b03366751882f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 01:19:17 +0200 Subject: [PATCH 161/249] test_processor: add legacy (v2-style) dummy case --- tests/data/__init__.py | 27 +++++++++++++++++++++++++-- tests/processor/test_processor.py | 9 +++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 2bf564d39..1589ae4db 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,6 +1,5 @@ import json import os -import re from time import sleep from pytest import warns from ocrd import Processor, OcrdPageResult @@ -147,7 +146,7 @@ def __init__(self, *args, **kwargs): # no error handling with old process(), so override new API def process_page_file(self, input_file): - n = int(re.findall(r'\d+', input_file.pageId)[-1]) + n = self.workspace.mets.physical_pages.index(input_file.pageId) + 1 if n % 2: raise Exception(f"intermittent failure on page {input_file.pageId}") output_file_id = make_file_id(input_file, self.output_file_grp) @@ -160,6 +159,30 @@ def process_page_file(self, input_file): force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) +class DummyProcessorWithOutputLegacy(Processor): + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + kwargs['ocrd_tool'] = DUMMY_TOOL + kwargs['version'] = '0.0.1' + super().__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp'): + self.setup() + + def process(self): + # print([str(x) for x in self.input_files] + for input_file in self.input_files: + file_id = make_file_id(input_file, self.output_file_grp) + # print(input_file.ID, file_id) + self.workspace.add_file( + file_id=file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + mimetype=input_file.mimetype, + local_filename=os.path.join(self.output_file_grp, file_id), + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + class IncompleteProcessor(Processor): @property def executable(self): diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 0f5d4fbba..1faef5be7 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -9,6 +9,7 @@ DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, + DummyProcessorWithOutputLegacy, DummyProcessorWithOutputPagewise, DummyProcessorWithOutputFailures, IncompleteProcessor @@ -218,6 +219,14 @@ def test_run_output0(self): output_file_grp="OCR-D-OUT") assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2 + def test_run_output_legacy(self): + ws = self.workspace + run_processor(DummyProcessorWithOutputLegacy, + workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert 
len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + def test_run_output_missing(self): ws = self.workspace from ocrd_utils import config From abe069a490ee541ae04ef8f6214ab4250c880570 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 01:48:44 +0200 Subject: [PATCH 162/249] :memo: update changelog --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a3807c0b..b9e660ebc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,10 +12,13 @@ Added: - `OCRD_PROCESSING_PAGE_TIMEOUT` for whether and how long processors should wait for single pages - `OCRD_MAX_MISSING_OUTPUTS` for maximum rate (fraction) of pages before making `OCRD_MISSING_OUTPUT=abort` +Fixed: + - `disableLogging`: also re-instate root logger to Python defaults + ## [3.0.0b1] - 2024-08-26 Fixed: - - actually apply CLI `--log-filename` + - actually apply CLI `--log-filename`, and show in `--help` - adapt to Pillow changes - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) From 11f926412f5b71a05483eac34d9313a652ba3b5b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 01:48:46 +0200 Subject: [PATCH 163/249] :memo: update readmes (esp. new config variables) --- README.md | 40 +++++++++++++++++++++++++++++++--------- README_bashlib.md | 10 +++++++++- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b401428ee..d41a2dddb 100644 --- a/README.md +++ b/README.md @@ -47,17 +47,12 @@ complete stack of OCR-D-related software. The easiest way to install is via `pip`: -```sh -pip install ocrd + pip install ocrd -# or just the functionality you need, e.g. - -pip install ocrd_modelfactory -``` All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher. -**NOTE** Some OCR-D-Tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like: +> **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like: * using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes * custom Python logging configurations in your personal account @@ -82,7 +77,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an Some parts of the software are configured via environment variables: -* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification. * `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens: * `CPU`: Enable CPU profiling of processor runs * `RSS`: Enable RSS memory profiling @@ -95,18 +89,46 @@ Some parts of the software are configured via environment variables: * `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`. * `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`. -* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files. +* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files. 
* `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading. +* `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing: + * `SKIP`: ignore and proceed with next page's input + * `ABORT`: throw `MissingInputFile` exception + +* `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing: + * `SKIP`: ignore and proceed processing next page + * `COPY`: fall back to copying input PAGE to output fileGrp for page + * `ABORT`: re-throw whatever caused processing to fail + +* `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative). + +* `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing: + * `SKIP`: ignore and proceed processing next page + * `OVERWRITE`: force writing result to output fileGrp for page + * `ABORT`: re-throw `FileExistsError` exception + + * `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations. * `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers. +* `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation. + +* `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies. + * `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`). * `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`). * `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`). * `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started. +* `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again. +* `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds). + +* `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created. +* `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored. + + ## Packages diff --git a/README_bashlib.md b/README_bashlib.md index 09199468c..20379c3c9 100644 --- a/README_bashlib.md +++ b/README_bashlib.md @@ -21,6 +21,9 @@ For example: * [`ocrd__log`](#ocrd__log) * [`ocrd__minversion`](#ocrd__minversion) * [`ocrd__dumpjson`](#ocrd__dumpjson) +* [`ocrd__resolve_resource`](#ocrd__resolve_resource) +* [`ocrd__show_resource`](#ocrd__show_resource) +* [`ocrd__list_resources`](#ocrd__list_resources) * [`ocrd__usage`](#ocrd__usage) * [`ocrd__parse_argv`](#ocrd__parse_argv) @@ -56,6 +59,10 @@ export OCRD_TOOL_NAME=ocrd-foo-bar (Which you automatically get from [`ocrd__wrap`](#ocrd__wrap).) +### `ocrd__resolve_resource` + +Output given resource file's path. 
+ ### `ocrd__show_resource` Output given resource file's content. @@ -88,6 +95,7 @@ This will be filled by the parser along the following keys: - `profile`: whether `--profile` is enabled - `profile_file`: the argument of `--profile-file` - `log_level`: the argument of `--log-level` +- `mets_server_url`: the argument of `--mets-server-url` argument - `mets_file`: absolute path of the `--mets` argument - `working_dir`: absolute path of the `--working-dir` argument or the parent of `mets_file` - `page_id`: the argument of `--page-id` @@ -95,7 +103,7 @@ This will be filled by the parser along the following keys: - `output_file_grp`: the argument of `--output-file-grp` Moreover, there will be an associative array **`params`** -with the fully expanded runtime values of the ocrd-tool.json parameters. +with the fully validated and default-expanded runtime values of the `ocrd-tool.json` parameters. ### `ocrd__wrap` From ca8812228b87c635adeefe74be4c48a89ff063c0 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 30 Aug 2024 13:26:53 +0200 Subject: [PATCH 164/249] :package: v3.0.0b2 --- CHANGELOG.md | 7 +++---- VERSION | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9e660ebc..ccb31bc1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). -## Unreleased +## [3.0.0b2] - 2024-08-30 Added: - `Processor.max_workers`: class attribute to control per-page parallelism of this implementation @@ -2244,12 +2244,11 @@ Fixed Initial Release -<<<<<<< HEAD +[3.0.0b2]: ../../compare/v3.0.0b2..v3.0.0b1 +[3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 [3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 [3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2 -======= [2.68.0]: ../../compare/v2.68.0..v2.67.2 ->>>>>>> @{-1} [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 [2.67.0]: ../../compare/v2.67.0..v2.66.1 diff --git a/VERSION b/VERSION index 2daa89b06..2aa4d8f0a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b1 +3.0.0b2 From 837aba7f55fa5b042bda699acf0dea12b3a20f67 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:01 +0200 Subject: [PATCH 165/249] ocrd_utils.config: add reset_defaults() --- src/ocrd_utils/config.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 0186b8539..36399870e 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -78,14 +78,26 @@ def has_default(self, name): raise ValueError(f"Unregistered env variable {name}") return self._variables[name].has_default + def reset_defaults(self): + for name in self._variables: + try: + # we cannot use hasattr, because that delegates to getattr, + # which we override and provide defaults for (which of course + # cannot be removed) + if self.__getattribute__(name): + delattr(self, name) + except AttributeError: + pass + def describe(self, name, *args, **kwargs): if not name in self._variables: raise ValueError(f"Unregistered env variable {name}") return self._variables[name].describe(*args, **kwargs) def __getattr__(self, name): + # will be called if name is not accessible (has not been added directly yet) if not name in self._variables: - raise ValueError(f"Unregistered env variable {name}") + raise AttributeError(f"Unregistered env variable {name}") var_obj = self._variables[name] try: raw_value = self.raw_value(name) From 85e96ffbd787ec3aedac73a9368bdd903124b877 Mon Sep 17 
00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:31 +0200 Subject: [PATCH 166/249] add test for OcrdEnvConfig.reset_defaults() --- tests/utils/test_config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 99595a864..a94eb5d3c 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -57,3 +57,11 @@ def test_OCRD_PROFILE(): with temp_env_var('OCRD_PROFILE', 'some other value'): with raises(ValueError, match="'OCRD_PROFILE' set to invalid value 'some other value'"): config.OCRD_PROFILE + +def test_defaults(): + default = config.OCRD_MAX_PROCESSOR_CACHE + print(type(default)) + config.OCRD_MAX_PROCESSOR_CACHE = 2 + assert config.OCRD_MAX_PROCESSOR_CACHE == 2 + config.reset_defaults() + assert config.OCRD_MAX_PROCESSOR_CACHE == default From 8911c3be2947870a0193a688fa3a6cf72a9751bc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 15:17:14 +0200 Subject: [PATCH 167/249] Processor: improve processing log messages --- src/ocrd/processor/base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 859b5d4f7..a72e4dd3d 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -470,7 +470,7 @@ def process_workspace(self, workspace: Workspace) -> None: max_workers=max_workers or 1, thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" ) - self._base_logger.debug("started executor %s", str(executor)) + self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) tasks = {} for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): @@ -478,7 +478,7 @@ def process_workspace(self, workspace: Workspace) -> None: page_id = next(input_file.pageId for input_file in input_file_tuple if input_file) - self._base_logger.info(f"processing page {page_id}") + self._base_logger.info(f"preparing page {page_id}") for i, input_file in enumerate(input_file_tuple): if input_file is None: # file/page not found in this file grp @@ -521,9 +521,10 @@ def process_workspace(self, workspace: Workspace) -> None: # broad coverage of output failures (including TimeoutError) except (Exception, TimeoutError) as err: # FIXME: add re-usable/actionable logging - self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") raise err + self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'SKIP': nr_skipped += 1 continue @@ -587,6 +588,7 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) assert isinstance(input_files[0], get_args(OcrdFileType)) page_id = input_files[0].pageId + self._base_logger.info("processing page %s", page_id) for i, input_file in enumerate(input_files): assert isinstance(input_file, get_args(OcrdFileType)) self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") From 98d97fc5b2afa7917ffc04335937ccb4fdbbc984 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 15:18:00 +0200 Subject: [PATCH 168/249] ocrd.cli doc: don't rewrap description lists --- src/ocrd/cli/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
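[Editor's note: the natural consumer of the new `reset_defaults()` is test isolation, complementing the side-effect fixes in the surrounding patches - e.g. as an autouse teardown. A sketch; the fixture itself is illustrative, not part of these patches:

```python
from pytest import fixture
from ocrd_utils import config

@fixture(autouse=True)
def restore_config():
    yield                    # run the test first,
    config.reset_defaults()  # then undo any config mutations it made

def test_skip_mode():
    config.OCRD_MISSING_OUTPUT = 'SKIP'  # visible to this test only
    assert config.OCRD_MISSING_OUTPUT == 'SKIP'
```
]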
a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index fee3c47d8..9e8a37b8b 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -61,11 +61,11 @@ def get_help(self, ctx): \b {config.describe('OCRD_DOWNLOAD_INPUT')} \b -{config.describe('OCRD_MISSING_INPUT')} +{config.describe('OCRD_MISSING_INPUT', wrap_text=False)} \b -{config.describe('OCRD_MISSING_OUTPUT')} +{config.describe('OCRD_MISSING_OUTPUT', wrap_text=False)} \b -{config.describe('OCRD_EXISTING_OUTPUT')} +{config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)} \b {config.describe('OCRD_METS_CACHING')} \b From cb758e8dfc97aa5a41f5ee08fae3e459c4c283cc Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 30 Aug 2024 15:42:07 +0200 Subject: [PATCH 169/249] :package: v3.0.0b3 --- CHANGELOG.md | 7 +++++++ VERSION | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ccb31bc1a..115d68334 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). +## [3.0.0b3] - 2024-08-30 + +Added: + + * `OcrdConfig.reset_defaults` to reset config variables to their defaults + ## [3.0.0b2] - 2024-08-30 Added: @@ -2244,6 +2250,7 @@ Fixed Initial Release +[3.0.0b3]: ../../compare/v3.0.0b3..v3.0.0b2 [3.0.0b2]: ../../compare/v3.0.0b2..v3.0.0b1 [3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 [3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 diff --git a/VERSION b/VERSION index 2aa4d8f0a..005e92c1e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b2 +3.0.0b3 From 1ed38a6a7559bc0c109e8130220de49698796efd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 31 Aug 2024 01:56:38 +0200 Subject: [PATCH 170/249] Processor.metadata_location: find location package prefix (necessary for namespace packages) --- src/ocrd/processor/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index a72e4dd3d..a18e53f5a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -166,11 +166,14 @@ def metadata_location(self) -> Path: (Override if ``ocrd-tool.json`` is not distributed with the Python package.) 
""" - # XXX HACK - module_tokens = self.__module__.split('.') - if module_tokens[0] == 'src': - module_tokens.pop(0) - return resource_filename(module_tokens[0], self.metadata_filename) + module = inspect.getmodule(self) + module_tokens = module.__package__.split('.') + # for namespace packages, we cannot just use the first token + for i in range(len(module_tokens)): + prefix = '.'.join(module_tokens[:i + 1]) + if sys.modules[prefix].__spec__.has_location: + return resource_filename(prefix, self.metadata_filename) + raise Exception("cannot find top-level module prefix for %s", module.__package__) @cached_property def metadata_rawdict(self) -> dict: From 7d98c270110eda161ca7401645aab778a3911542 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 10:13:38 +0200 Subject: [PATCH 171/249] Processor: log when max_workers / max_page_seconds are in effect --- src/ocrd/processor/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index a18e53f5a..a28643660 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -458,17 +458,17 @@ def process_workspace(self, workspace: Workspace) -> None: nr_copied = 0 # set up multithreading - if self.max_workers <= 0: - max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) - else: - max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers)) + max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) + if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES: + self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers) + max_workers = self.max_workers if max_workers > 1: assert isinstance(workspace.mets, ClientSideOcrdMets), \ "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url" - if self.max_page_seconds <= 0: - max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT) - else: - max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds)) + max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT) + if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT: + self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) + max_seconds = self.max_page_seconds executor = ThreadPoolExecutor( max_workers=max_workers or 1, thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" From 6b23b659f514f3672c7f5c6d29feeb1d85a26435 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 10:14:13 +0200 Subject: [PATCH 172/249] Workspace.reload_mets: fix for METS server case --- src/ocrd/workspace.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 270414ec4..4a99a112c 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -121,7 +121,10 @@ def reload_mets(self): """ Reload METS from the filesystem. 
""" - self.mets = OcrdMets(filename=self.mets_target) + if self.is_remote: + self.mets.reload() + else: + self.mets = OcrdMets(filename=self.mets_target) @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") From cac05cd31a4cf8a6aa4d3aef7b9b772ec4de96a6 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 2 Sep 2024 11:35:52 +0200 Subject: [PATCH 173/249] :memo: changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 115d68334..767dea5c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). +## Unreleased + +Fixed: + + * `Processor.metadata_location`: `src` workaround respects namespace packages, qurator-spk/eynollah#134 + * `Workspace.reload_mets`: handle ClientSideOcrdMets as well + ## [3.0.0b3] - 2024-08-30 Added: From 0b0d419c3304747f25cb3be509b182e061870be5 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 2 Sep 2024 11:36:21 +0200 Subject: [PATCH 174/249] :package: v3.0.0b4 --- CHANGELOG.md | 2 ++ VERSION | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 767dea5c2..7ec12c893 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b4] - 2024-09-02 + Fixed: * `Processor.metadata_location`: `src` workaround respects namespace packages, qurator-spk/eynollah#134 diff --git a/VERSION b/VERSION index 005e92c1e..9414e1270 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b3 +3.0.0b4 From a34beb8ec01c8af34eb7a58fd7fd8ba4f362dc46 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 14:59:42 +0200 Subject: [PATCH 175/249] OcrdMetsServer.add_file: pass on 'force' kwarg, too --- src/ocrd/mets_server.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 0aa4174d3..2f7b9842b 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -284,15 +284,17 @@ def add_file( file_id=ID, page_id=pageId, mimetype=mimetype, url=url, local_filename=local_filename ) + # add force+ignore + kwargs = {**kwargs, **data.dict()} if not self.multiplexing_mode: - r = self.session.request("POST", f"{self.url}/file", data=data.dict()) - if not r: - raise RuntimeError("Add file failed. 
Please check provided parameters")
+            r = self.session.request("POST", f"{self.url}/file", data=kwargs)
+            if not r.ok:
+                raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}")
         else:
-            r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict()))
-            if "error" in r:
-                raise RuntimeError(f"Add file failed: Msg: {r['error']}")
+            r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs))
+            if not r.ok:
+                raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()['error']}")
         return ClientSideOcrdFile(
             None,
             fileGrp=file_grp,
@@ -505,7 +507,8 @@ async def add_file(
         page_id: Optional[str] = Form(),
         mimetype: str = Form(),
         url: Optional[str] = Form(None),
-        local_filename: Optional[str] = Form(None)
+        local_filename: Optional[str] = Form(None),
+        force: bool = Form(False),
     ):
         """
         Add a file
@@ -517,7 +520,7 @@
         )
         # Add to workspace
         kwargs = file_resource.dict()
-        workspace.add_file(**kwargs)
+        workspace.add_file(**kwargs, force=force)
         return file_resource
 
 # ------------- #

From dfa715db56ce538e3b6dc4f849983e392bff4296 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 2 Sep 2024 15:00:38 +0200
Subject: [PATCH 176/249] test_mets_server: add test for force (overwrite)

---
 tests/test_mets_server.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py
index 8f94b9564..dc94d6c56 100644
--- a/tests/test_mets_server.py
+++ b/tests/test_mets_server.py
@@ -55,7 +55,7 @@ def _start_mets_server(*args, **kwargs):
     p.terminate()
     rmtree(tmpdir, ignore_errors=True)
 
-def add_file_server(x):
+def add_file_server(x, force=False):
     mets_server_url, directory, i = x
     workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url)
     workspace_server.add_file(
@@ -65,6 +65,7 @@ def add_file_server(x):
         page_id=f'page{i}',
         file_id=f'FOO_page{i}_foo{i}',
         # url=f'url{i}'
+        force=force
     )
 
 def add_agent_server(x):
@@ -123,6 +124,19 @@ def test_mets_server_add_file(start_mets_server):
 
     assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == NO_FILES
 
+def test_mets_server_add_file_overwrite(start_mets_server):
+    mets_server_url, workspace_server = start_mets_server
+
+    add_file_server((mets_server_url, workspace_server.directory, 5))
+
+    assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1
+
+    with raises(RuntimeError, match="already exists"):
+        add_file_server((mets_server_url, workspace_server.directory, 5))
+
+    add_file_server((mets_server_url, workspace_server.directory, 5), force=True)
+    assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1
+
 def test_mets_server_add_agents(start_mets_server):
     NO_AGENTS = 30
 

From 9a8c41db32e08795dcd5c1614d654e820911abdb Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 2 Sep 2024 15:01:37 +0200
Subject: [PATCH 177/249] test_processor: add test for force (overwrite) w/
 METS Server

---
 tests/data/__init__.py            |  2 +-
 tests/processor/test_processor.py | 35 +++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/tests/data/__init__.py b/tests/data/__init__.py
index 1589ae4db..11b7b01cc 100644
--- a/tests/data/__init__.py
+++ b/tests/data/__init__.py
@@ -103,7 +103,7 @@ def process(self):
             force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
         )
 
-class DummyProcessorWithOutputPagewise(Processor):
+class DummyProcessorWithOutputSleep(Processor):
     @property
     def ocrd_tool(self):
         # make deep copy
diff --git
a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 1faef5be7..4305f0e68 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -10,7 +10,7 @@ DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, DummyProcessorWithOutputLegacy, - DummyProcessorWithOutputPagewise, + DummyProcessorWithOutputSleep, DummyProcessorWithOutputFailures, IncompleteProcessor ) @@ -266,7 +266,7 @@ def test_run_output_timeout(self): config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MISSING_OUTPUT = 'ABORT' config.OCRD_PROCESSING_PAGE_TIMEOUT = 3 - run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 1}) @@ -275,7 +275,7 @@ def test_run_output_timeout(self): config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 from concurrent.futures import TimeoutError with pytest.raises(TimeoutError) as exc: - run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 3}) @@ -419,6 +419,33 @@ def ocrd_tool(self): r = self.capture_out_err() assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err +def test_run_output_metsserver(start_mets_server): + mets_server_url, ws = start_mets_server + from ocrd_utils import config + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'ABORT' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + mets_server_url=mets_server_url) + assert "already exists" in str(exc.value) + # 2s (+ 2s tolerance) instead of 3*3s (+ 2s tolerance) @pytest.mark.timeout(4) def test_run_output_parallel(start_mets_server): @@ -429,7 +456,7 @@ def test_run_output_parallel(start_mets_server): # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MAX_PARALLEL_PAGES = 3 - run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 2}, From 65ab63c7bde1fa468c63cb1f29a497d3a8f55fcf Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 26 Aug 2024 11:46:14 +0200 Subject: [PATCH 178/249] add typing, extend docs --- src/ocrd/processor/base.py | 4 ++-- src/ocrd/processor/helpers.py | 9 +++++---- tests/data/__init__.py | 13 +++++++------ tests/processor/test_processor.py | 3 ++- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index a28643660..693176800 100644 --- a/src/ocrd/processor/base.py +++ 
b/src/ocrd/processor/base.py @@ -16,7 +16,7 @@ import os from os import getcwd from pathlib import Path -from typing import List, Optional, Union, get_args +from typing import Any, List, Optional, Union, get_args import sys import inspect import tarfile @@ -339,7 +339,7 @@ def __init__( self._finalizer = weakref.finalize(self, self.shutdown) # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', - deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) + deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process'))) def show_help(self, subcommand=None): """ diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index a675ff129..22837e212 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -234,10 +234,10 @@ def get_cached_processor(parameter: dict, processor_class): def get_processor( processor_class, parameter: Optional[dict] = None, - workspace: Workspace = None, - page_id: str = None, - input_file_grp: List[str] = None, - output_file_grp: List[str] = None, + workspace: Optional[Workspace] = None, + page_id: Optional[str] = None, + input_file_grp: Optional[List[str]] = None, + output_file_grp: Optional[List[str]] = None, instance_caching: bool = False, ): if processor_class: @@ -258,6 +258,7 @@ def get_processor( else: # avoid passing workspace already (deprecated chdir behaviour) processor = processor_class(None, parameter=parameter) + assert processor # set current processing parameters processor.workspace = workspace processor.page_id = page_id diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 11b7b01cc..4dcf29fa0 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,3 +1,4 @@ +from functools import cached_property import json import os from time import sleep @@ -72,15 +73,15 @@ def __init__(self, *args, **kwargs): def process(self): pass class DummyProcessorWithOutput(Processor): - @property + @cached_property def ocrd_tool(self): return DUMMY_TOOL - @property + @cached_property def version(self): return '0.0.1' - @property + @cached_property def executable(self): return 'ocrd-test' @@ -128,15 +129,15 @@ def process_page_pcgts(self, pcgts, page_id=None): return OcrdPageResult(pcgts) class DummyProcessorWithOutputFailures(Processor): - @property + @cached_property def ocrd_tool(self): return DUMMY_TOOL - @property + @cached_property def version(self): return '0.0.1' - @property + @cached_property def executable(self): return 'ocrd-test' diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 4305f0e68..1497927a0 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -1,3 +1,4 @@ +from functools import cached_property import json from contextlib import ExitStack @@ -188,7 +189,7 @@ def test_params_preset_resolve(self): def test_params(self): class ParamTestProcessor(Processor): - @property + @cached_property def ocrd_tool(self): return {} proc = ParamTestProcessor(None) From 73a395e40ce4aed9007a6959af56476503dcb008 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 22:19:23 +0200 Subject: [PATCH 179/249] Processor.verify: revert 5819c816 (we still have no defaults in json loaded from v2) --- src/ocrd/processor/base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff 
--git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 693176800..02595f4c2 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -374,10 +374,13 @@ def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], assert len(grps) >= minimum, msg % (len(grps), str(spec)) if maximum > 0: assert len(grps) <= maximum, msg % (len(grps), str(spec)) - assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], - "Unexpected number of input file groups %d vs %s") - assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], - "Unexpected number of output file groups %d vs %s") + # FIXME: enforce unconditionally as soon as grace period for deprecation is over + if 'input_file_grp_cardinality' in self.ocrd_tool: + assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], + "Unexpected number of input file groups %d vs %s") + if 'output_file_grp_cardinality' in self.ocrd_tool: + assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], + "Unexpected number of output file groups %d vs %s") for input_file_grp in input_file_grps: assert input_file_grp in self.workspace.mets.file_groups # keep this for backwards compatibility: From 3382ad985f911aadcfb1a1a3b62de8de25d424a0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 22:21:52 +0200 Subject: [PATCH 180/249] Processor.process_page_file / OcrdPageResultImage: allow None instead of AlternativeImageType --- src/ocrd/processor/base.py | 2 ++ src/ocrd/processor/ocrd_page_result.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 02595f4c2..94e8cce54 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -619,6 +619,8 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: image_result.alternative_image.set_imageHeight(image_result.pil.height) elif isinstance(image_result.alternative_image, AlternativeImageType): image_result.alternative_image.set_filename(image_file_path) + elif image_result.alternative_image is None: + pass # do not reference in PAGE result else: raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type " f"{type(image_result.alternative_image)}") diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py index dcd8ccd44..5f21a72f5 100644 --- a/src/ocrd/processor/ocrd_page_result.py +++ b/src/ocrd/processor/ocrd_page_result.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import List, Union +from typing import List, Union, Optional from ocrd_models.ocrd_page import OcrdPage from PIL.Image import Image @@ -9,7 +9,7 @@ class OcrdPageResultImage(): pil : Image file_id_suffix : str - alternative_image : Union[AlternativeImageType, PageType] + alternative_image : Optional[Union[AlternativeImageType, PageType]] @dataclass class OcrdPageResult(): From cad477723599ef5ae271e73e5f39f91fef8ae1e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:59:51 +0200 Subject: [PATCH 181/249] PcGts.Page.id / make_xml_id: replace '/' with '_' --- src/ocrd_utils/str.py | 3 ++- tests/model/test_ocrd_page.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 6a973fac7..13d03cc5b 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -108,10 +108,11 @@ 
def make_xml_id(idstr: str) -> str: ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') + ret = ret.replace('/', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) ret = re.sub(r'[^\w.-]', r'', ret) return ret - + def nth_url_segment(url, n=-1): """ Return the last /-delimited segment of a URL-like string diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 7dc130809..97335775d 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -460,7 +460,7 @@ def test_id(): # TODO: is this *really* desired? # I would expect for a single Page-Element the ID is like from the top-level-Pgts-Container, not like a fileName - assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif' + assert pcgts.get_Page().id == 'OCR-D-IMG_INPUT_0017.tif' if __name__ == '__main__': From 10b2abc0b0d7cd4125c5ecb66f66306b05119873 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:35:07 +0200 Subject: [PATCH 182/249] ocrd.cli.ocrd-tool resolve-resource: fix (forgot to print result) --- src/ocrd/cli/ocrd_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index fa815daeb..3ceaba40c 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -125,7 +125,7 @@ def ocrd_tool_tool_list_resources(ctx): @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_resolve_resource(ctx, res_name): - ctx.processor(None).resolve_resource(res_name) + print(ctx.processor(None).resolve_resource(res_name)) @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource") @click.argument('res_name') From bd644441605cba4ae5cb433aba7b92bc3c475ff3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:10:12 +0200 Subject: [PATCH 183/249] processor CLI: delegate --resolve-resource, too --- src/ocrd/decorators/ocrd_cli_options.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index 944f60645..a401264ed 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -43,6 +43,7 @@ def cli(mets_url): option('--address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), + option('-R', '--resolve-resource'), option('-C', '--show-resource'), option('-L', '--list-resources', is_flag=True, default=False), option('-J', '--dump-json', is_flag=True, default=False), From 71e9841ad5a1c6e73c83a605e9f9a26a9b2e2b25 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 16:32:55 +0200 Subject: [PATCH 184/249] METS Server: also export+delegate physical_pages --- src/ocrd/mets_server.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 2f7b9842b..101727e06 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -88,6 +88,14 @@ def create(file_groups: List[str]): return OcrdFileGroupListModel(file_groups=file_groups) +class OcrdPageListModel(BaseModel): + physical_pages: List[str] = Field() + + @staticmethod + def create(physical_pages: List[str]): + return OcrdPageListModel(physical_pages=physical_pages) + + class OcrdAgentListModel(BaseModel): agents: List[OcrdAgentModel] = Field() @@ -210,6 +218,17 @@ def workspace_path(self): ).json()["text"] return self.ws_dir_path + @property + def physical_pages(self) -> List[str]: + if not 
self.multiplexing_mode: + return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"] + else: + return self.session.request( + "POST", + self.url, + json=MpxReq.physical_pages(self.ws_dir_path) + ).json()["physical_pages"] + @property def file_groups(self): if not self.multiplexing_mode: @@ -349,6 +368,11 @@ def workspace_path(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={}) + @staticmethod + def physical_pages(ws_dir_path: str) -> Dict: + return MpxReq.__args_wrapper( + ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={}) + @staticmethod def file_groups(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( @@ -468,6 +492,10 @@ async def unique_identifier(): async def workspace_path(): return Response(content=workspace.directory, media_type="text/plain") + @app.get(path='/physical_pages', response_model=OcrdPageListModel) + async def physical_pages(): + return {'physical_pages': workspace.mets.physical_pages} + @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): return {'file_groups': workspace.mets.file_groups} From 01ccdf152456e261a9334d2bebb9f1703ff53477 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:08 +0200 Subject: [PATCH 185/249] ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) --- src/ocrd/cli/workspace.py | 85 +++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 47 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 3aece3493..ca59916cd 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -36,6 +36,17 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url) self.automatic_backup = automatic_backup + def workspace(self): + return Workspace( + self.resolver, + directory=self.directory, + mets_basename=self.mets_basename, + automatic_backup=self.automatic_backup, + mets_server_url=self.mets_server_url, + ) + def backup_manager(self): + return WorkspaceBackupManager(self.workspace()) + pass_workspace = click.make_pass_decorator(WorkspaceCtx) @@ -138,6 +149,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -173,10 +185,11 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, - clobber_mets=clobber_mets + clobber_mets=clobber_mets, ) workspace.save_mets() print(workspace.directory) @@ -200,13 +213,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ Add a file or http(s) URL FNAME to METS in a workspace. If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace. 
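+
+        Example (fileGrp, file ID and page ID are illustrative):
+
+            ocrd workspace add -G OCR-D-IMG -i OCR-D-IMG_0001 -g phys_0001 -m image/tiff image1.tif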
""" - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() log = getLogger('ocrd.cli.workspace.add') if not mimetype: @@ -313,13 +320,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() try: pat = re.compile(regex) @@ -455,12 +456,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl output_field = [snake_to_camel.get(x, x) for x in output_field] modified_mets = False ret = [] - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -510,7 +506,7 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) workspace.save_mets() @@ -528,7 +524,7 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -549,7 +545,7 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) workspace.save_mets() @@ -571,7 +567,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -608,8 +604,7 @@ def clean(ctx, dry_run, directories, path_glob): If no PATH_GLOB are specified, then all files and directories may match. 
""" - log = getLogger('ocrd.cli.workspace.clean') - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)] allowed_files.append(relpath(workspace.mets_target, start=workspace.directory)) allowed_dirs = set(dirname(path) for path in allowed_files) @@ -627,7 +622,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_files: continue if dry_run: - log.info('unlink(%s)' % path) + ctx.log.info('unlink(%s)' % path) else: unlink(path) if not directories: @@ -637,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_dirs: continue if dry_run: - log.info('rmdir(%s)' % path) + ctx.log.info('rmdir(%s)' % path) else: rmdir(path) @@ -651,7 +646,7 @@ def list_groups(ctx): """ List fileGrp USE attributes """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() print("\n".join(workspace.mets.file_groups)) # ---------------------------------------------------------------------- @@ -677,7 +672,7 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() find_kwargs = {} if page_id_range and 'ID' in output_field: find_kwargs['pageId'] = page_id_range @@ -724,7 +719,7 @@ def get_id(ctx): """ Get METS id if any """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() ID = workspace.mets.unique_identifier if ID: print(ID) @@ -744,7 +739,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin Otherwise will create a new {{ ID }}. 
""" - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.unique_identifier = id workspace.save_mets() @@ -767,7 +762,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() except Exception as err: @@ -805,7 +800,7 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( other_workspace, @@ -829,11 +824,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa # ---------------------------------------------------------------------- @workspace_cli.group('backup') -@click.pass_context +@pass_workspace def workspace_backup_cli(ctx): # pylint: disable=unused-argument """ Backing and restoring workspaces - dev edition """ + assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server" @workspace_backup_cli.command('add') @pass_workspace @@ -841,7 +837,7 @@ def workspace_backup_add(ctx): """ Create a new backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.add() @workspace_backup_cli.command('list') @@ -850,7 +846,7 @@ def workspace_backup_list(ctx): """ List backups """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() for b in backup_manager.list(): print(b) @@ -862,7 +858,7 @@ def workspace_backup_restore(ctx, choose_first, bak): """ Restore backup BAK """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.restore(bak, choose_first) @workspace_backup_cli.command('undo') @@ -871,7 +867,7 @@ def workspace_backup_undo(ctx): """ Restore the last backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.undo() @@ -888,13 +884,8 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument @workspace_serve_cli.command('stop') @pass_workspace def workspace_serve_stop(ctx): # pylint: disable=unused-argument - """Stop the METS server""" - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - mets_server_url=ctx.mets_server_url, - ) + """Stop the METS server (saving changes to disk)""" + workspace = ctx.workspace() workspace.mets.stop() 
@workspace_serve_cli.command('start') From 3301f9c64071d0fe7a8b99038b8080263c2bb6a1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:32 +0200 Subject: [PATCH 186/249] ocrd.cli.workspace server: add 'reload' and 'save' --- src/ocrd/cli/workspace.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ca59916cd..ca4e8629d 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -888,6 +888,20 @@ def workspace_serve_stop(ctx): # pylint: disable=unused-argument workspace = ctx.workspace() workspace.mets.stop() +@workspace_serve_cli.command('reload') +@pass_workspace +def workspace_serve_reload(ctx): # pylint: disable=unused-argument + """Reload the METS server from disk""" + workspace = ctx.workspace() + workspace.mets.reload() + +@workspace_serve_cli.command('save') +@pass_workspace +def workspace_serve_save(ctx): # pylint: disable=unused-argument + """Save the METS changes to disk""" + workspace = ctx.workspace() + workspace.mets.save() + @workspace_serve_cli.command('start') @pass_workspace def workspace_serve_start(ctx): # pylint: disable=unused-argument From dc2c758e80318b1f230ad1ce80f5c38ac97d425e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:32:32 +0200 Subject: [PATCH 187/249] ocrd.cli.bashlib input-files: pass on --mets-server-url, too --- src/ocrd/cli/bashlib.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index d46c81ee4..b6817abe9 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -76,10 +76,10 @@ def bashlib_constants(name): @click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None) @click.option('--executable', help="name of processor executable in ocrd-tool.json", default=None) @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) -@click.option('-w', '--working-dir', help="Working Directory") +@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server', default=None) +@click.option('-d', '--working-dir', help="Working Directory") @click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None) @click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None) -# repeat some other processor options for convenience (will be ignored here) @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n" "(with '--page-id', remove only those).\n" @@ -126,9 +126,10 @@ def metadata_location(self): def executable(self): # needed for ocrd_tool lookup return executable + processor_class = FullBashlibProcessor else: # we have no true metadata file, so fill in just to make it work - class FullBashlibProcessor(BashlibProcessor): + class UnknownBashlibProcessor(BashlibProcessor): @property def ocrd_tool(self): # needed to satisfy the validator @@ -142,5 +143,6 @@ def ocrd_tool(self): def version(self): # needed to satisfy the validator and wrapper return '1.0' + processor_class = UnknownBashlibProcessor - ocrd_cli_wrap_processor(FullBashlibProcessor, **kwargs) + ocrd_cli_wrap_processor(processor_class, **kwargs) From 42af6a33ccd6b72899606b4c69789951a1cf445a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:36:03 +0200 Subject: [PATCH 188/249] ocrd.cli.validate 
tasks: pass on --mets-server-url, too --- src/ocrd/cli/validate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index 61d26988a..a1ec8fafd 100644 --- a/src/ocrd/cli/validate.py +++ b/src/ocrd/cli/validate.py @@ -102,16 +102,19 @@ def validate_page(page, **kwargs): @validate_cli.command('tasks') @click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax') @click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace') +@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server') @click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.') @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.argument('tasks', nargs=-1, required=True) -def validate_process(tasks, workspace, mets_basename, overwrite, page_id): +def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id): ''' Validate a sequence of tasks passable to `ocrd process` ''' if workspace: - _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], - Workspace(Resolver(), directory=workspace, mets_basename=mets_basename), page_id=page_id, overwrite=overwrite)) + _inform_of_result(validate_tasks( + [ProcessorTask.parse(t) for t in tasks], + Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url), + page_id=page_id, overwrite=overwrite)) else: for t in [ProcessorTask.parse(t) for t in tasks]: _inform_of_result(t.validate()) From 7ea8d57e688621810cf6d04e261edcfd30739d75 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:48:47 +0200 Subject: [PATCH 189/249] Processor.process_workspace(): do not show NotImplementedError context if fallback process() raises anything itself --- src/ocrd/processor/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 94e8cce54..fac9825bf 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -549,7 +549,11 @@ def process_workspace(self, workspace: Workspace) -> None: except NotImplementedError: # fall back to deprecated method - self.process() + try: + self.process() + except Exception as err: + # suppress the NotImplementedError context + raise err from None def _copy_page_file(self, input_file : OcrdFileType) -> None: """ From 975125644da4be0a71f55016dab4a066da2d31b1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:43:03 +0200 Subject: [PATCH 190/249] Processor.verify: check output fileGrps as well (or OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP or disjoint --page-id) --- src/ocrd/processor/base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index fac9825bf..d669d29f6 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -358,6 +358,7 @@ def verify(self): """ Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements. 
""" + # verify input and output file groups in parameters assert self.input_file_grp is not None assert self.output_file_grp is not None input_file_grps = self.input_file_grp.split(',') @@ -381,8 +382,16 @@ def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], if 'output_file_grp_cardinality' in self.ocrd_tool: assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], "Unexpected number of output file groups %d vs %s") + # verify input and output file groups in METS for input_file_grp in input_file_grps: - assert input_file_grp in self.workspace.mets.file_groups + assert input_file_grp in self.workspace.mets.file_groups, \ + f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}" + for output_file_grp in output_file_grps: + assert output_file_grp not in self.workspace.mets.file_groups \ + or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \ + or not any(self.workspace.mets.find_files( + pageId=self.page_id, fileGrp=output_file_grp)), \ + f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}" # keep this for backwards compatibility: return True From f66753ae1f2f35fe3a19642369c9cb96aba49058 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:47:14 +0200 Subject: [PATCH 191/249] run_processor: be robust if ocrd_tool is missing steps --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 22837e212..2cbbbd97e 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -89,7 +89,7 @@ def run_processor( ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) - otherrole = ocrd_tool['steps'][0] + otherrole = ocrd_tool.get('steps', [''])[0] logProfile = getLogger('ocrd.process.profile') log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() From eb12a809593e17f9329fc36b55cf9e1f99866e45 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:38:11 +0200 Subject: [PATCH 192/249] lib.bash: fix errexit --- src/ocrd/lib.bash | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 476b41096..b68829abe 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -27,8 +27,8 @@ ocrd__log () { ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { - local minversion_raw="$1" set -e + local minversion_raw="$1" local version_raw=$(ocrd --version|sed 's/ocrd, version //') local version_mmp=$(echo "$version_raw" | grep -Eo '([0-9]+\.?){3}') local version_prerelease_suffix="${version_raw#$version_mmp}" @@ -123,6 +123,7 @@ ocrd__usage () { ## declare -A ocrd__argv=() ## ``` ocrd__parse_argv () { + set -e # if [[ -n "$ZSH_VERSION" ]];then # print -r -- ${+ocrd__argv} ${(t)ocrd__argv} @@ -140,6 +141,7 @@ ocrd__parse_argv () { exit 1 fi + ocrd__argv[debug]=false ocrd__argv[overwrite]=false ocrd__argv[profile]=false ocrd__argv[profile_file]= @@ -170,6 +172,7 @@ ocrd__parse_argv () { -w|--working-dir) ocrd__argv[working_dir]=$(realpath "$2") ; shift ;; -m|--mets) ocrd__argv[mets_file]=$(realpath "$2") ; shift ;; -U|--mets-server-url) ocrd__argv[mets_server_url]="$2" ; shift ;; + --debug) ocrd__argv[debug]=true ;; --overwrite) ocrd__argv[overwrite]=true ;; --profile) ocrd__argv[profile]=true ;; --profile-file) ocrd__argv[profile_file]=$(realpath "$2") ; 
shift ;; @@ -265,6 +268,7 @@ $params_parsed" } ocrd__wrap () { + set -e declare -gx OCRD_TOOL_JSON="$1" declare -gx OCRD_TOOL_NAME="$2" From 3355ea49f1364483879fad8d0a5b6748ec0f44c7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:41:25 +0200 Subject: [PATCH 193/249] lib.bash input-files: pass on --mets-server-url, --overwrite, and parameters (necessary for required params) --- src/ocrd/lib.bash | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index b68829abe..00c4936c8 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -136,6 +136,10 @@ ocrd__parse_argv () { ocrd__raise "Must set \$params (declare -A params)" fi + if ! declare -p "params_json" >/dev/null 2>/dev/null ;then + ocrd__raise "Must set \$params_json (declare params_json)" + fi + if [[ $# = 0 ]];then ocrd__usage exit 1 @@ -264,6 +268,7 @@ ocrd__parse_argv () { $params_parsed" } eval "$params_parsed" + params_json="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params --json "${__parameters[@]}" "${__parameter_overrides[@]}")" } @@ -276,6 +281,7 @@ ocrd__wrap () { shift declare -Agx params params=() + declare -g params_json declare -Agx ocrd__argv ocrd__argv=() @@ -297,22 +303,26 @@ ocrd__wrap () { ocrd__parse_argv "$@" - i=0 - declare -ag ocrd__files=() - while read line; do - eval declare -Ag "ocrd__file$i=( $line )" - eval "ocrd__files[$i]=ocrd__file$i" - let ++i - done < <(ocrd bashlib input-files \ + declare -ag ocrd__files + IFS=$'\n' + ocrd__files=( $(ocrd bashlib input-files \ --ocrd-tool $OCRD_TOOL_JSON \ --executable $OCRD_TOOL_NAME \ + $(if [[ ${ocrd__argv[debug]} = true ]]; then echo --debug; fi) \ + $(if [[ ${ocrd__argv[overwrite]} = true ]]; then echo --overwrite; fi) \ -m "${ocrd__argv[mets_file]}" \ + -d "${ocrd__argv[working_dir]}" \ + ${ocrd__argv[mets_server_url]:+-U} ${ocrd__argv[mets_server_url]:-} \ + -p "$params_json" \ -I "${ocrd__argv[input_file_grp]}" \ -O "${ocrd__argv[output_file_grp]}" \ - ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) + ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) ) + IFS=$' \t\n' } ## usage: pageId=$(ocrd__input_file 3 pageId) ocrd__input_file() { - eval echo "\${${ocrd__files[$1]}[$2]}" + declare -A input_file + eval input_file=( "${ocrd__files[$1]}" ) + eval echo "${input_file[$2]}" } From c597de69b65ca1ef46723fe512c5303360572c9d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 22:29:38 +0200 Subject: [PATCH 194/249] add ocrd-filter processor --- pyproject.toml | 1 + .../processor/builtin/dummy/ocrd-tool.json | 50 +++++++ .../processor/builtin/filter_processor.py | 135 ++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 src/ocrd/processor/builtin/filter_processor.py diff --git a/pyproject.toml b/pyproject.toml index 5a081bb91..0e643c23a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ Issues = "https://github.com/OCR-D/core/issues" [project.scripts] ocrd = "ocrd.cli:cli" ocrd-dummy = "ocrd.processor.builtin.dummy_processor:cli" +ocrd-filter = "ocrd.processor.builtin.filter_processor:cli" [tool.setuptools] include-package-data = true diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index ef4a4810f..2f65f58ea 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -16,6 +16,56 @@ "description": "Whether to actually copy files (true) or just create PAGE-XML as 
a side effect (false)" } } + }, + "ocrd-filter": { + "executable": "ocrd-filter", + "description": "Bare-bones processor can be dynamically configured to remove segments based on XPath queries", + "steps": ["recognition/post-correction"], + "categories": ["Quality assurance"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "parameters": { + "type": { + "type": "string", + "default": "all", + "enum": [ + "all", + "region", + "line", + "word", + "glyph", + "NoiseRegion", + "LineDrawingRegion", + "AdvertRegion", + "ImageRegion", + "ChartRegion", + "MusicRegion", + "GraphicRegion", + "UnknownRegion", + "CustomRegion", + "SeparatorRegion", + "MathsRegion", + "TextRegion", + "MapRegion", + "ChemRegion", + "TableRegion", + "TextLine", + "Word", + "Glyph" + ], + "description": "Which type of segments to remove from. Either a precise element name ('TextRegion', 'TextLine') or an alias ('all', 'region', 'line', 'word', 'glyph')." + }, + "query": { + "type": "string", + "default": "", + "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'TextEquiv/@conf < 0.7'. Or low layout confidence, 'Coords/@conf < 0.7'.\nSupports extra predicates 'pixelarea()' for the number of pixels of the bounding box, and 'textequiv()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pixelarea(.) div string-length(textequiv(.)) > 500'." + }, + "plot": { + "type": "boolean", + "default": false, + "description": "Whether to extract an image for each filtered segment and write to the output fileGrp." + } + } } } } diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py new file mode 100644 index 000000000..b5c1fa9ad --- /dev/null +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -0,0 +1,135 @@ +# pylint: disable=missing-module-docstring,invalid-name +from typing import Optional + +from lxml import etree +import click + +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from ocrd_models.ocrd_file import OcrdFileType +from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_utils import ( + make_file_id, + MIME_TO_EXT, + MIMETYPE_PAGE, + xywh_from_points, + parse_json_string_with_comments, + resource_string, + config +) +from ocrd_modelfactory import page_from_file + +def pc_area(ctxt, node): + # FIXME find out why this gets passed as list + node = node[0] + coords = node.find(f'{node.prefix}:Coords', node.nsmap) + if coords is None: + return 0 + points = coords.attrib['points'] + xywh = xywh_from_points(points) + return xywh['w'] * xywh['h'] + +def pc_text(ctxt, node): + # FIXME find out why this gets passed as list + node = node[0] + equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) + if equiv is None: + return '' + string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) + if string is None: + return '' + return string.text + +class FilterProcessor(Processor): + + def setup(self): + ns = etree.FunctionNamespace(None) + ns['pixelarea'] = pc_area + # cannot use text() - conflicts with builtin fn + ns['textequiv'] = pc_text + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """ + Remove segments based on flexible selection criteria. 
+ + Open and deserialise PAGE input file, then iterate over the segment hierarchy + down to the level required for ``type``. + + Remove any segments of type ``type`` which also evaluate the XPath predicate ``query`` + to true (or non-empty). + + If ``plot`` is `true`, then extract and write an image file for all removed segments + to the output fileGrp (without reference to the PAGE). + + Produce a new PAGE output file by serialising the resulting hierarchy. + """ + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + root = pcgts.etree + NS = {'re': 'http://exslt.org/regular-expressions', + 'pc': root.nsmap[root.prefix], + root.prefix: root.nsmap[root.prefix]} + segtype = self.parameter['type'] + segpred = self.parameter['query'] + if segtype == 'region': + segments = pcgts.get_Page().get_AllRegions() + elif segtype == 'line': + segments = pcgts.get_Page().get_AllTextLines() + elif segtype == 'word': + lines = pcgts.get_Page().get_AllTextLines() + segments = [word for line in lines for word in line.get_Word() or []] + elif segtype == 'glyph': + lines = pcgts.get_Page().get_AllTextLines() + segments = [glyph for line in lines for word in line.get_Word() or [] for glyph in word.get_Glyph() or []] + else: + nodes = [node.attrib['id'] for node in pcgts.etree.xpath(f'//pc:{segtype}', namespaces=NS)] + regions = pcgts.get_Page().get_AllRegions() + textregions = [region for region in regions if region.original_tagname_ == 'TextRegion'] + lines = [line for region in textregions for line in region.get_TextLine() or []] + words = [word for line in lines for word in line.get_Word() or []] + glyphs = [glyph for word in words for glyph in word.get_Glyph() or []] + segments = [segment for segment in regions + lines + words + glyphs + if segment.id in nodes or segtype == 'all'] + if not(len(segments)): + self.logger.info("no matches") + return result + if self.parameter['plot']: + page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id) + for segment in segments: + node = pcgts.mapping[id(segment)] + if not segpred or node.xpath(segpred, namespaces=NS): + segtype = segment.original_tagname_ + self.logger.info("matched %s segment %s", segtype, segment.id) + parent = segment.parent_object_ + partype = parent.__class__.__name__.replace('Type', '') + if partype == 'Page': + getattr(parent, 'get_' + segtype)().remove(segment) + elif partype.endswith('Region'): + if segtype.endswith('Region'): + getattr(parent, 'get_' + segtype)().remove(segment) + else: + parent.TextLine.remove(segment) + elif partype == 'TextLine': + parent.Word.remove(segment) + elif partype == 'Word': + parent.Glyph.remove(segment) + else: + raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})") + segment.parent_object_ = None + if self.parameter['plot']: + segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) + result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) + return result + + @property + def metadata_filename(self): + return 'processor/builtin/dummy/ocrd-tool.json' + + @property + def executable(self): + return 'ocrd-filter' + +@click.command() +@ocrd_cli_options +def cli(*args, **kwargs): + return ocrd_cli_wrap_processor(FilterProcessor, *args, **kwargs) From 465ebdb128899f9bddde119d0319adfe71e9f13f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:26:32 +0200 Subject: [PATCH 195/249] ocrd-filter: also remove removed segments from ReadingOrder --- 
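Note: a minimal way to exercise the new behaviour (fileGrp names and query
are illustrative) would be:

    from ocrd import Resolver, Workspace, run_processor
    from ocrd.processor.builtin.filter_processor import FilterProcessor

    ws = Workspace(Resolver(), '.')
    run_processor(FilterProcessor, workspace=ws,
                  input_file_grp='OCR-D-SEG',
                  output_file_grp='OCR-D-SEG-FILTERED',
                  parameter={'type': 'TextRegion',
                             'query': "pc:Coords/@conf < 0.5"})

Afterwards, the output PAGE contains neither the matched regions nor any
dangling RegionRef/RegionRefIndexed entries for them in the ReadingOrder.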
src/ocrd/processor/builtin/dummy/ocrd-tool.json | 2 +- src/ocrd/processor/builtin/filter_processor.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index 2f65f58ea..97174a407 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -58,7 +58,7 @@ "query": { "type": "string", "default": "", - "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'TextEquiv/@conf < 0.7'. Or low layout confidence, 'Coords/@conf < 0.7'.\nSupports extra predicates 'pixelarea()' for the number of pixels of the bounding box, and 'textequiv()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pixelarea(.) div string-length(textequiv(.)) > 500'." + "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'pc:TextEquiv/@conf < 0.7'. Or low layout confidence, 'pc:Coords/@conf < 0.7'.\nSupports extra predicates 'pixelarea()' for the number of pixels of the bounding box, and 'textequiv()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pixelarea(.) div string-length(textequiv(.)) > 500'." }, "plot": { "type": "boolean", diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index b5c1fa9ad..70e4a0c3d 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -93,6 +93,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if not(len(segments)): self.logger.info("no matches") return result + rodict = pcgts.get_Page().get_ReadingOrderGroups() if self.parameter['plot']: page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id) for segment in segments: @@ -116,6 +117,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional else: raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})") segment.parent_object_ = None + if segtype.endswith('Region') and segment.id in rodict: + # remove from ReadingOrder as well + roelem = rodict[segment.id] + rorefs = getattr(roelem.parent_object_, roelem.__class__.__name__.replace('Type', '')) + rorefs.remove(roelem) + roelem.parent_object_ = None + del rodict[segment.id] if self.parameter['plot']: segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) From 6983fd6ee7e832b1a97ad3d61803a038485e3197 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 10 Sep 2024 19:16:05 +0200 Subject: [PATCH 196/249] ocrd-filter: register XPath functions under PAGE prefix/NS, precompile, avoid buggy lxml global registration mechanism --- .../processor/builtin/dummy/ocrd-tool.json | 2 +- .../processor/builtin/filter_processor.py | 169 ++++++++++++------ 2 files changed, 119 insertions(+), 52 deletions(-) diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index 97174a407..3d73169ec 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ 
-58,7 +58,7 @@ "query": { "type": "string", "default": "", - "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'pc:TextEquiv/@conf < 0.7'. Or low layout confidence, 'pc:Coords/@conf < 0.7'.\nSupports extra predicates 'pixelarea()' for the number of pixels of the bounding box, and 'textequiv()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pixelarea(.) div string-length(textequiv(.)) > 500'." + "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'pc:TextEquiv/@conf < 0.7'. Or low layout confidence, 'pc:Coords/@conf < 0.7'.\nSupports extra predicates 'pc:area()' for the number of pixels of the bounding box, and 'pc:text()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pc:area(.) div string-length(pc:text(.)) > 500'." }, "plot": { "type": "boolean", diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index 70e4a0c3d..1db9c0b13 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -8,6 +8,7 @@ from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_models.ocrd_file import OcrdFileType from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_models.constants import NAMESPACES from ocrd_utils import ( make_file_id, MIME_TO_EXT, @@ -19,34 +20,110 @@ ) from ocrd_modelfactory import page_from_file -def pc_area(ctxt, node): - # FIXME find out why this gets passed as list - node = node[0] - coords = node.find(f'{node.prefix}:Coords', node.nsmap) - if coords is None: - return 0 - points = coords.attrib['points'] - xywh = xywh_from_points(points) - return xywh['w'] * xywh['h'] - -def pc_text(ctxt, node): - # FIXME find out why this gets passed as list - node = node[0] - equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) - if equiv is None: - return '' - string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) - if string is None: - return '' - return string.text +def xpath(func, *, ns_uri: Optional[str] = None, ns_prefix: Optional[str] = ''): + ns = etree.FunctionNamespace(ns_uri) + if ns_prefix: + # FIXME: this crashes lxml (even with just a single thread) when called repeatedly + # we work around this by using the `extensions` kwarg to XPath init in setup() below + # (i.e. registerLocalFunctions instead of registerGlobalFunctions) + #ns.prefix = ns_prefix + raise NotImplementedError() + name = func.__name__.replace('_', '-') + if ns_prefix and name.startswith(ns_prefix): + name = name[len(ns_prefix):] + if name.startswith('-'): + name = name[1:] + ns[name] = func + return func -class FilterProcessor(Processor): +def pc_xpath(func): + return xpath(func, ns_uri=NAMESPACES['page'], ns_prefix='pc') + +#@pc_xpath +def pc_area(ctxt, nodes): + """ + Extract Coords/@points from all nodes, calculate the bounding + box, and accumulate areas. 
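+
+        E.g. ``pc:area(.)`` yields the bounding box area of the context node.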
+ """ + area = 0 + for node in nodes: + coords = node.find(f'{node.prefix}:Coords', node.nsmap) + if coords is None: + continue + points = coords.attrib['points'] + xywh = xywh_from_points(points) + area += xywh['w'] * xywh['h'] + return area + +#@pc_xpath +def pc_text(ctxt, nodes): + """ + Extract TextEquiv/Unicode from all nodes, then concatenate + (interspersed with spaces or newlines). + """ + text = '' + for node in nodes: + if text and node.tag.endswith('Region'): + text += '\n' + if text and node.tag.endswith('Line'): + text += '\n' + if text and node.tag.endswith('Word'): + text += ' ' + equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) + if equiv is None: + continue + string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) + if string is None: + continue + text += str(string.text) + return text +_SEGTYPES = [ + "NoiseRegion", + "LineDrawingRegion", + "AdvertRegion", + "ImageRegion", + "ChartRegion", + "MusicRegion", + "GraphicRegion", + "UnknownRegion", + "CustomRegion", + "SeparatorRegion", + "MathsRegion", + "TextRegion", + "MapRegion", + "ChemRegion", + "TableRegion", + "TextLine", + "Word", + "Glyph" +] + +class FilterProcessor(Processor): def setup(self): - ns = etree.FunctionNamespace(None) - ns['pixelarea'] = pc_area - # cannot use text() - conflicts with builtin fn - ns['textequiv'] = pc_text + NS = {'re': 'http://exslt.org/regular-expressions', + 'pc': NAMESPACES['page']} + extensions = {(NAMESPACES['page'], 'area'): pc_area, + (NAMESPACES['page'], 'text'): pc_text} + segtype = self.parameter['type'] + if segtype == 'all': + segtype = '|'.join('//pc:' + segtype for segtype in _SEGTYPES) + elif segtype == 'region': + segtype = '|'.join('//pc:' + segtype for segtype in _SEGTYPES if segtype.endswith('Region')) + elif segtype == 'line': + segtype = '//pc:TextLine' + elif segtype == 'word': + segtype = '//pc:Word' + elif segtype == 'glyph': + segtype = '//pc:Glyph' + else: + segtype = '//pc:' + segtype + self.segtypexpath = etree.XPath(segtype, namespaces=NS, extensions=extensions) + segpred = self.parameter['query'] + if segpred: + self.segpredxpath = etree.XPath(segpred, namespaces=NS, extensions=extensions) + else: + self.segpredxpath = lambda: True def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ @@ -65,31 +142,18 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional """ pcgts = input_pcgts[0] result = OcrdPageResult(pcgts) - root = pcgts.etree - NS = {'re': 'http://exslt.org/regular-expressions', - 'pc': root.nsmap[root.prefix], - root.prefix: root.nsmap[root.prefix]} - segtype = self.parameter['type'] - segpred = self.parameter['query'] - if segtype == 'region': - segments = pcgts.get_Page().get_AllRegions() - elif segtype == 'line': - segments = pcgts.get_Page().get_AllTextLines() - elif segtype == 'word': - lines = pcgts.get_Page().get_AllTextLines() - segments = [word for line in lines for word in line.get_Word() or []] - elif segtype == 'glyph': - lines = pcgts.get_Page().get_AllTextLines() - segments = [glyph for line in lines for word in line.get_Word() or [] for glyph in word.get_Glyph() or []] - else: - nodes = [node.attrib['id'] for node in pcgts.etree.xpath(f'//pc:{segtype}', namespaces=NS)] - regions = pcgts.get_Page().get_AllRegions() - textregions = [region for region in regions if region.original_tagname_ == 'TextRegion'] - lines = [line for region in textregions for line in region.get_TextLine() or []] - words = [word for line in lines for word in 
line.get_Word() or []] - glyphs = [glyph for word in words for glyph in word.get_Glyph() or []] - segments = [segment for segment in regions + lines + words + glyphs - if segment.id in nodes or segtype == 'all'] + nodes = [node.attrib['id'] for node in self.segtypexpath(pcgts.etree)] + if self.segtypexpath.error_log: + self.logger.error(self.segtypexpath.error_log) + # get PAGE objects from matching etree nodes + # FIXME: this should be easier (OcrdPage should have id lookup mechanism) + regions = pcgts.get_Page().get_AllRegions() + textregions = [region for region in regions if region.original_tagname_ == 'TextRegion'] + lines = [line for region in textregions for line in region.get_TextLine() or []] + words = [word for line in lines for word in line.get_Word() or []] + glyphs = [glyph for word in words for glyph in word.get_Glyph() or []] + segments = [segment for segment in regions + lines + words + glyphs + if segment.id in nodes] if not(len(segments)): self.logger.info("no matches") return result @@ -98,7 +162,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id) for segment in segments: node = pcgts.mapping[id(segment)] - if not segpred or node.xpath(segpred, namespaces=NS): + assert isinstance(node, etree._Element) + if self.segpredxpath(node): segtype = segment.original_tagname_ self.logger.info("matched %s segment %s", segtype, segment.id) parent = segment.parent_object_ @@ -127,6 +192,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if self.parameter['plot']: segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) + if self.segpredxpath.error_log: + self.logger.error(self.segpredxpath.error_log) return result @property From 62712364d3e38dd0ebfd324917515a30a07b461b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 00:00:26 +0200 Subject: [PATCH 197/249] ocrd-filter: simplify parameters (just 'select' instead of 'type' and 'query'), use 'elementpath.XPath2Parser.external_function' with global registration instead of 'etree.FunctionNamespace' with local extension --- requirements.txt | 1 + .../processor/builtin/dummy/ocrd-tool.json | 36 +---- .../processor/builtin/filter_processor.py | 125 ++++++++---------- 3 files changed, 58 insertions(+), 104 deletions(-) diff --git a/requirements.txt b/requirements.txt index e78c18661..1c14260ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ click >=7 cryptography < 43.0.0 Deprecated == 1.2.0 docker +elementpath fastapi>=0.78.0 filetype Flask diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index 3d73169ec..c79afcacb 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -25,40 +25,10 @@ "input_file_grp_cardinality": 1, "output_file_grp_cardinality": 1, "parameters": { - "type": { + "select": { "type": "string", - "default": "all", - "enum": [ - "all", - "region", - "line", - "word", - "glyph", - "NoiseRegion", - "LineDrawingRegion", - "AdvertRegion", - "ImageRegion", - "ChartRegion", - "MusicRegion", - "GraphicRegion", - "UnknownRegion", - "CustomRegion", - "SeparatorRegion", - "MathsRegion", - "TextRegion", - "MapRegion", - "ChemRegion", - "TableRegion", - "TextLine", - "Word", - "Glyph" - ], - "description":
"Which type of segments to remove from. Either a precise element name ('TextRegion', 'TextLine') or an alias ('all', 'region', 'line', 'word', 'glyph')." - }, - "query": { - "type": "string", - "default": "", - "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'pc:TextEquiv/@conf < 0.7'. Or low layout confidence, 'pc:Coords/@conf < 0.7'.\nSupports extra predicates 'pc:area()' for the number of pixels of the bounding box, and 'pc:text()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pc:area(.) div string-length(pc:text(.)) > 500'." + "default": "//*[ends-with(local-name(),'Region')]", + "description": "Which segments to select for removal. An XPath 2.0 query expression (path and optional predicates), with 'pc' as namespace prefix for PAGE-XML and our extension functions (see help text). Only selection of segment hierarchy elements is allowed (so e.g. `*` would be equivalent to `pc:NoiseRegion|pc:LineDrawingRegion|pc:AdvertRegion|pc:ImageRegion|pc:ChartRegion|pc:MusicRegion|pc:GraphicRegion|pc:UnknownRegion|pc:CustomRegion|pc:SeparatorRegion|pc:MathsRegion|pc:TextRegion|pc:MapRegion|pc:ChemRegion|pc:TableRegion|pc:TextLine|pc:Word|pc:Glyph`, but `pc:MetadataItem` or `pc:Border` or `pc:Coords` would not match).\nFor example, to remove words or glyphs with low text confidence, select '(pc:Word|pc:Glyph)[pc:TextEquiv/@conf < 0.7]'. Or low layout confidence, '*[pc:Coords/@conf < 0.7]'.\nTo remove high pixel-to-character rate, select '*[pc:pixelarea(.) div string-length(pc:textequiv(.)) > 10000]'." }, "plot": { "type": "boolean", diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index 1db9c0b13..a8beb09da 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -2,6 +2,7 @@ from typing import Optional from lxml import etree +import elementpath import click from ocrd import Processor, OcrdPageResult, OcrdPageResultImage @@ -20,33 +21,31 @@ ) from ocrd_modelfactory import page_from_file +PARSER = elementpath.XPath2Parser(namespaces={**NAMESPACES, 'pc': NAMESPACES['page']}) + def xpath(func, *, ns_uri: Optional[str] = None, ns_prefix: Optional[str] = ''): - ns = etree.FunctionNamespace(ns_uri) - if ns_prefix: - # FIXME: this crashes lxml (even with just a single thread) when called repeatedly - # we work around this by using the `extensions` kwarg to XPath init in setup() below - # (i.e. registerLocalFunctions instead of registerGlobalFunctions) - #ns.prefix = ns_prefix - raise NotImplementedError() name = func.__name__.replace('_', '-') if ns_prefix and name.startswith(ns_prefix): name = name[len(ns_prefix):] if name.startswith('-'): name = name[1:] - ns[name] = func + # register + PARSER.external_function(func, name=name, prefix=ns_prefix) return func def pc_xpath(func): return xpath(func, ns_uri=NAMESPACES['page'], ns_prefix='pc') -#@pc_xpath -def pc_area(ctxt, nodes): +@pc_xpath +def pc_pixelarea(nodes): """ Extract Coords/@points from all nodes, calculate the bounding box, and accumulate areas. 
""" area = 0 for node in nodes: + # FIXME: find out why we need to go to the parent here + node = node.parent.value coords = node.find(f'{node.prefix}:Coords', node.nsmap) if coords is None: continue @@ -55,14 +54,16 @@ def pc_area(ctxt, nodes): area += xywh['w'] * xywh['h'] return area -#@pc_xpath -def pc_text(ctxt, nodes): +@pc_xpath +def pc_textequiv(nodes): """ Extract TextEquiv/Unicode from all nodes, then concatenate (interspersed with spaces or newlines). """ text = '' for node in nodes: + # FIXME: find out why we need to go to the parent here + node = node.parent.value if text and node.tag.endswith('Region'): text += '\n' if text and node.tag.endswith('Line'): @@ -101,39 +102,26 @@ def pc_text(ctxt, nodes): class FilterProcessor(Processor): def setup(self): - NS = {'re': 'http://exslt.org/regular-expressions', - 'pc': NAMESPACES['page']} - extensions = {(NAMESPACES['page'], 'area'): pc_area, - (NAMESPACES['page'], 'text'): pc_text} - segtype = self.parameter['type'] - if segtype == 'all': - segtype = '|'.join('//pc:' + segtype for segtype in _SEGTYPES) - elif segtype == 'region': - segtype = '|'.join('//pc:' + segtype for segtype in _SEGTYPES if segtype.endswith('Region')) - elif segtype == 'line': - segtype = '//pc:TextLine' - elif segtype == 'word': - segtype = '//pc:Word' - elif segtype == 'glyph': - segtype = '//pc:Glyph' - else: - segtype = '//pc:' + segtype - self.segtypexpath = etree.XPath(segtype, namespaces=NS, extensions=extensions) - segpred = self.parameter['query'] - if segpred: - self.segpredxpath = etree.XPath(segpred, namespaces=NS, extensions=extensions) - else: - self.segpredxpath = lambda: True + token = PARSER.parse(self.parameter['select']) + def select(root): + context = elementpath.XPathContext(root) + return token.get_results(context) + self.selectxpath = select def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ - Remove segments based on flexible selection criteria. + Remove PAGE segment hierarchy elements based on flexible selection criteria. Open and deserialise PAGE input file, then iterate over the segment hierarchy - down to the level required for ``type``. + down to the level required for ``select`` (which could be multiple levels at once). + + Remove any segments matching XPath query ``select`` from that hierarchy (and from + the `ReadingOrder` if it is a region type). - Remove any segments of type ``type`` which also evaluate the XPath predicate ``query`` - to true (or non-empty). + \b + Besides full XPath 2.0 syntax, this supports extra predicates: + - `pc:pixelarea()` for the number of pixels of the bounding box (or sum area on node sets), + - `pc:textequiv()` for the first TextEquiv unicode string (or concatenated string on node sets). If ``plot`` is `true`, then extract and write an image file for all removed segments to the output fileGrp (without reference to the PAGE). 
@@ -142,9 +130,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional """ pcgts = input_pcgts[0] result = OcrdPageResult(pcgts) - nodes = [node.attrib['id'] for node in self.segtypexpath(pcgts.etree)] - if self.segtypexpath.error_log: - self.logger.error(self.segtypexpath.error_log) + nodes = [node.attrib['id'] for node in self.selectxpath(pcgts.etree) if 'id' in node.attrib] # get PAGE objects from matching etree nodes # FIXME: this should be easier (OcrdPage should have id lookup mechanism) regions = pcgts.get_Page().get_AllRegions() @@ -163,37 +149,34 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for segment in segments: node = pcgts.mapping[id(segment)] assert isinstance(node, etree._Element) - if self.segpredxpath(node): - segtype = segment.original_tagname_ - self.logger.info("matched %s segment %s", segtype, segment.id) - parent = segment.parent_object_ - partype = parent.__class__.__name__.replace('Type', '') - if partype == 'Page': + segtype = segment.original_tagname_ + self.logger.info("matched %s segment %s", segtype, segment.id) + parent = segment.parent_object_ + partype = parent.__class__.__name__.replace('Type', '') + if partype == 'Page': + getattr(parent, 'get_' + segtype)().remove(segment) + elif partype.endswith('Region'): + if segtype.endswith('Region'): getattr(parent, 'get_' + segtype)().remove(segment) - elif partype.endswith('Region'): - if segtype.endswith('Region'): - getattr(parent, 'get_' + segtype)().remove(segment) - else: - parent.TextLine.remove(segment) - elif partype == 'TextLine': - parent.Word.remove(segment) - elif partype == 'Word': - parent.Glyph.remove(segment) else: - raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})") - segment.parent_object_ = None - if segtype.endswith('Region') and segment.id in rodict: - # remove from ReadingOrder as well - roelem = rodict[segment.id] - rorefs = getattr(roelem.parent_object_, roelem.__class__.__name__.replace('Type', '')) - rorefs.remove(roelem) - roelem.parent_object_ = None - del rodict[segment.id] - if self.parameter['plot']: - segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) - result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) - if self.segpredxpath.error_log: - self.logger.error(self.segpredxpath.error_log) + parent.TextLine.remove(segment) + elif partype == 'TextLine': + parent.Word.remove(segment) + elif partype == 'Word': + parent.Glyph.remove(segment) + else: + raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})") + segment.parent_object_ = None + if segtype.endswith('Region') and segment.id in rodict: + # remove from ReadingOrder as well + roelem = rodict[segment.id] + rorefs = getattr(roelem.parent_object_, roelem.__class__.__name__.replace('Type', '')) + rorefs.remove(roelem) + roelem.parent_object_ = None + del rodict[segment.id] + if self.parameter['plot']: + segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) + result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) return result @property From f05f840b5ae68ffeda99444a217828e2a2ef9904 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:44:04 +0200 Subject: [PATCH 198/249] lib.bash input-files: do not try to validate tasks here (impossible to get right with required parameters, and now covered by wrapped Processor.verify() already) --- src/ocrd/lib.bash 
| 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 00c4936c8..52bde3025 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -249,17 +249,6 @@ ocrd__parse_argv () { trap showtime DEBUG fi - # check fileGrps - local _valopts=( --workspace "${ocrd__argv[working_dir]}" --mets-basename "$(basename ${ocrd__argv[mets_file]})" ) - if [[ ${ocrd__argv[overwrite]} = true ]]; then - _valopts+=( --overwrite ) - fi - if [[ -n "${ocrd__argv[page_id]:-}" ]]; then - _valopts+=( --page-id "${ocrd__argv[page_id]}" ) - fi - _valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]} ${__parameters[*]@Q} ${__parameter_overrides[*]@Q}" ) - ocrd validate tasks "${_valopts[@]}" || exit $? - # check parameters local params_parsed retval params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || { From 09cad0ff5434418536f68f11b91598db015786e2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 01:21:24 +0200 Subject: [PATCH 199/249] ocrd_models.OcrdPage: add XPath 2.0 parser and extended functions --- src/ocrd_models/ocrd_page.py | 15 +++++++++ src/ocrd_models/xpath_functions.py | 51 ++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 src/ocrd_models/xpath_functions.py diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 3f0cc690f..6a8ea4586 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -4,6 +4,7 @@ from io import StringIO from typing import Dict, Union from lxml import etree as ET +from elementpath import XPath2Parser, XPathContext __all__ = [ 'parse', @@ -132,6 +133,7 @@ ) from .constants import NAMESPACES +from .xpath_functions import pc_functions # add docstrings parse.__doc__ = ( @@ -195,6 +197,19 @@ def __init__( self.etree = etree self.mapping = mapping self.revmap = revmap + self.xpath_parser = XPath2Parser(namespaces={ + 'page': NAMESPACES['page'], + 'pc': NAMESPACES['page']}) + for func in pc_functions: + name = func.__name__.replace('_', '-') + if name.startswith('pc-'): + name = name[3:] + elif name.startswith('pc'): + name = name[2:] + # register + self.xpath_parser.external_function(func, name=name, prefix='pc') + self.xpath_context = XPathContext(self.etree) + self.xpath = lambda expression: self.xpath_parser.parse(expression).get_results(self.xpath_context) def __getattr__(self, name): return getattr(self._pcgts, name) diff --git a/src/ocrd_models/xpath_functions.py b/src/ocrd_models/xpath_functions.py new file mode 100644 index 000000000..c204811ca --- /dev/null +++ b/src/ocrd_models/xpath_functions.py @@ -0,0 +1,51 @@ +from ocrd_utils import xywh_from_points + +pc_functions = [] + +def _export(func): + pc_functions.append(func) + return func + +@_export +def pc_pixelarea(nodes): + """ + Extract Coords/@points from all nodes, calculate the bounding + box, and accumulate areas. + """ + area = 0 + for node in nodes: + # FIXME: find out why we need to go to the parent here + node = node.parent.value + coords = node.find(f'{node.prefix}:Coords', node.nsmap) + if coords is None: + continue + points = coords.attrib['points'] + xywh = xywh_from_points(points) + area += xywh['w'] * xywh['h'] + return area + +@_export +def pc_textequiv(nodes): + """ + Extract TextEquiv/Unicode from all nodes, then concatenate + (interspersed with spaces or newlines). 
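+    E.g. two pc:TextLine siblings with Unicode contents 'foo' and 'bar' yield 'foo\nbar'.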
+ """ + text = '' + for node in nodes: + # FIXME: find out why we need to go to the parent here + node = node.parent.value + if text and node.tag.endswith('Region'): + text += '\n' + if text and node.tag.endswith('Line'): + text += '\n' + if text and node.tag.endswith('Word'): + text += ' ' + equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) + if equiv is None: + continue + string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) + if string is None: + continue + text += str(string.text) + return text + From b5c11919c03db90e302a160ff9f28229e55f12dc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:46:31 +0200 Subject: [PATCH 200/249] Processor / Workspace.add_file: always force if config.OCRD_EXISTING_OUTPUT==OVERWRITE --- src/ocrd/processor/base.py | 3 --- src/ocrd/processor/builtin/dummy_processor.py | 2 -- src/ocrd/workspace.py | 3 +++ 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d669d29f6..26ea532d1 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -590,7 +590,6 @@ def _copy_page_file(self, input_file : OcrdFileType) -> None: local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(input_pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: @@ -643,7 +642,6 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: self.output_file_grp, page_id=page_id, file_path=image_file_path, - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) @@ -654,7 +652,6 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(result.pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index a5f217a15..72a260968 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -47,7 +47,6 @@ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: mimetype=input_file.mimetype, local_filename=local_filename, content=f.read(), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) @@ -62,7 +61,6 @@ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: local_filename=join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) else: if self.parameter['copy_files']: diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 4a99a112c..3cbc58c78 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -19,6 +19,7 @@ from ocrd_modelfactory import exif_from_filename, page_from_file from ocrd_utils import ( atomic_write, + config, getLogger, image_from_polygon, coordinates_of_segment, @@ -427,6 +428,8 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi kwargs["pageId"] = kwargs.pop("page_id") if "file_id" in kwargs: kwargs["ID"] = kwargs.pop("file_id") + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + 
kwargs["force"] = True ret = self.mets.add_file(file_grp, **kwargs) From 634384931ca758f82a949f722fdb24d6c5ae0d2f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 01:22:07 +0200 Subject: [PATCH 201/249] ocrd-filter: adapt (just delegate to OcrdPage.xpath) --- src/ocrd/processor/builtin/dummy_processor.py | 3 - .../processor/builtin/filter_processor.py | 105 +----------------- 2 files changed, 4 insertions(+), 104 deletions(-) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 72a260968..bf7e2940b 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -13,9 +13,6 @@ make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, - parse_json_string_with_comments, - resource_string, - config ) from ocrd_modelfactory import page_from_file diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index a8beb09da..10b5572c3 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -2,112 +2,13 @@ from typing import Optional from lxml import etree -import elementpath import click from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_models.ocrd_file import OcrdFileType -from ocrd_models.ocrd_page import OcrdPage, to_xml -from ocrd_models.constants import NAMESPACES -from ocrd_utils import ( - make_file_id, - MIME_TO_EXT, - MIMETYPE_PAGE, - xywh_from_points, - parse_json_string_with_comments, - resource_string, - config -) -from ocrd_modelfactory import page_from_file - -PARSER = elementpath.XPath2Parser(namespaces={**NAMESPACES, 'pc': NAMESPACES['page']}) - -def xpath(func, *, ns_uri: Optional[str] = None, ns_prefix: Optional[str] = ''): - name = func.__name__.replace('_', '-') - if ns_prefix and name.startswith(ns_prefix): - name = name[len(ns_prefix):] - if name.startswith('-'): - name = name[1:] - # register - PARSER.external_function(func, name=name, prefix=ns_prefix) - return func - -def pc_xpath(func): - return xpath(func, ns_uri=NAMESPACES['page'], ns_prefix='pc') - -@pc_xpath -def pc_pixelarea(nodes): - """ - Extract Coords/@points from all nodes, calculate the bounding - box, and accumulate areas. - """ - area = 0 - for node in nodes: - # FIXME: find out why we need to go to the parent here - node = node.parent.value - coords = node.find(f'{node.prefix}:Coords', node.nsmap) - if coords is None: - continue - points = coords.attrib['points'] - xywh = xywh_from_points(points) - area += xywh['w'] * xywh['h'] - return area - -@pc_xpath -def pc_textequiv(nodes): - """ - Extract TextEquiv/Unicode from all nodes, then concatenate - (interspersed with spaces or newlines). 
- """ - text = '' - for node in nodes: - # FIXME: find out why we need to go to the parent here - node = node.parent.value - if text and node.tag.endswith('Region'): - text += '\n' - if text and node.tag.endswith('Line'): - text += '\n' - if text and node.tag.endswith('Word'): - text += ' ' - equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) - if equiv is None: - continue - string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) - if string is None: - continue - text += str(string.text) - return text - -_SEGTYPES = [ - "NoiseRegion", - "LineDrawingRegion", - "AdvertRegion", - "ImageRegion", - "ChartRegion", - "MusicRegion", - "GraphicRegion", - "UnknownRegion", - "CustomRegion", - "SeparatorRegion", - "MathsRegion", - "TextRegion", - "MapRegion", - "ChemRegion", - "TableRegion", - "TextLine", - "Word", - "Glyph" -] +from ocrd_models import OcrdPage class FilterProcessor(Processor): - def setup(self): - token = PARSER.parse(self.parameter['select']) - def select(root): - context = elementpath.XPathContext(root) - return token.get_results(context) - self.selectxpath = select - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Remove PAGE segment hierarchy elements based on flexible selection criteria. @@ -130,7 +31,9 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional """ pcgts = input_pcgts[0] result = OcrdPageResult(pcgts) - nodes = [node.attrib['id'] for node in self.selectxpath(pcgts.etree) if 'id' in node.attrib] + nodes = [node.attrib['id'] + for node in pcgts.xpath(self.parameter['select']) + if 'id' in node.attrib] # get PAGE objects from matching etree nodes # FIXME: this should be easier (OcrdPage should have id lookup mechanism) regions = pcgts.get_Page().get_AllRegions() From cbe465aabb5a86c07a9120ae07c0721d31b9779f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 01:02:19 +0200 Subject: [PATCH 202/249] test processors: no need for 'force' kwarg anymore --- tests/data/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 4dcf29fa0..56779a611 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -101,7 +101,6 @@ def process(self): mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT', - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) class DummyProcessorWithOutputSleep(Processor): @@ -157,7 +156,6 @@ def process_page_file(self, input_file): local_filename=os.path.join(self.output_file_grp, output_file_id), mimetype=input_file.mimetype, content='CONTENT', - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) class DummyProcessorWithOutputLegacy(Processor): @@ -181,7 +179,6 @@ def process(self): mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT', - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) class IncompleteProcessor(Processor): From c47ae77cf57b2fbe4227cc3c9541de5a7d0f1031 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 23:50:45 +0200 Subject: [PATCH 203/249] OcrdPage: migrate to newest generateds, adapt user methods, re-generate --- requirements_test.txt | 2 +- src/ocrd_models/ocrd_page_generateds.py | 4251 +++++++++++------ src/ocrd_page_user_methods.py | 2 +- ...upType.py => _exportChildren_GroupType.py} | 11 +- 4 files changed, 2823 insertions(+), 1443 deletions(-) rename src/ocrd_page_user_methods/{exportChildren_GroupType.py => 
_exportChildren_GroupType.py} (65%) diff --git a/requirements_test.txt b/requirements_test.txt index a6a87918f..585bb5395 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,7 +1,7 @@ autopep8 cryptography < 43.0.0 pytest >= 4.0.0 -generateDS == 2.35.20 +generateDS == 2.44.1 pytest-benchmark >= 3.2.3 pytest-timeout coverage >= 4.5.2 diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py index f2b7c0551..97d5a800b 100644 --- a/src/ocrd_models/ocrd_page_generateds.py +++ b/src/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20. +# Generated Sun Sep 15 21:49:27 2024 by generateDS.py version 2.44.1. # Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0] # # Command line options: @@ -24,21 +24,23 @@ # core # +import sys +try: + ModulenotfoundExp_ = ModuleNotFoundError +except NameError: + ModulenotfoundExp_ = ImportError from itertools import zip_longest import os -import sys import re as re_ import base64 import datetime as datetime_ import decimal as decimal_ -try: - from lxml import etree as etree_ -except ImportError: - from xml.etree import ElementTree as etree_ +from lxml import etree as etree_ Validate_simpletypes_ = True SaveElementTreeNode = True +TagNamePrefix = "" if sys.version_info.major == 2: BaseStrType_ = basestring else: @@ -97,7 +99,7 @@ def parsexmlstring_(instring, parser=None, **kwargs): # Additionally, the generatedsnamespaces module can contain a python # dictionary named GenerateDSNamespaceTypePrefixes that associates element # types with the namespace prefixes that are to be added to the -# "xsi:type" attribute value. See the exportAttributes method of +# "xsi:type" attribute value. See the _exportAttributes method of # any generated element type and the generation of "xsi:type" for an # example of the use of this table. 
# An example table: @@ -112,11 +114,11 @@ def parsexmlstring_(instring, parser=None, **kwargs): try: from generatedsnamespaces import GenerateDSNamespaceDefs as GenerateDSNamespaceDefs_ -except ImportError: +except ModulenotfoundExp_ : GenerateDSNamespaceDefs_ = {} try: from generatedsnamespaces import GenerateDSNamespaceTypePrefixes as GenerateDSNamespaceTypePrefixes_ -except ImportError: +except ModulenotfoundExp_ : GenerateDSNamespaceTypePrefixes_ = {} # @@ -127,7 +129,7 @@ def parsexmlstring_(instring, parser=None, **kwargs): # try: from generatedscollector import GdsCollector as GdsCollector_ -except ImportError: +except ModulenotfoundExp_ : class GdsCollector_(object): @@ -161,7 +163,7 @@ def write_messages(self, outstream): try: from enum import Enum -except ImportError: +except ModulenotfoundExp_ : Enum = object # @@ -174,7 +176,7 @@ def write_messages(self, outstream): class GeneratedsSuper(object): __hash__ = object.__hash__ - tzoff_pattern = re_.compile(r'(\+|-)((0\d|1[0-3]):[0-5]\d|14:00)$') + tzoff_pattern = re_.compile('(\\+|-)((0[0-9]|1[0-3]):[0-5][0-9]|14:00)$') class _FixedOffsetTZ(datetime_.tzinfo): def __init__(self, offset, name): self.__offset = datetime_.timedelta(minutes=offset) @@ -185,6 +187,33 @@ def tzname(self, dt): return self.__name def dst(self, dt): return None + def __str__(self): + settings = { + 'str_pretty_print': True, + 'str_indent_level': 0, + 'str_namespaceprefix': '', + 'str_name': self.__class__.__name__, + 'str_namespacedefs': '', + } + for n in settings: + if hasattr(self, n): + settings[n] = getattr(self, n) + if sys.version_info.major == 2: + from StringIO import StringIO + else: + from io import StringIO + output = StringIO() + self.export( + output, + settings['str_indent_level'], + pretty_print=settings['str_pretty_print'], + namespaceprefix_=settings['str_namespaceprefix'], + name_=settings['str_name'], + namespacedef_=settings['str_namespacedefs'] + ) + strval = output.getvalue() + output.close() + return strval def gds_format_string(self, input_data, input_name=''): return input_data def gds_parse_string(self, input_data, node=None, input_name=''): @@ -195,11 +224,11 @@ def gds_validate_string(self, input_data, node=None, input_name=''): else: return input_data def gds_format_base64(self, input_data, input_name=''): - return base64.b64encode(input_data) + return base64.b64encode(input_data).decode('ascii') def gds_validate_base64(self, input_data, node=None, input_name=''): return input_data def gds_format_integer(self, input_data, input_name=''): - return '%d' % input_data + return '%d' % int(input_data) def gds_parse_integer(self, input_data, node=None, input_name=''): try: ival = int(input_data) @@ -213,6 +242,8 @@ def gds_validate_integer(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires integer value') return value def gds_format_integer_list(self, input_data, input_name=''): + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] return '%s' % ' '.join(input_data) def gds_validate_integer_list( self, input_data, node=None, input_name=''): @@ -221,10 +252,14 @@ def gds_validate_integer_list( try: int(value) except (TypeError, ValueError): - raise_parse_error(node, 'Requires sequence of integer valuess') + raise_parse_error(node, 'Requires sequence of integer values') return values def gds_format_float(self, input_data, input_name=''): - return ('%.15f' % input_data).rstrip('0') + value = ('%.15f' % float(input_data)).rstrip('0') + if 
value.endswith('.'): + value += '0' + return value + def gds_parse_float(self, input_data, node=None, input_name=''): try: fval_ = float(input_data) @@ -238,6 +273,8 @@ def gds_validate_float(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires float value') return value def gds_format_float_list(self, input_data, input_name=''): + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] return '%s' % ' '.join(input_data) def gds_validate_float_list( self, input_data, node=None, input_name=''): @@ -249,7 +286,12 @@ def gds_validate_float_list( raise_parse_error(node, 'Requires sequence of float values') return values def gds_format_decimal(self, input_data, input_name=''): - return ('%s' % input_data).rstrip('0') + return_value = '%s' % input_data + if '.' in return_value: + return_value = return_value.rstrip('0') + if return_value.endswith('.'): + return_value = return_value.rstrip('.') + return return_value def gds_parse_decimal(self, input_data, node=None, input_name=''): try: decimal_value = decimal_.Decimal(input_data) @@ -263,7 +305,9 @@ def gds_validate_decimal(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires decimal value') return value def gds_format_decimal_list(self, input_data, input_name=''): - return '%s' % ' '.join(input_data) + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] + return ' '.join([self.gds_format_decimal(item) for item in input_data]) def gds_validate_decimal_list( self, input_data, node=None, input_name=''): values = input_data.split() @@ -274,7 +318,7 @@ def gds_validate_decimal_list( raise_parse_error(node, 'Requires sequence of decimal values') return values def gds_format_double(self, input_data, input_name=''): - return '%e' % input_data + return '%s' % input_data def gds_parse_double(self, input_data, node=None, input_name=''): try: fval_ = float(input_data) @@ -288,6 +332,8 @@ def gds_validate_double(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires double or float value') return value def gds_format_double_list(self, input_data, input_name=''): + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] return '%s' % ' '.join(input_data) def gds_validate_double_list( self, input_data, node=None, input_name=''): @@ -302,6 +348,7 @@ def gds_validate_double_list( def gds_format_boolean(self, input_data, input_name=''): return ('%s' % input_data).lower() def gds_parse_boolean(self, input_data, node=None, input_name=''): + input_data = input_data.strip() if input_data in ('true', '1'): bval = True elif input_data in ('false', '0'): @@ -317,11 +364,14 @@ def gds_validate_boolean(self, input_data, node=None, input_name=''): '(one of True, 1, False, 0)') return input_data def gds_format_boolean_list(self, input_data, input_name=''): + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] return '%s' % ' '.join(input_data) def gds_validate_boolean_list( self, input_data, node=None, input_name=''): values = input_data.split() for value in values: + value = self.gds_parse_boolean(value, node, input_name) if value not in (True, 1, False, 0, ): raise_parse_error( node, @@ -478,6 +528,7 @@ def gds_validate_simple_patterns(self, patterns, target): # The target value must match at least one of the patterns # in order for the test to succeed. 
found1 = True + target = str(target) for patterns1 in patterns: found2 = False for patterns2 in patterns1: @@ -563,7 +614,7 @@ def get_path_(self, node): path_list.reverse() path = '/'.join(path_list) return path - Tag_strip_pattern_ = re_.compile(r'\{.*\}') + Tag_strip_pattern_ = re_.compile(r'{.*}') def get_path_list_(self, node, path_list): if node is None: return @@ -723,6 +774,7 @@ def quote_attrib(inStr): s1 = s1.replace('&', '&') s1 = s1.replace('<', '<') s1 = s1.replace('>', '>') + s1 = s1.replace('\n', ' ') if '"' in s1: if "'" in s1: s1 = '"%s"' % s1.replace('"', """) @@ -768,7 +820,10 @@ def find_attr_value_(attr_name, node): value = attrs.get(attr_name) elif len(attr_parts) == 2: prefix, name = attr_parts - namespace = node.nsmap.get(prefix) + if prefix == 'xml': + namespace = 'http://www.w3.org/XML/1998/namespace' + else: + namespace = node.nsmap.get(prefix) if namespace is not None: value = attrs.get('{%s}%s' % (namespace, name, )) return value @@ -849,7 +904,7 @@ def exportSimple(self, outfile, level, name): self.name, base64.b64encode(self.value), self.name)) - def to_etree(self, element, mapping_=None, nsmap_=None): + def to_etree(self, element, mapping_=None, reverse_mapping_=None, nsmap_=None): if self.category == MixedContainer.CategoryText: # Prevent exporting empty content as empty lines. if self.value.strip(): @@ -869,7 +924,7 @@ def to_etree(self, element, mapping_=None, nsmap_=None): subelement.text = self.to_etree_simple() else: # category == MixedContainer.CategoryComplex self.value.to_etree(element) - def to_etree_simple(self, mapping_=None, nsmap_=None): + def to_etree_simple(self, mapping_=None, reverse_mapping_=None, nsmap_=None): if self.content_type == MixedContainer.TypeString: text = self.value elif (self.content_type == MixedContainer.TypeInteger or @@ -942,11 +997,10 @@ def _cast(typ, value): return value return typ(value) + # -# Data representation classes. 
+# Start enum classes # - - class AlignSimpleType(str, Enum): LEFT='left' CENTRE='centre' @@ -1013,6 +1067,200 @@ class GroupTypeSimpleType(str, Enum): OTHER='other' +class LanguageSimpleType(str, Enum): + """LanguageSimpleType -- ISO 639.x 2016-07-14 + + """ + ABKHAZ='Abkhaz' + AFAR='Afar' + AFRIKAANS='Afrikaans' + AKAN='Akan' + ALBANIAN='Albanian' + AMHARIC='Amharic' + ARABIC='Arabic' + ARAGONESE='Aragonese' + ARMENIAN='Armenian' + ASSAMESE='Assamese' + AVARIC='Avaric' + AVESTAN='Avestan' + AYMARA='Aymara' + AZERBAIJANI='Azerbaijani' + BAMBARA='Bambara' + BASHKIR='Bashkir' + BASQUE='Basque' + BELARUSIAN='Belarusian' + BENGALI='Bengali' + BIHARI='Bihari' + BISLAMA='Bislama' + BOSNIAN='Bosnian' + BRETON='Breton' + BULGARIAN='Bulgarian' + BURMESE='Burmese' + CAMBODIAN='Cambodian' + CANTONESE='Cantonese' + CATALAN='Catalan' + CHAMORRO='Chamorro' + CHECHEN='Chechen' + CHICHEWA='Chichewa' + CHINESE='Chinese' + CHUVASH='Chuvash' + CORNISH='Cornish' + CORSICAN='Corsican' + CREE='Cree' + CROATIAN='Croatian' + CZECH='Czech' + DANISH='Danish' + DIVEHI='Divehi' + DUTCH='Dutch' + DZONGKHA='Dzongkha' + ENGLISH='English' + ESPERANTO='Esperanto' + ESTONIAN='Estonian' + EWE='Ewe' + FAROESE='Faroese' + FIJIAN='Fijian' + FINNISH='Finnish' + FRENCH='French' + FULA='Fula' + GAELIC='Gaelic' + GALICIAN='Galician' + GANDA='Ganda' + GEORGIAN='Georgian' + GERMAN='German' + GREEK='Greek' + GUARANÍ='Guaraní' + GUJARATI='Gujarati' + HAITIAN='Haitian' + HAUSA='Hausa' + HEBREW='Hebrew' + HERERO='Herero' + HINDI='Hindi' + HIRI_MOTU='Hiri Motu' + HUNGARIAN='Hungarian' + ICELANDIC='Icelandic' + IDO='Ido' + IGBO='Igbo' + INDONESIAN='Indonesian' + INTERLINGUA='Interlingua' + INTERLINGUE='Interlingue' + INUKTITUT='Inuktitut' + INUPIAQ='Inupiaq' + IRISH='Irish' + ITALIAN='Italian' + JAPANESE='Japanese' + JAVANESE='Javanese' + KALAALLISUT='Kalaallisut' + KANNADA='Kannada' + KANURI='Kanuri' + KASHMIRI='Kashmiri' + KAZAKH='Kazakh' + KHMER='Khmer' + KIKUYU='Kikuyu' + KINYARWANDA='Kinyarwanda' + KIRUNDI='Kirundi' + KOMI='Komi' + KONGO='Kongo' + KOREAN='Korean' + KURDISH='Kurdish' + KWANYAMA='Kwanyama' + KYRGYZ='Kyrgyz' + LAO='Lao' + LATIN='Latin' + LATVIAN='Latvian' + LIMBURGISH='Limburgish' + LINGALA='Lingala' + LITHUANIAN='Lithuanian' + LUBA_KATANGA='Luba-Katanga' + LUXEMBOURGISH='Luxembourgish' + MACEDONIAN='Macedonian' + MALAGASY='Malagasy' + MALAY='Malay' + MALAYALAM='Malayalam' + MALTESE='Maltese' + MANX='Manx' + MĀORI='Māori' + MARATHI='Marathi' + MARSHALLESE='Marshallese' + MONGOLIAN='Mongolian' + NAURU='Nauru' + NAVAJO='Navajo' + NDONGA='Ndonga' + NEPALI='Nepali' + NORTH_NDEBELE='North Ndebele' + NORTHERN_SAMI='Northern Sami' + NORWEGIAN='Norwegian' + NORWEGIAN_BOKMÅL='Norwegian Bokmål' + NORWEGIAN_NYNORSK='Norwegian Nynorsk' + NUOSU='Nuosu' + OCCITAN='Occitan' + OJIBWE='Ojibwe' + OLD_CHURCH_SLAVONIC='Old Church Slavonic' + ORIYA='Oriya' + OROMO='Oromo' + OSSETIAN='Ossetian' + PĀLI='Pāli' + PANJABI='Panjabi' + PASHTO='Pashto' + PERSIAN='Persian' + POLISH='Polish' + PORTUGUESE='Portuguese' + PUNJABI='Punjabi' + QUECHUA='Quechua' + ROMANIAN='Romanian' + ROMANSH='Romansh' + RUSSIAN='Russian' + SAMOAN='Samoan' + SANGO='Sango' + SANSKRIT='Sanskrit' + SARDINIAN='Sardinian' + SERBIAN='Serbian' + SHONA='Shona' + SINDHI='Sindhi' + SINHALA='Sinhala' + SLOVAK='Slovak' + SLOVENE='Slovene' + SOMALI='Somali' + SOUTH_NDEBELE='South Ndebele' + SOUTHERN_SOTHO='Southern Sotho' + SPANISH='Spanish' + SUNDANESE='Sundanese' + SWAHILI='Swahili' + SWATI='Swati' + SWEDISH='Swedish' + TAGALOG='Tagalog' + TAHITIAN='Tahitian' + TAJIK='Tajik' + 
TAMIL='Tamil' + TATAR='Tatar' + TELUGU='Telugu' + THAI='Thai' + TIBETAN='Tibetan' + TIGRINYA='Tigrinya' + TONGA='Tonga' + TSONGA='Tsonga' + TSWANA='Tswana' + TURKISH='Turkish' + TURKMEN='Turkmen' + TWI='Twi' + UIGHUR='Uighur' + UKRAINIAN='Ukrainian' + URDU='Urdu' + UZBEK='Uzbek' + VENDA='Venda' + VIETNAMESE='Vietnamese' + VOLAPÜK='Volapük' + WALLOON='Walloon' + WELSH='Welsh' + WESTERN_FRISIAN='Western Frisian' + WOLOF='Wolof' + XHOSA='Xhosa' + YIDDISH='Yiddish' + YORUBA='Yoruba' + ZHUANG='Zhuang' + ZULU='Zulu' + OTHER='other' + + class PageTypeSimpleType(str, Enum): FRONTCOVER='front-cover' BACKCOVER='back-cover' @@ -1025,7 +1273,9 @@ class PageTypeSimpleType(str, Enum): class ProductionSimpleType(str, Enum): - """Text production type""" + """ProductionSimpleType -- Text production type + + """ PRINTED='printed' TYPEWRITTEN='typewritten' HANDWRITTENCURSIVE='handwritten-cursive' @@ -1041,6 +1291,193 @@ class ReadingDirectionSimpleType(str, Enum): BOTTOMTOTOP='bottom-to-top' +class ScriptSimpleType(str, Enum): + """ScriptSimpleType -- iso15924 2016-07-14 + + """ + ADLM_ADLAM='Adlm - Adlam' + AFAK_AFAKA='Afak - Afaka' + AGHB_CAUCASIAN_ALBANIAN='Aghb - Caucasian Albanian' + AHOM_AHOM_TAI_AHOM='Ahom - Ahom, Tai Ahom' + ARAB_ARABIC='Arab - Arabic' + ARAN_ARABIC_NASTALIQVARIANT='Aran - Arabic (Nastaliq variant)' + ARMI_IMPERIAL_ARAMAIC='Armi - Imperial Aramaic' + ARMN_ARMENIAN='Armn - Armenian' + AVST_AVESTAN='Avst - Avestan' + BALI_BALINESE='Bali - Balinese' + BAMU_BAMUM='Bamu - Bamum' + BASS_BASSA_VAH='Bass - Bassa Vah' + BATK_BATAK='Batk - Batak' + BENG_BENGALI='Beng - Bengali' + BHKS_BHAIKSUKI='Bhks - Bhaiksuki' + BLIS_BLISSYMBOLS='Blis - Blissymbols' + BOPO_BOPOMOFO='Bopo - Bopomofo' + BRAH_BRAHMI='Brah - Brahmi' + BRAI_BRAILLE='Brai - Braille' + BUGI_BUGINESE='Bugi - Buginese' + BUHD_BUHID='Buhd - Buhid' + CAKM_CHAKMA='Cakm - Chakma' + CANS_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS='Cans - Unified Canadian Aboriginal Syllabics' + CARI_CARIAN='Cari - Carian' + CHAM_CHAM='Cham - Cham' + CHER_CHEROKEE='Cher - Cherokee' + CIRT_CIRTH='Cirt - Cirth' + COPT_COPTIC='Copt - Coptic' + CPRT_CYPRIOT='Cprt - Cypriot' + CYRL_CYRILLIC='Cyrl - Cyrillic' + CYRS_CYRILLIC_OLD_CHURCH_SLAVONICVARIANT='Cyrs - Cyrillic (Old Church Slavonic variant)' + DEVA_DEVANAGARI_NAGARI='Deva - Devanagari (Nagari)' + DSRT_DESERET_MORMON='Dsrt - Deseret (Mormon)' + DUPL_DUPLOYANSHORTHAND_DUPLOYANSTENOGRAPHY='Dupl - Duployan shorthand, Duployan stenography' + EGYD_EGYPTIANDEMOTIC='Egyd - Egyptian demotic' + EGYH_EGYPTIANHIERATIC='Egyh - Egyptian hieratic' + EGYP_EGYPTIANHIEROGLYPHS='Egyp - Egyptian hieroglyphs' + ELBA_ELBASAN='Elba - Elbasan' + ETHI_ETHIOPIC='Ethi - Ethiopic' + GEOK_KHUTSURI_ASOMTAVRULIAND_NUSKHURI='Geok - Khutsuri (Asomtavruli and Nuskhuri)' + GEOR_GEORGIAN_MKHEDRULI='Geor - Georgian (Mkhedruli)' + GLAG_GLAGOLITIC='Glag - Glagolitic' + GOTH_GOTHIC='Goth - Gothic' + GRAN_GRANTHA='Gran - Grantha' + GREK_GREEK='Grek - Greek' + GUJR_GUJARATI='Gujr - Gujarati' + GURU_GURMUKHI='Guru - Gurmukhi' + HANB_HANWITH_BOPOMOFO='Hanb - Han with Bopomofo' + HANG_HANGUL='Hang - Hangul' + HANI_HAN_HANZI_KANJI_HANJA='Hani - Han (Hanzi, Kanji, Hanja)' + HANO_HANUNOO_HANUNÓO='Hano - Hanunoo (Hanunóo)' + HANS_HAN_SIMPLIFIEDVARIANT='Hans - Han (Simplified variant)' + HANT_HAN_TRADITIONALVARIANT='Hant - Han (Traditional variant)' + HATR_HATRAN='Hatr - Hatran' + HEBR_HEBREW='Hebr - Hebrew' + HIRA_HIRAGANA='Hira - Hiragana' + HLUW_ANATOLIAN_HIEROGLYPHS='Hluw - Anatolian Hieroglyphs' + HMNG_PAHAWH_HMONG='Hmng - Pahawh Hmong' + 
HRKT_JAPANESESYLLABARIES='Hrkt - Japanese syllabaries' + HUNG_OLD_HUNGARIAN_HUNGARIAN_RUNIC='Hung - Old Hungarian (Hungarian Runic)' + INDS_INDUS_HARAPPAN='Inds - Indus (Harappan)' + ITAL_OLD_ITALIC_ETRUSCAN_OSCANETC='Ital - Old Italic (Etruscan, Oscan etc.)' + JAMO_JAMO='Jamo - Jamo' + JAVA_JAVANESE='Java - Javanese' + JPAN_JAPANESE='Jpan - Japanese' + JURC_JURCHEN='Jurc - Jurchen' + KALI_KAYAH_LI='Kali - Kayah Li' + KANA_KATAKANA='Kana - Katakana' + KHAR_KHAROSHTHI='Khar - Kharoshthi' + KHMR_KHMER='Khmr - Khmer' + KHOJ_KHOJKI='Khoj - Khojki' + KITL_KHITANLARGESCRIPT='Kitl - Khitan large script' + KITS_KHITANSMALLSCRIPT='Kits - Khitan small script' + KNDA_KANNADA='Knda - Kannada' + KORE_KOREANALIASFOR_HANGUL_HAN='Kore - Korean (alias for Hangul + Han)' + KPEL_KPELLE='Kpel - Kpelle' + KTHI_KAITHI='Kthi - Kaithi' + LANA_TAI_THAM_LANNA='Lana - Tai Tham (Lanna)' + LAOO_LAO='Laoo - Lao' + LATF_LATIN_FRAKTURVARIANT='Latf - Latin (Fraktur variant)' + LATG_LATIN_GAELICVARIANT='Latg - Latin (Gaelic variant)' + LATN_LATIN='Latn - Latin' + LEKE_LEKE='Leke - Leke' + LEPC_LEPCHARÓNG='Lepc - Lepcha (Róng)' + LIMB_LIMBU='Limb - Limbu' + LINA_LINEARA='Lina - Linear A' + LINB_LINEARB='Linb - Linear B' + LISU_LISU_FRASER='Lisu - Lisu (Fraser)' + LOMA_LOMA='Loma - Loma' + LYCI_LYCIAN='Lyci - Lycian' + LYDI_LYDIAN='Lydi - Lydian' + MAHJ_MAHAJANI='Mahj - Mahajani' + MAND_MANDAIC_MANDAEAN='Mand - Mandaic, Mandaean' + MANI_MANICHAEAN='Mani - Manichaean' + MARC_MARCHEN='Marc - Marchen' + MAYA_MAYANHIEROGLYPHS='Maya - Mayan hieroglyphs' + MEND_MENDE_KIKAKUI='Mend - Mende Kikakui' + MERC_MEROITIC_CURSIVE='Merc - Meroitic Cursive' + MERO_MEROITIC_HIEROGLYPHS='Mero - Meroitic Hieroglyphs' + MLYM_MALAYALAM='Mlym - Malayalam' + MODI_MODI_MOḌĪ='Modi - Modi, Moḍī' + MONG_MONGOLIAN='Mong - Mongolian' + MOON_MOON_MOONCODE_MOONSCRIPT_MOONTYPE='Moon - Moon (Moon code, Moon script, Moon type)' + MROO_MRO_MRU='Mroo - Mro, Mru' + MTEI_MEITEI_MAYEK_MEITHEI_MEETEI='Mtei - Meitei Mayek (Meithei, Meetei)' + MULT_MULTANI='Mult - Multani' + MYMR_MYANMAR_BURMESE='Mymr - Myanmar (Burmese)' + NARB_OLD_NORTH_ARABIAN_ANCIENT_NORTH_ARABIAN='Narb - Old North Arabian (Ancient North Arabian)' + NBAT_NABATAEAN='Nbat - Nabataean' + NEWA_NEWA_NEWAR_NEWARI='Newa - Newa, Newar, Newari' + NKGB_NAKHI_GEBA='Nkgb - Nakhi Geba' + NKOON_KO='Nkoo - N’Ko' + NSHUNÜSHU='Nshu - Nüshu' + OGAM_OGHAM='Ogam - Ogham' + OLCK_OL_CHIKI_OL_CEMET_OL_SANTALI='Olck - Ol Chiki (Ol Cemet’, Ol, Santali)' + ORKH_OLD_TURKIC_ORKHON_RUNIC='Orkh - Old Turkic, Orkhon Runic' + ORYA_ORIYA='Orya - Oriya' + OSGE_OSAGE='Osge - Osage' + OSMA_OSMANYA='Osma - Osmanya' + PALM_PALMYRENE='Palm - Palmyrene' + PAUC_PAU_CIN_HAU='Pauc - Pau Cin Hau' + PERM_OLD_PERMIC='Perm - Old Permic' + PHAG_PHAGSPA='Phag - Phags-pa' + PHLI_INSCRIPTIONAL_PAHLAVI='Phli - Inscriptional Pahlavi' + PHLP_PSALTER_PAHLAVI='Phlp - Psalter Pahlavi' + PHLV_BOOK_PAHLAVI='Phlv - Book Pahlavi' + PHNX_PHOENICIAN='Phnx - Phoenician' + PIQD_KLINGONKLIP_IQA_D='Piqd - Klingon (KLI pIqaD)' + PLRD_MIAO_POLLARD='Plrd - Miao (Pollard)' + PRTI_INSCRIPTIONAL_PARTHIAN='Prti - Inscriptional Parthian' + RJNG_REJANG_REDJANG_KAGANGA='Rjng - Rejang (Redjang, Kaganga)' + RORO_RONGORONGO='Roro - Rongorongo' + RUNR_RUNIC='Runr - Runic' + SAMR_SAMARITAN='Samr - Samaritan' + SARA_SARATI='Sara - Sarati' + SARB_OLD_SOUTH_ARABIAN='Sarb - Old South Arabian' + SAUR_SAURASHTRA='Saur - Saurashtra' + SGNW_SIGN_WRITING='Sgnw - SignWriting' + SHAW_SHAVIAN_SHAW='Shaw - Shavian (Shaw)' + SHRD_SHARADAŚĀRADĀ='Shrd - Sharada, Śāradā' + SIDD_SIDDHAM='Sidd - 
Siddham' + SIND_KHUDAWADI_SINDHI='Sind - Khudawadi, Sindhi' + SINH_SINHALA='Sinh - Sinhala' + SORA_SORA_SOMPENG='Sora - Sora Sompeng' + SUND_SUNDANESE='Sund - Sundanese' + SYLO_SYLOTI_NAGRI='Sylo - Syloti Nagri' + SYRC_SYRIAC='Syrc - Syriac' + SYRE_SYRIAC_ESTRANGELOVARIANT='Syre - Syriac (Estrangelo variant)' + SYRJ_SYRIAC_WESTERNVARIANT='Syrj - Syriac (Western variant)' + SYRN_SYRIAC_EASTERNVARIANT='Syrn - Syriac (Eastern variant)' + TAGB_TAGBANWA='Tagb - Tagbanwa' + TAKR_TAKRI='Takr - Takri' + TALE_TAI_LE='Tale - Tai Le' + TALU_NEW_TAI_LUE='Talu - New Tai Lue' + TAML_TAMIL='Taml - Tamil' + TANG_TANGUT='Tang - Tangut' + TAVT_TAI_VIET='Tavt - Tai Viet' + TELU_TELUGU='Telu - Telugu' + TENG_TENGWAR='Teng - Tengwar' + TFNG_TIFINAGH_BERBER='Tfng - Tifinagh (Berber)' + TGLG_TAGALOG_BAYBAYIN_ALIBATA='Tglg - Tagalog (Baybayin, Alibata)' + THAA_THAANA='Thaa - Thaana' + THAI_THAI='Thai - Thai' + TIBT_TIBETAN='Tibt - Tibetan' + TIRH_TIRHUTA='Tirh - Tirhuta' + UGAR_UGARITIC='Ugar - Ugaritic' + VAII_VAI='Vaii - Vai' + VISP_VISIBLE_SPEECH='Visp - Visible Speech' + WARA_WARANG_CITI_VARANG_KSHITI='Wara - Warang Citi (Varang Kshiti)' + WOLE_WOLEAI='Wole - Woleai' + XPEO_OLD_PERSIAN='Xpeo - Old Persian' + XSUX_CUNEIFORM_SUMERO_AKKADIAN='Xsux - Cuneiform, Sumero-Akkadian' + YIII_YI='Yiii - Yi' + ZINH_CODEFORINHERITEDSCRIPT='Zinh - Code for inherited script' + ZMTH_MATHEMATICALNOTATION='Zmth - Mathematical notation' + ZSYE_SYMBOLS_EMOJIVARIANT='Zsye - Symbols (Emoji variant)' + ZSYM_SYMBOLS='Zsym - Symbols' + ZXXX_CODEFORUNWRITTENDOCUMENTS='Zxxx - Code for unwritten documents' + ZYYY_CODEFORUNDETERMINEDSCRIPT='Zyyy - Code for undetermined script' + ZZZZ_CODEFORUNCODEDSCRIPT='Zzzz - Code for uncoded script' + OTHER='other' + + class TextDataTypeSimpleType(str, Enum): XSDDECIMAL='xsd:decimal' # Examples: "123.456", "+1234.456", "-1234.456", "-.456", "-456" XSDFLOAT='xsd:float' # Examples: "123.456", "+1234.456", "-1.2344e56", "-.45E-6", "INF", "-INF", "NaN" @@ -1087,10 +1524,58 @@ class UnderlineStyleSimpleType(str, Enum): OTHER='other' +class charTypeType(str, Enum): + """charTypeType -- + Type of character represented by the + grapheme, group, or non-printing character element. + + """ + BASE='base' + COMBINING='combining' + + +class imageResolutionUnitType(str, Enum): + """imageResolutionUnitType -- + Specifies the unit of the resolution information + referring to a standardised unit of measurement + (pixels per inch, pixels per centimeter or other). + + """ + PPI='PPI' + PPCM='PPCM' + OTHER='other' + + +class typeType(str, Enum): + """typeType -- + Type of metadata (e.g. 
author) + + """ + AUTHOR='author' + IMAGE_PROPERTIES='imageProperties' + PROCESSING_STEP='processingStep' + OTHER='other' + + +class typeType1(str, Enum): + LINK='link' + JOIN='join' + + +class typeType3(str, Enum): + XSDSTRING='xsd:string' + XSDINTEGER='xsd:integer' + XSDBOOLEAN='xsd:boolean' + XSDFLOAT='xsd:float' + + +# +# Start data representation classes +# class PcGtsType(GeneratedsSuper): __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('pcGtsId', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('pcGtsId', 'string', 0, 1, {'use': 'optional', 'name': 'pcGtsId'}), MemberSpec_('Metadata', 'MetadataType', 0, 0, {'name': 'Metadata', 'type': 'MetadataType'}, None), MemberSpec_('Page', 'PageType', 0, 0, {'name': 'Page', 'type': 'PageType'}, None), ] @@ -1101,7 +1586,7 @@ def __init__(self, pcGtsId=None, Metadata=None, Page=None, gds_collector_=None, self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.pcGtsId = _cast(None, pcGtsId) self.pcGtsId_nsprefix_ = "pc" self.Metadata = Metadata @@ -1135,7 +1620,7 @@ def get_pcGtsId(self): return self.pcGtsId def set_pcGtsId(self, pcGtsId): self.pcGtsId = pcGtsId - def hasContent_(self): + def has__content(self): if ( self.Metadata is not None or self.Page is not None @@ -1158,19 +1643,19 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PcGtsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PcGtsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PcGtsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PcGtsType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PcGtsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PcGtsType'): if self.pcGtsId is not None and 'pcGtsId' not in already_processed: already_processed.add('pcGtsId') outfile.write(' pcGtsId=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.pcGtsId), input_name='pcGtsId')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PcGtsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PcGtsType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -1181,7 +1666,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.Page is not None: namespaceprefix_ = self.Page_nsprefix_ + ':' if (UseCapturedNS_ and self.Page_nsprefix_) else '' self.Page.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Page', pretty_print=pretty_print) - def 
to_etree(self, parent_element=None, name_='PcGtsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='PcGtsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -1190,12 +1675,14 @@ def to_etree(self, parent_element=None, name_='PcGtsType', mapping_=None, nsmap_ element.set('pcGtsId', self.gds_format_string(self.pcGtsId)) if self.Metadata is not None: Metadata_ = self.Metadata - Metadata_.to_etree(element, name_='Metadata', mapping_=mapping_, nsmap_=nsmap_) + Metadata_.to_etree(element, name_='Metadata', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Page is not None: Page_ = self.Page - Page_.to_etree(element, name_='Page', mapping_=mapping_, nsmap_=nsmap_) + Page_.to_etree(element, name_='Page', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -1203,17 +1690,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('pcGtsId', node) if value is not None and 'pcGtsId' not in already_processed: already_processed.add('pcGtsId') self.pcGtsId = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Metadata': obj_ = MetadataType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -1303,10 +1790,20 @@ def prune_ReadingOrder(self): class MetadataType(GeneratedsSuper): - """External reference of any kind""" + """externalRef -- External reference of any kind + Created -- + The timestamp has to be in UTC (Coordinated + Universal Time) and not local time. + + * LastChange -- + The timestamp has to be in UTC + (Coordinated Universal Time) + and not local time. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('externalRef', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('externalRef', 'string', 0, 1, {'use': 'optional', 'name': 'externalRef'}), MemberSpec_('Creator', 'string', 0, 0, {'name': 'Creator', 'type': 'string'}, None), MemberSpec_('Created', 'dateTime', 0, 0, {'name': 'Created', 'type': 'dateTime'}, None), MemberSpec_('LastChange', 'dateTime', 0, 0, {'name': 'LastChange', 'type': 'dateTime'}, None), @@ -1396,7 +1893,7 @@ def get_externalRef(self): return self.externalRef def set_externalRef(self, externalRef): self.externalRef = externalRef - def hasContent_(self): + def has__content(self): if ( self.Creator is not None or self.Created is not None or @@ -1423,19 +1920,19 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MetadataType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MetadataType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MetadataType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MetadataType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MetadataType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MetadataType'): if self.externalRef is not None and 'externalRef' not in already_processed: already_processed.add('externalRef') outfile.write(' externalRef=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.externalRef), input_name='externalRef')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:None="http://www.w3.org/2001/XMLSchema" ', name_='MetadataType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:None="http://www.w3.org/2001/XMLSchema" ', name_='MetadataType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -1462,7 +1959,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for MetadataItem_ in self.MetadataItem: namespaceprefix_ = self.MetadataItem_nsprefix_ + ':' if (UseCapturedNS_ and self.MetadataItem_nsprefix_) else '' MetadataItem_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='MetadataItem', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='MetadataType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='MetadataType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -1483,11 +1980,13 @@ def to_etree(self, parent_element=None, name_='MetadataType', 
mapping_=None, nsm etree_.SubElement(element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}Comments').text = self.gds_format_string(Comments_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MetadataItem_ in self.MetadataItem: - MetadataItem_.to_etree(element, name_='MetadataItem', mapping_=mapping_, nsmap_=nsmap_) + MetadataItem_.to_etree(element, name_='MetadataItem', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -1495,17 +1994,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('externalRef', node) if value is not None and 'externalRef' not in already_processed: already_processed.add('externalRef') self.externalRef = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Creator': value_ = child_.text value_ = self.gds_parse_string(value_, node, 'Creator') @@ -1544,15 +2043,22 @@ def __hash__(self): class MetadataItemType(GeneratedsSuper): - """Type of metadata (e.g. author) - E.g. imagePhotometricInterpretation - E.g. RGB""" + """type -- + Type of metadata (e.g. author) + + * name -- + E.g. imagePhotometricInterpretation + + * value -- E.g. 
RGB + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('name', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('value', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('date', 'dateTime', 0, 1, {'use': 'optional'}), + MemberSpec_('type_', 'typeType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('name', 'string', 0, 1, {'use': 'optional', 'name': 'name'}), + MemberSpec_('value', 'string', 0, 0, {'use': 'required', 'name': 'value'}), + MemberSpec_('date', 'dateTime', 0, 1, {'use': 'optional', 'name': 'date'}), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), ] subclass = None @@ -1562,7 +2068,7 @@ def __init__(self, type_=None, name=None, value=None, date=None, Labels=None, gd self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.type_ = _cast(None, type_) self.type__nsprefix_ = "pc" self.name = _cast(None, name) @@ -1620,7 +2126,20 @@ def get_date(self): return self.date def set_date(self, date): self.date = date - def hasContent_(self): + def validate_typeType(self, value): + # Validate type typeType, a restriction on string. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['author', 'imageProperties', 'processingStep', 'other'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on typeType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False + def has__content(self): if ( self.Labels ): @@ -1642,15 +2161,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MetadataItemType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MetadataItemType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MetadataItemType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MetadataItemType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MetadataItemType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MetadataItemType'): if self.type_ is not None and 'type_' not in already_processed: already_processed.add('type_') outfile.write(' type=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.type_), input_name='type')), )) @@ -1663,7 +2182,7 @@ def exportAttributes(self, 
outfile, level, already_processed, namespaceprefix_=' if self.date is not None and 'date' not in already_processed: already_processed.add('date') outfile.write(' date="%s"' % self.gds_format_datetime(self.date, input_name='date')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MetadataItemType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MetadataItemType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -1671,7 +2190,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='MetadataItemType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='MetadataItemType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -1685,9 +2204,11 @@ def to_etree(self, parent_element=None, name_='MetadataItemType', mapping_=None, if self.date is not None: element.set('date', self.gds_format_datetime(self.date)) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -1695,16 +2216,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('type', node) if value is not None and 'type' not in already_processed: already_processed.add('type') self.type_ = value + self.validate_typeType(self.type_) # validate type typeType value = find_attr_value_('name', node) if value is not None and 'name' not in already_processed: already_processed.add('name') @@ -1720,7 +2242,7 @@ def buildAttributes(self, node, attrs, already_processed): self.date = self.gds_parse_datetime(value) except ValueError as exp: raise ValueError('Bad date-time attribute (date): %s' % exp) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Labels': obj_ = LabelsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ 
-1732,16 +2254,26 @@ def __hash__(self): class LabelsType(GeneratedsSuper): - """Reference to external model / ontology / schema - E.g. an RDF resource identifier - (to be used as subject or object of an RDF triple) - Prefix for all labels (e.g. first part of an URI)""" + """externalModel -- + Reference to external model / ontology / schema + + * externalId -- + E.g. an RDF resource identifier + (to be used as subject or object of an RDF triple) + + * prefix -- + Prefix for all labels (e.g. first part of an URI) + + * Label -- + A semantic label / tag + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('externalModel', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('externalId', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('prefix', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('externalModel', 'string', 0, 1, {'use': 'optional', 'name': 'externalModel'}), + MemberSpec_('externalId', 'string', 0, 1, {'use': 'optional', 'name': 'externalId'}), + MemberSpec_('prefix', 'string', 0, 1, {'use': 'optional', 'name': 'prefix'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('Label', 'LabelType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Label', 'type': 'LabelType'}, None), ] subclass = None @@ -1751,7 +2283,7 @@ def __init__(self, externalModel=None, externalId=None, prefix=None, comments=No self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.externalModel = _cast(None, externalModel) self.externalModel_nsprefix_ = "pc" self.externalId = _cast(None, externalId) @@ -1806,7 +2338,7 @@ def get_comments(self): return self.comments def set_comments(self, comments): self.comments = comments - def hasContent_(self): + def has__content(self): if ( self.Label ): @@ -1828,15 +2360,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LabelsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LabelsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LabelsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LabelsType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LabelsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LabelsType'): if self.externalModel is not None and 'externalModel' not in already_processed: already_processed.add('externalModel') outfile.write(' externalModel=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.externalModel), input_name='externalModel')), )) @@ -1849,7 +2381,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: 
already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LabelsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LabelsType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -1857,7 +2389,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Label_ in self.Label: namespaceprefix_ = self.Label_nsprefix_ + ':' if (UseCapturedNS_ and self.Label_nsprefix_) else '' Label_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Label', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='LabelsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='LabelsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -1871,9 +2403,11 @@ def to_etree(self, parent_element=None, name_='LabelsType', mapping_=None, nsmap if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for Label_ in self.Label: - Label_.to_etree(element, name_='Label', mapping_=mapping_, nsmap_=nsmap_) + Label_.to_etree(element, name_='Label', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -1881,12 +2415,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('externalModel', node) if value is not None and 'externalModel' not in already_processed: already_processed.add('externalModel') @@ -1903,7 +2437,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Label': obj_ = LabelType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -1915,18 +2449,23 @@ def __hash__(self): class LabelType(GeneratedsSuper): - """Semantic label + """LabelType -- Semantic label + value -- The label / tag (e.g. 'person'). Can be an RDF resource identifier (e.g. object of an RDF triple). - Additional information on the label - (e.g. 
'YYYY-mm-dd' for a date label). - Can be used as predicate of an RDF triple.""" + + * type -- + Additional information on the label + (e.g. 'YYYY-mm-dd' for a date label). + Can be used as predicate of an RDF triple. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('value', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('value', 'string', 0, 0, {'use': 'required', 'name': 'value'}), + MemberSpec_('type_', 'string', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), ] subclass = None superclass = None @@ -1935,7 +2474,7 @@ def __init__(self, value=None, type_=None, comments=None, gds_collector_=None, * self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.value = _cast(None, value) self.value_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -1969,7 +2508,7 @@ def get_comments(self): return self.comments def set_comments(self, comments): self.comments = comments - def hasContent_(self): + def has__content(self): if ( ): @@ -1991,14 +2530,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LabelType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LabelType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LabelType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LabelType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LabelType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LabelType'): if self.value is not None and 'value' not in already_processed: already_processed.add('value') outfile.write(' value=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.value), input_name='value')), )) @@ -2008,9 +2547,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LabelType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LabelType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='LabelType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='LabelType', mapping_=None, 
reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -2023,6 +2562,8 @@ def to_etree(self, parent_element=None, name_='LabelType', mapping_=None, nsmap_ element.set('comments', self.gds_format_string(self.comments)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -2030,12 +2571,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('value', node) if value is not None and 'value' not in already_processed: already_processed.add('value') @@ -2048,7 +2589,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -2056,57 +2597,102 @@ def __hash__(self): class PageType(GeneratedsSuper): - """Contains the image file name including the file extension. - Specifies the width of the image.Specifies the height of the - image.Specifies the image resolution in width.Specifies the image - resolution in height. - Specifies the unit of the resolution information - referring to a standardised unit of measurement - (pixels per inch, pixels per centimeter or other). - For generic use - The angle the rectangle encapsulating the page - (or its Border) has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - (The rotated image can be further referenced - via “AlternativeImage”.) - Range: -179.999,180 - The type of the page within the document - (e.g. cover page). - The primary language used in the page - (lower-level definitions override the page-level definition). - The secondary language used in the page - (lower-level definitions override the page-level definition). - The primary script used in the page - (lower-level definitions override the page-level definition). - The secondary script used in the page - (lower-level definitions override the page-level definition). - The direction in which text within lines - should be read (order of words and characters), - in addition to “textLineOrder” - (lower-level definitions override the page-level definition). - The order of text lines within a block, - in addition to “readingDirection” - (lower-level definitions override the page-level definition). - Confidence value for whole page (between 0 and 1)""" + """imageFilename -- + Contains the image file name including the file extension. + + * imageWidth -- Specifies the width of the image. 
+ * imageHeight -- Specifies the height of the image. + * imageXResolution -- Specifies the image resolution in width. + * imageYResolution -- Specifies the image resolution in height. + * imageResolutionUnit -- + Specifies the unit of the resolution information + referring to a standardised unit of measurement + (pixels per inch, pixels per centimeter or other). + + * custom -- For generic use + * orientation -- + The angle the rectangle encapsulating the page + (or its Border) has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + (The rotated image can be further referenced + via + “ + AlternativeImage + ” + .) + Range: -179.999,180 + + * type -- + The type of the page within the document + (e.g. cover page). + + * primaryLanguage -- + The primary language used in the page + (lower-level definitions override the page-level definition). + + * secondaryLanguage -- + The secondary language used in the page + (lower-level definitions override the page-level definition). + + * primaryScript -- + The primary script used in the page + (lower-level definitions override the page-level definition). + + * secondaryScript -- + The secondary script used in the page + (lower-level definitions override the page-level definition). + + * readingDirection -- + The direction in which text within lines + should be read (order of words and characters), + in addition to + “ + textLineOrder + ” + (lower-level definitions override the page-level definition). + + * textLineOrder -- + The order of text lines within a block, + in addition to + “ + readingDirection + ” + (lower-level definitions override the page-level definition). + + * conf -- Confidence value for whole page (between 0 and 1) + * AlternativeImage -- + Alternative document page images + (e.g. black-and-white). + + * ReadingOrder -- Order of blocks within the page. + * Layers -- + Unassigned regions are considered to be in the + (virtual) default layer which is to be treated + as below any other layers. 
+ + * TextStyle -- Default text style + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('imageFilename', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('imageWidth', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('imageHeight', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('imageXResolution', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('imageYResolution', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('imageResolutionUnit', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:PageTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('textLineOrder', 'pc:TextLineOrderSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('imageFilename', 'string', 0, 0, {'use': 'required', 'name': 'imageFilename'}), + MemberSpec_('imageWidth', 'int', 0, 0, {'use': 'required', 'name': 'imageWidth'}), + MemberSpec_('imageHeight', 'int', 0, 0, {'use': 'required', 'name': 'imageHeight'}), + MemberSpec_('imageXResolution', 'float', 0, 1, {'use': 'optional', 'name': 'imageXResolution'}), + MemberSpec_('imageYResolution', 'float', 0, 1, {'use': 'optional', 'name': 'imageYResolution'}), + MemberSpec_('imageResolutionUnit', 'imageResolutionUnitType', 0, 1, {'use': 'optional', 'name': 'imageResolutionUnit'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('type_', 'pc:PageTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryLanguage'}), + MemberSpec_('secondaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryLanguage'}), + MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryScript'}), + MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryScript'}), + MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional', 'name': 'readingDirection'}), + MemberSpec_('textLineOrder', 'pc:TextLineOrderSimpleType', 0, 1, {'use': 'optional', 'name': 'textLineOrder'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Border', 'BorderType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'Border', 'type': 'BorderType'}, None), MemberSpec_('PrintSpace', 'PrintSpaceType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'PrintSpace', 'type': 'PrintSpaceType'}, None), @@ -2139,7 +2725,7 @@ def __init__(self, imageFilename=None, imageWidth=None, imageHeight=None, imageX self.gds_elementtree_node_ = 
None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.imageFilename = _cast(None, imageFilename) self.imageFilename_nsprefix_ = "pc" self.imageWidth = _cast(int, imageWidth) @@ -2548,6 +3134,19 @@ def get_conf(self): return self.conf def set_conf(self, conf): self.conf = conf + def validate_imageResolutionUnitType(self, value): + # Validate type imageResolutionUnitType, a restriction on string. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['PPI', 'PPCM', 'other'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on imageResolutionUnitType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False def validate_PageTypeSimpleType(self, value): # Validate type pc:PageTypeSimpleType, a restriction on string. if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: @@ -2628,7 +3227,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Border is not None or @@ -2673,15 +3272,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PageType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PageType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PageType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PageType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PageType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PageType'): if self.imageFilename is not None and 'imageFilename' not in already_processed: already_processed.add('imageFilename') outfile.write(' imageFilename=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.imageFilename), input_name='imageFilename')), )) @@ -2730,7 +3329,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', 
name_='PageType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PageType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -2807,7 +3406,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for CustomRegion_ in self.CustomRegion: namespaceprefix_ = self.CustomRegion_nsprefix_ + ':' if (UseCapturedNS_ and self.CustomRegion_nsprefix_) else '' CustomRegion_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='CustomRegion', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='PageType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='PageType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -2845,62 +3444,64 @@ def to_etree(self, parent_element=None, name_='PageType', mapping_=None, nsmap_= if self.conf is not None: element.set('conf', self.gds_format_float(self.conf)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Border is not None: Border_ = self.Border - Border_.to_etree(element, name_='Border', mapping_=mapping_, nsmap_=nsmap_) + Border_.to_etree(element, name_='Border', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.PrintSpace is not None: PrintSpace_ = self.PrintSpace - PrintSpace_.to_etree(element, name_='PrintSpace', mapping_=mapping_, nsmap_=nsmap_) + PrintSpace_.to_etree(element, name_='PrintSpace', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.ReadingOrder is not None: ReadingOrder_ = self.ReadingOrder - ReadingOrder_.to_etree(element, name_='ReadingOrder', mapping_=mapping_, nsmap_=nsmap_) + ReadingOrder_.to_etree(element, name_='ReadingOrder', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Layers is not None: Layers_ = self.Layers - Layers_.to_etree(element, name_='Layers', mapping_=mapping_, nsmap_=nsmap_) + Layers_.to_etree(element, name_='Layers', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Relations is not None: Relations_ = self.Relations - Relations_.to_etree(element, name_='Relations', mapping_=mapping_, nsmap_=nsmap_) + Relations_.to_etree(element, name_='Relations', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) 
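
The hunks above thread a new reverse_mapping_ parameter through every to_etree() call: alongside the pre-existing forward map (id(object) -> etree element), a caller can now recover the generateds object that produced a given element. A minimal sketch of how the two mappings relate, assuming pcgts is an already-built PcGtsType instance (the variable name is hypothetical):

    mapping, reverse_mapping = {}, {}
    root = pcgts.to_etree(
        mapping_=mapping, reverse_mapping_=reverse_mapping,
        nsmap_={'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'})
    elem = mapping[id(pcgts)]              # object -> element (existing behaviour)
    assert reverse_mapping[elem] is pcgts  # element -> object (new in this patch)
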
for TextRegion_ in self.TextRegion: - TextRegion_.to_etree(element, name_='TextRegion', mapping_=mapping_, nsmap_=nsmap_) + TextRegion_.to_etree(element, name_='TextRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ImageRegion_ in self.ImageRegion: - ImageRegion_.to_etree(element, name_='ImageRegion', mapping_=mapping_, nsmap_=nsmap_) + ImageRegion_.to_etree(element, name_='ImageRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for LineDrawingRegion_ in self.LineDrawingRegion: - LineDrawingRegion_.to_etree(element, name_='LineDrawingRegion', mapping_=mapping_, nsmap_=nsmap_) + LineDrawingRegion_.to_etree(element, name_='LineDrawingRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for GraphicRegion_ in self.GraphicRegion: - GraphicRegion_.to_etree(element, name_='GraphicRegion', mapping_=mapping_, nsmap_=nsmap_) + GraphicRegion_.to_etree(element, name_='GraphicRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TableRegion_ in self.TableRegion: - TableRegion_.to_etree(element, name_='TableRegion', mapping_=mapping_, nsmap_=nsmap_) + TableRegion_.to_etree(element, name_='TableRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ChartRegion_ in self.ChartRegion: - ChartRegion_.to_etree(element, name_='ChartRegion', mapping_=mapping_, nsmap_=nsmap_) + ChartRegion_.to_etree(element, name_='ChartRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MapRegion_ in self.MapRegion: - MapRegion_.to_etree(element, name_='MapRegion', mapping_=mapping_, nsmap_=nsmap_) + MapRegion_.to_etree(element, name_='MapRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for SeparatorRegion_ in self.SeparatorRegion: - SeparatorRegion_.to_etree(element, name_='SeparatorRegion', mapping_=mapping_, nsmap_=nsmap_) + SeparatorRegion_.to_etree(element, name_='SeparatorRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MathsRegion_ in self.MathsRegion: - MathsRegion_.to_etree(element, name_='MathsRegion', mapping_=mapping_, nsmap_=nsmap_) + MathsRegion_.to_etree(element, name_='MathsRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ChemRegion_ in self.ChemRegion: - ChemRegion_.to_etree(element, name_='ChemRegion', mapping_=mapping_, nsmap_=nsmap_) + ChemRegion_.to_etree(element, name_='ChemRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MusicRegion_ in self.MusicRegion: - MusicRegion_.to_etree(element, name_='MusicRegion', mapping_=mapping_, nsmap_=nsmap_) + MusicRegion_.to_etree(element, name_='MusicRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for AdvertRegion_ in self.AdvertRegion: - AdvertRegion_.to_etree(element, name_='AdvertRegion', mapping_=mapping_, nsmap_=nsmap_) + AdvertRegion_.to_etree(element, name_='AdvertRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for NoiseRegion_ in self.NoiseRegion: - NoiseRegion_.to_etree(element, name_='NoiseRegion', mapping_=mapping_, nsmap_=nsmap_) + NoiseRegion_.to_etree(element, name_='NoiseRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnknownRegion_ in self.UnknownRegion: - UnknownRegion_.to_etree(element, name_='UnknownRegion', mapping_=mapping_, nsmap_=nsmap_) + UnknownRegion_.to_etree(element, name_='UnknownRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) 
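
The regenerated bindings also gain per-attribute enum validators (validate_typeType, validate_imageResolutionUnitType, ...) which _buildAttributes() now invokes right after assignment, as the next hunk shows. Note that they never raise: they only record messages on the collector, and only when Validate_simpletypes_ is enabled and a collector was passed in. A rough sketch of how that surfaces during a build, following generateDS conventions (GdsCollector_ and get_messages() are assumed from the generator's boilerplate, and page_node stands in for an lxml <pc:Page> element):

    collector = GdsCollector_()
    page = PageType.factory()
    page.build(page_node, gds_collector_=collector)
    # e.g. an imageResolutionUnit outside {PPI, PPCM, other} is reported here
    for message in collector.get_messages():
        print(message)
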
for CustomRegion_ in self.CustomRegion: - CustomRegion_.to_etree(element, name_='CustomRegion', mapping_=mapping_, nsmap_=nsmap_) + CustomRegion_.to_etree(element, name_='CustomRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -2908,12 +3509,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('imageFilename', node) if value is not None and 'imageFilename' not in already_processed: already_processed.add('imageFilename') @@ -2940,6 +3541,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'imageResolutionUnit' not in already_processed: already_processed.add('imageResolutionUnit') self.imageResolutionUnit = value + self.validate_imageResolutionUnitType(self.imageResolutionUnit) # validate type imageResolutionUnitType value = find_attr_value_('custom', node) if value is not None and 'custom' not in already_processed: already_processed.add('custom') @@ -2990,7 +3592,7 @@ def buildAttributes(self, node, attrs, already_processed): value = self.gds_parse_float(value, node, 'conf') self.conf = value self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -3362,18 +3964,22 @@ def set_orientation(self, orientation): class CoordsType(GeneratedsSuper): - """Polygon outline of the element as a path of points. + """points -- + Polygon outline of the element as a path of points. No points may lie outside the outline of its parent, which in the case of Border is the bounding rectangle of the root image. Paths are closed by convention, i.e. the last point logically connects with the first (and at least 3 points are required to span an area). Paths must be planar (i.e. must not self-intersect). 
- Confidence value (between 0 and 1)""" + + * conf -- Confidence value (between 0 and 1) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required', 'name': 'points'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), ] subclass = None superclass = None @@ -3382,7 +3988,7 @@ def __init__(self, points=None, conf=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.points = _cast(None, points) self.points_nsprefix_ = "pc" self.conf = _cast(float, conf) @@ -3436,7 +4042,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( ): @@ -3458,23 +4064,23 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CoordsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CoordsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='CoordsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='CoordsType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CoordsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CoordsType'): if self.points is not None and 'points' not in already_processed: already_processed.add('points') outfile.write(' points=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.points), input_name='points')), )) if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='CoordsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='CoordsType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='CoordsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='CoordsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -3485,6 +4091,8 @@ def to_etree(self, parent_element=None, 
name_='CoordsType', mapping_=None, nsmap element.set('conf', self.gds_format_float(self.conf)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -3492,12 +4100,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('points', node) if value is not None and 'points' not in already_processed: already_processed.add('points') @@ -3509,7 +4117,7 @@ def buildAttributes(self, node, attrs, already_processed): value = self.gds_parse_float(value, node, 'conf') self.conf = value self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -3533,28 +4141,51 @@ def set_points(self, points): class TextLineType(GeneratedsSuper): - """Overrides primaryLanguage attribute of parent text + """primaryLanguage -- + Overrides primaryLanguage attribute of parent text region - The primary script used in the text line - The secondary script used in the text line - The direction in which text within the line - should be read (order of words and characters). - Overrides the production attribute of the parent - text region - For generic use - Position (order number) of this text line within the - parent text region.""" + + * primaryScript -- + The primary script used in the text line + + * secondaryScript -- + The secondary script used in the text line + + * readingDirection -- + The direction in which text within the line + should be read (order of words and characters). + + * production -- + Overrides the production attribute of the parent + text region + + * custom -- For generic use + * index -- + Position (order number) of this text line within the + parent text region. + + * AlternativeImage -- + Alternative text line images (e.g. 
+ black-and-white) + + * Baseline -- + Multiple connected points that mark the baseline + of the glyphs + + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('index', 'int', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryLanguage'}), + MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryScript'}), + MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryScript'}), + MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional', 'name': 'readingDirection'}), + MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional', 'name': 'production'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), + MemberSpec_('index', 'int', 0, 1, {'use': 'optional', 'name': 'index'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), MemberSpec_('Baseline', 'BaselineType', 0, 1, {'minOccurs': '0', 'name': 'Baseline', 'type': 'BaselineType'}, None), @@ -3571,7 +4202,7 @@ def __init__(self, id=None, primaryLanguage=None, primaryScript=None, secondaryS self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.primaryLanguage = _cast(None, primaryLanguage) @@ -3777,7 +4408,7 @@ def validate_ProductionSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ProductionSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Coords is not None or @@ -3806,15 +4437,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextLineType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextLineType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, 
name_='TextLineType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextLineType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextLineType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextLineType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -3842,7 +4473,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.index is not None and 'index' not in already_processed: already_processed.add('index') outfile.write(' index="%s"' % self.gds_format_integer(self.index, input_name='index')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextLineType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextLineType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -3871,7 +4502,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='TextLineType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='TextLineType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -3895,27 +4526,29 @@ def to_etree(self, parent_element=None, name_='TextLineType', mapping_=None, nsm if self.index is not None: element.set('index', self.gds_format_integer(self.index)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Baseline is not None: Baseline_ = self.Baseline - Baseline_.to_etree(element, name_='Baseline', mapping_=mapping_, nsmap_=nsmap_) + Baseline_.to_etree(element, name_='Baseline', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Word_ in self.Word: - Word_.to_etree(element, name_='Word', mapping_=mapping_, nsmap_=nsmap_) + Word_.to_etree(element, name_='Word', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, 
reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -3923,12 +4556,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -3970,7 +4603,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'index' not in already_processed: already_processed.add('index') self.index = self.gds_parse_integer(value, node, 'index') - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -4062,25 +4695,42 @@ def set_Coords(self, Coords): class WordType(GeneratedsSuper): - """Overrides primaryLanguage attribute of parent line + """language -- + Overrides primaryLanguage attribute of parent line and/or text region - The primary script used in the word - The secondary script used in the word - The direction in which text within the word - should be read (order of characters). - Overrides the production attribute of the parent - text line and/or text region. - For generic use""" + + * primaryScript -- + The primary script used in the word + + * secondaryScript -- + The secondary script used in the word + + * readingDirection -- + The direction in which text within the word + should be read (order of characters). + + * production -- + Overrides the production attribute of the parent + text line and/or text region. + + * custom -- For generic use + * AlternativeImage -- + Alternative word images (e.g. 
+ black-and-white) + + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('language', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('language', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'language'}), + MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryScript'}), + MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryScript'}), + MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional', 'name': 'readingDirection'}), + MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional', 'name': 'production'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), MemberSpec_('Glyph', 'GlyphType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Glyph', 'type': 'GlyphType'}, None), @@ -4096,7 +4746,7 @@ def __init__(self, id=None, language=None, primaryScript=None, secondaryScript=N self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.language = _cast(None, language) @@ -4290,7 +4940,7 @@ def validate_ProductionSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ProductionSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Coords is not None or @@ -4318,15 +4968,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='WordType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='WordType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='WordType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='WordType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, 
name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='WordType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='WordType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -4351,7 +5001,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='WordType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='WordType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -4377,7 +5027,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='WordType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='WordType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -4399,24 +5049,26 @@ def to_etree(self, parent_element=None, name_='WordType', mapping_=None, nsmap_= if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Glyph_ in self.Glyph: - Glyph_.to_etree(element, name_='Glyph', mapping_=mapping_, nsmap_=nsmap_) + Glyph_.to_etree(element, name_='Glyph', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + 
UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -4424,12 +5076,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -4467,7 +5119,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -4554,19 +5206,34 @@ def set_Coords(self, Coords): class GlyphType(GeneratedsSuper): - """The script used for the glyph - Overrides the production attribute of the parent - word / text line / text region. - For generic use""" + """script -- + The script used for the glyph + + * production -- + Overrides the production attribute of the parent + word / text line / text region. + + * custom -- For generic use + * AlternativeImage -- + Alternative glyph images (e.g. 
+ black-and-white) + + * Graphemes -- + Container for graphemes, grapheme groups and + non-printing characters + + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('ligature', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('symbol', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('script', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('ligature', 'boolean', 0, 1, {'use': 'optional', 'name': 'ligature'}), + MemberSpec_('symbol', 'boolean', 0, 1, {'use': 'optional', 'name': 'symbol'}), + MemberSpec_('script', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'script'}), + MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional', 'name': 'production'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), MemberSpec_('Graphemes', 'GraphemesType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'Graphemes', 'type': 'GraphemesType'}, None), @@ -4582,7 +5249,7 @@ def __init__(self, id=None, ligature=None, symbol=None, script=None, production= self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.ligature = _cast(bool, ligature) @@ -4735,7 +5402,7 @@ def validate_ProductionSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ProductionSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Coords is not None or @@ -4763,15 +5430,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GlyphType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GlyphType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GlyphType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GlyphType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GlyphType'): + def _exportAttributes(self, outfile, level, 
already_processed, namespaceprefix_='', name_='GlyphType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -4793,7 +5460,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GlyphType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GlyphType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -4819,7 +5486,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GlyphType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GlyphType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -4839,25 +5506,27 @@ def to_etree(self, parent_element=None, name_='GlyphType', mapping_=None, nsmap_ if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Graphemes is not None: Graphemes_ = self.Graphemes - Graphemes_.to_etree(element, name_='Graphemes', mapping_=mapping_, nsmap_=nsmap_) + Graphemes_.to_etree(element, name_='Graphemes', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - 
Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -4865,12 +5534,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -4911,7 +5580,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -4998,22 +5667,40 @@ def set_Coords(self, Coords): class TextEquivType(GeneratedsSuper): - """Used for sort order in case multiple TextEquivs are defined. + """index -- + Used for sort order in case multiple TextEquivs are defined. The text content with the lowest index should be interpreted as the main text content. - OCR confidence value (between 0 and 1) - Type of text content (is it free text or a number, for instance). - This is only a descriptive attribute, the text type - is not checked during XML validation. - Refinement for dataType attribute. Can be a regular expression, for - instance.""" + + * conf -- OCR confidence value (between 0 and 1) + * dataType -- + Type of text content (is it free text or a number, for instance). + This is only a descriptive attribute, the text type + is not checked during XML validation. + + * dataTypeDetails -- + Refinement for dataType attribute. Can be a regular expression, for instance. + + * PlainText -- + Text in a "simple" form (ASCII or extended ASCII + as mostly used for typing). I.e. no use of + special characters for ligatures (should be + stored as two separate characters) etc. + + * Unicode -- + Correct encoding of the original, always using + the corresponding Unicode code point. I.e. + ligatures have to be represented as one + character etc. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('index', 'integer', 0, 1, {'use': 'optional'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('dataType', 'pc:TextDataTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('dataTypeDetails', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('index', 'indexType', 0, 1, {'use': 'optional', 'name': 'index'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), + MemberSpec_('dataType', 'pc:TextDataTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'dataType'}), + MemberSpec_('dataTypeDetails', 'string', 0, 1, {'use': 'optional', 'name': 'dataTypeDetails'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('PlainText', 'string', 0, 1, {'minOccurs': '0', 'name': 'PlainText', 'type': 'string'}, None), MemberSpec_('Unicode', 'string', 0, 0, {'name': 'Unicode', 'type': 'string'}, None), ] @@ -5082,6 +5769,17 @@ def get_comments(self): return self.comments def set_comments(self, comments): self.comments = comments + def validate_indexType(self, value): + # Validate type indexType, a restriction on integer. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, int): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (int)' % {"value": value, "lineno": lineno, }) + return False + if value < 0: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd minInclusive restriction on indexType' % {"value": value, "lineno": lineno} ) + result = False def validate_ConfSimpleType(self, value): # Validate type pc:ConfSimpleType, a restriction on float. 
if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: @@ -5110,7 +5808,7 @@ def validate_TextDataTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on TextDataTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.PlainText is not None or self.Unicode is not None @@ -5133,15 +5831,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextEquivType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextEquivType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextEquivType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextEquivType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextEquivType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextEquivType'): if self.index is not None and 'index' not in already_processed: already_processed.add('index') outfile.write(' index="%s"' % self.gds_format_integer(self.index, input_name='index')) @@ -5157,7 +5855,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:None="http://www.w3.org/2001/XMLSchema" ', name_='TextEquivType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:None="http://www.w3.org/2001/XMLSchema" ', name_='TextEquivType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -5170,7 +5868,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml namespaceprefix_ = self.Unicode_nsprefix_ + ':' if (UseCapturedNS_ and self.Unicode_nsprefix_) else '' showIndent(outfile, level, pretty_print) outfile.write('<%sUnicode>%s%s' % (namespaceprefix_ , self.gds_encode(self.gds_format_string(quote_xml(self.Unicode), input_name='Unicode')), namespaceprefix_ , eol_)) - def to_etree(self, parent_element=None, name_='TextEquivType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='TextEquivType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -5193,6 +5891,8 @@ def 
to_etree(self, parent_element=None, name_='TextEquivType', mapping_=None, ns etree_.SubElement(element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}Unicode').text = self.gds_format_string(Unicode_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5200,16 +5900,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('index', node) if value is not None and 'index' not in already_processed: already_processed.add('index') self.index = self.gds_parse_integer(value, node, 'index') + self.validate_indexType(self.index) # validate type indexType value = find_attr_value_('conf', node) if value is not None and 'conf' not in already_processed: already_processed.add('conf') @@ -5229,7 +5930,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'PlainText': value_ = child_.text value_ = self.gds_parse_string(value_, node, 'PlainText') @@ -5248,7 +5949,15 @@ def __hash__(self): class GridType(GeneratedsSuper): - """Matrix of grid points defining the table grid on the page.""" + """GridType -- + Matrix of grid points defining the table grid on the page. + + * GridPoints -- + One row in the grid point matrix. + Points with x,y coordinates. 
+ (note: for a table with n table rows there should be n+1 grid rows) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('GridPoints', 'GridPointsType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '2', 'name': 'GridPoints', 'type': 'GridPointsType'}, None), @@ -5260,7 +5969,7 @@ def __init__(self, GridPoints=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if GridPoints is None: self.GridPoints = [] else: @@ -5291,7 +6000,7 @@ def insert_GridPoints_at(self, index, value): self.GridPoints.insert(index, value) def replace_GridPoints_at(self, index, value): self.GridPoints[index] = value - def hasContent_(self): + def has__content(self): if ( self.GridPoints ): @@ -5313,17 +6022,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GridType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GridType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GridType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GridType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GridType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GridType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GridType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GridType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -5331,15 +6040,17 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for GridPoints_ in self.GridPoints: namespaceprefix_ = self.GridPoints_nsprefix_ + ':' if (UseCapturedNS_ and self.GridPoints_nsprefix_) else '' GridPoints_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='GridPoints', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GridType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GridType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for GridPoints_ in self.GridPoints: - GridPoints_.to_etree(element, name_='GridPoints', mapping_=mapping_, nsmap_=nsmap_) + GridPoints_.to_etree(element, name_='GridPoints', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if 
mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5347,14 +6058,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'GridPoints': obj_ = GridPointsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -5366,12 +6077,15 @@ def __hash__(self): class GridPointsType(GeneratedsSuper): - """Points with x,y coordinates. - The grid row index""" + """GridPointsType -- Points with x,y coordinates. + index -- + The grid row index + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required'}), + MemberSpec_('index', 'int', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required', 'name': 'points'}), ] subclass = None superclass = None @@ -5380,7 +6094,7 @@ def __init__(self, index=None, points=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.index = _cast(int, index) self.index_nsprefix_ = "pc" self.points = _cast(None, points) @@ -5419,7 +6133,7 @@ def validate_PointsType(self, value): self.validate_PointsType_patterns_, value): self.gds_collector_.add_message('Value "%s" does not match xsd pattern restrictions: %s' % (encode_str_2_3(value), self.validate_PointsType_patterns_, )) validate_PointsType_patterns_ = [['^(([0-9]+,[0-9]+ )+([0-9]+,[0-9]+))$']] - def hasContent_(self): + def has__content(self): if ( ): @@ -5441,23 +6155,23 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GridPointsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GridPointsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GridPointsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GridPointsType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', 
name_='GridPointsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GridPointsType'): if self.index is not None and 'index' not in already_processed: already_processed.add('index') outfile.write(' index="%s"' % self.gds_format_integer(self.index, input_name='index')) if self.points is not None and 'points' not in already_processed: already_processed.add('points') outfile.write(' points=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.points), input_name='points')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GridPointsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GridPointsType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='GridPointsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GridPointsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -5468,6 +6182,8 @@ def to_etree(self, parent_element=None, name_='GridPointsType', mapping_=None, n element.set('points', self.gds_format_string(self.points)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5475,12 +6191,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('index', node) if value is not None and 'index' not in already_processed: already_processed.add('index') @@ -5490,7 +6206,7 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('points') self.points = value self.validate_PointsType(self.points) # validate type PointsType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -5498,13 +6214,16 @@ def __hash__(self): class PrintSpaceType(GeneratedsSuper): - """Determines the effective area on the paper of a printed page. + """PrintSpaceType -- + Determines the effective area on the paper of a printed page. Its size is equal for all pages of a book (exceptions: titlepage, multipage pictures). It contains all living elements (except marginals) like body type, footnotes, headings, running titles. It does not contain pagenumber (if not part of running title), - marginals, signature mark, preview words.""" + marginals, signature mark, preview words. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), @@ -5516,7 +6235,7 @@ def __init__(self, Coords=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.Coords = Coords self.Coords_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -5538,7 +6257,7 @@ def get_Coords(self): return self.Coords def set_Coords(self, Coords): self.Coords = Coords - def hasContent_(self): + def has__content(self): if ( self.Coords is not None ): @@ -5560,17 +6279,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PrintSpaceType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PrintSpaceType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PrintSpaceType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PrintSpaceType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PrintSpaceType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PrintSpaceType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PrintSpaceType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PrintSpaceType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -5578,16 +6297,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.Coords is not None: namespaceprefix_ = self.Coords_nsprefix_ + ':' if (UseCapturedNS_ and self.Coords_nsprefix_) else '' self.Coords.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Coords', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='PrintSpaceType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='PrintSpaceType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self 
return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5595,14 +6316,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Coords': obj_ = CoordsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -5614,14 +6335,18 @@ def __hash__(self): class ReadingOrderType(GeneratedsSuper): - """Definition of the reading order within the page. + """ReadingOrderType -- + Definition of the reading order within the page. To express a reading order between elements they have to be included in an OrderedGroup. Groups may contain further groups. - Confidence value (between 0 and 1)""" + + * conf -- Confidence value (between 0 and 1) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), MemberSpec_('OrderedGroup', 'OrderedGroupType', 0, 0, {'name': 'OrderedGroup', 'type': 'OrderedGroupType'}, 2), MemberSpec_('UnorderedGroup', 'UnorderedGroupType', 0, 0, {'name': 'UnorderedGroup', 'type': 'UnorderedGroupType'}, 2), ] @@ -5632,7 +6357,7 @@ def __init__(self, conf=None, OrderedGroup=None, UnorderedGroup=None, gds_collec self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.conf = _cast(float, conf) self.conf_nsprefix_ = "pc" self.OrderedGroup = OrderedGroup @@ -5681,7 +6406,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.OrderedGroup is not None or self.UnorderedGroup is not None @@ -5704,19 +6429,19 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ReadingOrderType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ReadingOrderType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ReadingOrderType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ReadingOrderType', pretty_print=pretty_print) showIndent(outfile, level, 
pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ReadingOrderType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ReadingOrderType'): if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ReadingOrderType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ReadingOrderType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -5727,7 +6452,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.UnorderedGroup is not None: namespaceprefix_ = self.UnorderedGroup_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroup_nsprefix_) else '' self.UnorderedGroup.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroup', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='ReadingOrderType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='ReadingOrderType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -5736,12 +6461,14 @@ def to_etree(self, parent_element=None, name_='ReadingOrderType', mapping_=None, element.set('conf', self.gds_format_float(self.conf)) if self.OrderedGroup is not None: OrderedGroup_ = self.OrderedGroup - OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UnorderedGroup is not None: UnorderedGroup_ = self.UnorderedGroup - UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5749,19 +6476,19 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('conf', node) if value is not None and 'conf' not in already_processed: already_processed.add('conf') value = self.gds_parse_float(value, node, 'conf') self.conf = value 
self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'OrderedGroup': obj_ = OrderedGroupType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -5778,12 +6505,14 @@ def __hash__(self): class RegionRefIndexedType(GeneratedsSuper): - """Numbered regionPosition (order number) of this item within the current - hierarchy level.""" + """RegionRefIndexedType -- Numbered region + index -- Position (order number) of this item within the current hierarchy level. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 0, {'use': 'required'}), + MemberSpec_('index', 'int', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('regionRef', 'string', 0, 0, {'use': 'required', 'name': 'regionRef'}), ] subclass = None superclass = None @@ -5792,7 +6521,7 @@ def __init__(self, index=None, regionRef=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.index = _cast(int, index) self.index_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -5820,7 +6549,7 @@ def get_regionRef(self): return self.regionRef def set_regionRef(self, regionRef): self.regionRef = regionRef - def hasContent_(self): + def has__content(self): if ( ): @@ -5842,23 +6571,23 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionRefIndexedType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionRefIndexedType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionRefIndexedType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionRefIndexedType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionRefIndexedType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionRefIndexedType'): if self.index is not None and 'index' not in already_processed: already_processed.add('index') outfile.write(' index="%s"' % self.gds_format_integer(self.index, input_name='index')) if self.regionRef is not None and 'regionRef' not in already_processed: already_processed.add('regionRef') outfile.write(' regionRef=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.regionRef), input_name='regionRef')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionRefIndexedType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionRefIndexedType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='RegionRefIndexedType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RegionRefIndexedType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -5869,6 +6598,8 @@ def to_etree(self, parent_element=None, name_='RegionRefIndexedType', mapping_=N element.set('regionRef', self.gds_format_string(self.regionRef)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5876,12 +6607,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('index', node) if value is not None and 'index' not in already_processed: already_processed.add('index') @@ -5890,7 +6621,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'regionRef' not in already_processed: already_processed.add('regionRef') self.regionRef = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -5898,25 +6629,36 @@ def __hash__(self): class OrderedGroupIndexedType(GeneratedsSuper): - """Indexed group containing ordered elements - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - Position (order number) of this item within the - current hierarchy level. - Is this group a continuation of another group (from - previous column or page, for example)? - For generic use""" + """OrderedGroupIndexedType -- + Indexed group containing ordered elements + + * regionRef -- + Optional link to a parent region of nested regions. + The parent region doubles as reading order group. + Only the nested regions should be allowed as group members. + + * index -- + Position (order number) of this item within the + current hierarchy level. + + * continuation -- + Is this group a continuation of another group (from + previous column or page, for example)? 
+ + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional', 'name': 'regionRef'}), + MemberSpec_('index', 'int', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), + MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('RegionRefIndexed', 'RegionRefIndexedType', 1, 0, {'name': 'RegionRefIndexed', 'type': 'RegionRefIndexedType'}, 3), @@ -5930,7 +6672,7 @@ def __init__(self, id=None, regionRef=None, index=None, caption=None, type_=None self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -6073,7 +6815,7 @@ def validate_GroupTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GroupTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.UserDefined is not None or self.Labels or @@ -6099,15 +6841,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='OrderedGroupIndexedType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='OrderedGroupIndexedType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='OrderedGroupIndexedType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='OrderedGroupIndexedType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, 
namespaceprefix_='', name_='OrderedGroupIndexedType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='OrderedGroupIndexedType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -6132,7 +6874,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupIndexedType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupIndexedType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -6152,7 +6894,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='OrderedGroupIndexedType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='OrderedGroupIndexedType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -6175,17 +6917,19 @@ def to_etree(self, parent_element=None, name_='OrderedGroupIndexedType', mapping element.set('comments', self.gds_format_string(self.comments)) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for RegionRefIndexed_ in self.RegionRefIndexed: - RegionRefIndexed_.to_etree(element, name_='RegionRefIndexed', mapping_=mapping_, nsmap_=nsmap_) + RegionRefIndexed_.to_etree(element, name_='RegionRefIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for OrderedGroupIndexed_ in self.OrderedGroupIndexed: - OrderedGroupIndexed_.to_etree(element, name_='OrderedGroupIndexed', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroupIndexed_.to_etree(element, name_='OrderedGroupIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: - UnorderedGroupIndexed_.to_etree(element, name_='UnorderedGroupIndexed', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroupIndexed_.to_etree(element, name_='UnorderedGroupIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, 
nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -6193,12 +6937,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -6237,7 +6981,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserDefined': obj_ = UserDefinedType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -6341,11 +7085,16 @@ def sort_AllIndexed(self, validate_uniqueness=True): return self.get_AllIndexed() # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments - namespaceprefix_ = 'pc:' + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments + if pretty_print: + eol_ = '\n' + else: + eol_ = '' if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] def replaceWithRRI(group): @@ -6363,30 +7112,41 @@ def replaceWithRRI(group): else: cleaned.append(entry) for entry in cleaned: - entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + entry.export(outfile, level, entry.ns_prefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) # end class OrderedGroupIndexedType class UnorderedGroupIndexedType(GeneratedsSuper): - """Indexed group containing unordered elements - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. 
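# Not part of the patch: the hand-maintained index helpers above
# (get_AllIndexed()/sort_AllIndexed()) survive the regeneration. A sketch,
# assuming sort_AllIndexed() orders the indexed children by their index
# attribute (as its name and validate_uniqueness flag suggest) and that
# add_RegionRefIndexed() follows the generateDS add_<member>() convention:
from ocrd_models.ocrd_page_generateds import (
    OrderedGroupIndexedType, RegionRefIndexedType)

group = OrderedGroupIndexedType(id='ro_group', index=0)
group.add_RegionRefIndexed(RegionRefIndexedType(index=1, regionRef='r0002'))
group.add_RegionRefIndexed(RegionRefIndexedType(index=0, regionRef='r0001'))
children = group.sort_AllIndexed()
print([child.get_regionRef() for child in children])  # expected: ['r0001', 'r0002']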
- Only the nested regions should be allowed as group members. - Position (order number) of this item within the - current hierarchy level. - Is this group a continuation of another group - (from previous column or page, for example)? - For generic use""" + """UnorderedGroupIndexedType -- + Indexed group containing unordered elements + + * regionRef -- + Optional link to a parent region of nested regions. + The parent region doubles as reading order group. + Only the nested regions should be allowed as group members. + + * index -- + Position (order number) of this item within the + current hierarchy level. + + * continuation -- + Is this group a continuation of another group + (from previous column or page, for example)? + + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional', 'name': 'regionRef'}), + MemberSpec_('index', 'int', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), + MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('RegionRef', 'RegionRefType', 1, 0, {'name': 'RegionRef', 'type': 'RegionRefType'}, 4), @@ -6400,7 +7160,7 @@ def __init__(self, id=None, regionRef=None, index=None, caption=None, type_=None self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -6543,7 +7303,7 @@ def validate_GroupTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GroupTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.UserDefined is not None or self.Labels or @@ -6569,15 +7329,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, 
name_='UnorderedGroupIndexedType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnorderedGroupIndexedType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnorderedGroupIndexedType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnorderedGroupIndexedType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnorderedGroupIndexedType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnorderedGroupIndexedType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -6602,7 +7362,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnorderedGroupIndexedType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnorderedGroupIndexedType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -6622,7 +7382,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UnorderedGroup_ in self.UnorderedGroup: namespaceprefix_ = self.UnorderedGroup_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroup_nsprefix_) else '' UnorderedGroup_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroup', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='UnorderedGroupIndexedType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='UnorderedGroupIndexedType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -6645,17 +7405,19 @@ def to_etree(self, parent_element=None, name_='UnorderedGroupIndexedType', mappi element.set('comments', self.gds_format_string(self.comments)) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for RegionRef_ in self.RegionRef: - RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, nsmap_=nsmap_) + RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, 
reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for OrderedGroup_ in self.OrderedGroup: - OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnorderedGroup_ in self.UnorderedGroup: - UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -6663,12 +7425,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -6707,7 +7469,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserDefined': obj_ = UserDefinedType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -6748,7 +7510,7 @@ def get_UnorderedGroupChildren(self): class RegionRefType(GeneratedsSuper): __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('regionRef', 'string', 0, 0, {'use': 'required'}), + MemberSpec_('regionRef', 'string', 0, 0, {'use': 'required', 'name': 'regionRef'}), ] subclass = None superclass = None @@ -6757,7 +7519,7 @@ def __init__(self, regionRef=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.regionRef = _cast(None, regionRef) self.regionRef_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -6779,7 +7541,7 @@ def get_regionRef(self): return self.regionRef def set_regionRef(self, regionRef): self.regionRef = regionRef - def hasContent_(self): + def has__content(self): if ( ): @@ -6801,20 +7563,20 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionRefType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionRefType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - 
self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionRefType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionRefType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionRefType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionRefType'): if self.regionRef is not None and 'regionRef' not in already_processed: already_processed.add('regionRef') outfile.write(' regionRef=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.regionRef), input_name='regionRef')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionRefType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionRefType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='RegionRefType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RegionRefType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -6823,6 +7585,8 @@ def to_etree(self, parent_element=None, name_='RegionRefType', mapping_=None, ns element.set('regionRef', self.gds_format_string(self.regionRef)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -6830,17 +7594,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('regionRef', node) if value is not None and 'regionRef' not in already_processed: already_processed.add('regionRef') self.regionRef = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -6848,22 +7612,31 @@ def __hash__(self): class OrderedGroupType(GeneratedsSuper): - """Numbered group (contains ordered elements) - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - Is this group a continuation of another group - (from previous column or page, for example)? 
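# Not part of the patch: the renames running through these hunks
# (exportChildren -> _exportChildren, hasContent_ -> has__content,
# buildAttributes -> _buildAttributes, buildChildren -> _buildChildren)
# track the method names that newer generateDS releases emit, so any
# subclass or monkey-patch written against the old names silently stops
# being called. A sketch of porting an override (LoggingRegionRefType is
# a hypothetical subclass, not from the patch):
from ocrd_models.ocrd_page_generateds import RegionRefType

class LoggingRegionRefType(RegionRefType):
    def _buildAttributes(self, node, attrs, already_processed):
        # was buildAttributes() before the regeneration
        print('parsing RegionRef @regionRef=%s' % attrs.get('regionRef'))
        super()._buildAttributes(node, attrs, already_processed)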
- For generic use""" + """OrderedGroupType -- + Numbered group (contains ordered elements) + + * regionRef -- + Optional link to a parent region of nested regions. + The parent region doubles as reading order group. + Only the nested regions should be allowed as group members. + + * continuation -- + Is this group a continuation of another group + (from previous column or page, for example)? + + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional', 'name': 'regionRef'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), + MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('RegionRefIndexed', 'RegionRefIndexedType', 1, 0, {'name': 'RegionRefIndexed', 'type': 'RegionRefIndexedType'}, 5), @@ -6877,7 +7650,7 @@ def __init__(self, id=None, regionRef=None, caption=None, type_=None, continuati self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -7014,7 +7787,7 @@ def validate_GroupTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GroupTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.UserDefined is not None or self.Labels or @@ -7040,15 +7813,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='OrderedGroupType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='OrderedGroupType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='OrderedGroupType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='OrderedGroupType', 
pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='OrderedGroupType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='OrderedGroupType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -7070,7 +7843,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -7090,7 +7863,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='OrderedGroupType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='OrderedGroupType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -7111,17 +7884,19 @@ def to_etree(self, parent_element=None, name_='OrderedGroupType', mapping_=None, element.set('comments', self.gds_format_string(self.comments)) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for RegionRefIndexed_ in self.RegionRefIndexed: - RegionRefIndexed_.to_etree(element, name_='RegionRefIndexed', mapping_=mapping_, nsmap_=nsmap_) + RegionRefIndexed_.to_etree(element, name_='RegionRefIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for OrderedGroupIndexed_ in self.OrderedGroupIndexed: - OrderedGroupIndexed_.to_etree(element, name_='OrderedGroupIndexed', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroupIndexed_.to_etree(element, name_='OrderedGroupIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: - UnorderedGroupIndexed_.to_etree(element, 
name_='UnorderedGroupIndexed', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroupIndexed_.to_etree(element, name_='UnorderedGroupIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -7129,12 +7904,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -7169,7 +7944,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserDefined': obj_ = UserDefinedType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -7273,11 +8048,16 @@ def sort_AllIndexed(self, validate_uniqueness=True): return self.get_AllIndexed() # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments - namespaceprefix_ = 'pc:' + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments + if pretty_print: + eol_ = '\n' + else: + eol_ = '' if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] def replaceWithRRI(group): @@ -7295,27 +8075,36 @@ def replaceWithRRI(group): else: cleaned.append(entry) for entry in cleaned: - entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + entry.export(outfile, level, entry.ns_prefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) # end class OrderedGroupType class UnorderedGroupType(GeneratedsSuper): - """Numbered group 
(contains unordered elements) - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - Is this group a continuation of another group - (from previous column or page, for example)? - For generic use""" + """UnorderedGroupType -- + Numbered group (contains unordered elements) + + * regionRef -- + Optional link to a parent region of nested regions. + The parent region doubles as reading order group. + Only the nested regions should be allowed as group members. + + * continuation -- + Is this group a continuation of another group + (from previous column or page, for example)? + + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional', 'name': 'regionRef'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), + MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('RegionRef', 'RegionRefType', 1, 0, {'name': 'RegionRef', 'type': 'RegionRefType'}, 6), @@ -7329,7 +8118,7 @@ def __init__(self, id=None, regionRef=None, caption=None, type_=None, continuati self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -7466,7 +8255,7 @@ def validate_GroupTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GroupTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.UserDefined is not None or self.Labels or @@ -7492,15 +8281,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnorderedGroupType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnorderedGroupType') + if 
self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnorderedGroupType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnorderedGroupType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnorderedGroupType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnorderedGroupType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -7522,7 +8311,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnorderedGroupType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnorderedGroupType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -7542,7 +8331,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UnorderedGroup_ in self.UnorderedGroup: namespaceprefix_ = self.UnorderedGroup_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroup_nsprefix_) else '' UnorderedGroup_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroup', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='UnorderedGroupType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='UnorderedGroupType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -7563,17 +8352,19 @@ def to_etree(self, parent_element=None, name_='UnorderedGroupType', mapping_=Non element.set('comments', self.gds_format_string(self.comments)) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for RegionRef_ in self.RegionRef: - RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, nsmap_=nsmap_) + RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for OrderedGroup_ in self.OrderedGroup: - OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroup_.to_etree(element, name_='OrderedGroup', 
mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnorderedGroup_ in self.UnorderedGroup: - UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -7581,12 +8372,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -7621,7 +8412,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserDefined': obj_ = UserDefinedType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -7660,8 +8451,11 @@ def get_UnorderedGroupChildren(self): class BorderType(GeneratedsSuper): - """Border of the actual page (if the scanned image - contains parts not belonging to the page).""" + """BorderType -- + Border of the actual page (if the scanned image + contains parts not belonging to the page). 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), @@ -7673,7 +8467,7 @@ def __init__(self, Coords=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.Coords = Coords self.Coords_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -7695,7 +8489,7 @@ def get_Coords(self): return self.Coords def set_Coords(self, Coords): self.Coords = Coords - def hasContent_(self): + def has__content(self): if ( self.Coords is not None ): @@ -7717,17 +8511,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='BorderType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='BorderType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='BorderType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='BorderType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='BorderType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='BorderType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='BorderType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='BorderType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -7735,16 +8529,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.Coords is not None: namespaceprefix_ = self.Coords_nsprefix_ + ':' if (UseCapturedNS_ and self.Coords_nsprefix_) else '' self.Coords.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Coords', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='BorderType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='BorderType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, 
gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -7752,14 +8548,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Coords': obj_ = CoordsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -7785,9 +8581,12 @@ def set_Coords(self, Coords): class LayersType(GeneratedsSuper): - """Can be used to express the z-index of overlapping + """LayersType -- + Can be used to express the z-index of overlapping regions. An element with a greater z-index is always in - front of another element with lower z-index.""" + front of another element with lower z-index. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Layer', 'LayerType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '1', 'name': 'Layer', 'type': 'LayerType'}, None), @@ -7799,7 +8598,7 @@ def __init__(self, Layer=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if Layer is None: self.Layer = [] else: @@ -7830,7 +8629,7 @@ def insert_Layer_at(self, index, value): self.Layer.insert(index, value) def replace_Layer_at(self, index, value): self.Layer[index] = value - def hasContent_(self): + def has__content(self): if ( self.Layer ): @@ -7852,17 +8651,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LayersType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LayersType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LayersType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LayersType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LayersType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LayersType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LayersType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LayersType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -7870,15 +8669,17 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Layer_ in self.Layer: namespaceprefix_ = self.Layer_nsprefix_ + ':' if (UseCapturedNS_ and self.Layer_nsprefix_) else '' Layer_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Layer', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='LayersType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='LayersType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for Layer_ in self.Layer: - Layer_.to_etree(element, name_='Layer', mapping_=mapping_, nsmap_=nsmap_) + Layer_.to_etree(element, name_='Layer', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -7886,14 +8687,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Layer': obj_ = LayerType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -7907,9 +8708,9 @@ def __hash__(self): class LayerType(GeneratedsSuper): __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('zIndex', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('zIndex', 'int', 0, 0, {'use': 'required', 'name': 'zIndex'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), MemberSpec_('RegionRef', 'RegionRefType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '1', 'name': 'RegionRef', 'type': 'RegionRefType'}, None), ] subclass = None @@ -7919,7 +8720,7 @@ def __init__(self, id=None, zIndex=None, caption=None, RegionRef=None, gds_colle self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.zIndex = _cast(int, zIndex) @@ -7968,7 +8769,7 @@ def get_caption(self): return self.caption def 
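# Not part of the patch: LayersType/LayerType above model the z-order of
# overlapping regions (per the docstring, a greater zIndex is drawn in
# front). A sketch, with constructor kwargs taken from the __init__
# signatures shown in these hunks and region IDs chosen for illustration:
from ocrd_models.ocrd_page_generateds import LayersType, LayerType, RegionRefType

background = LayerType(id='layer_bg', zIndex=0,
                       RegionRef=[RegionRefType(regionRef='r_page')])
stamp = LayerType(id='layer_stamp', zIndex=1,
                  RegionRef=[RegionRefType(regionRef='r_stamp')])
layers = LayersType(Layer=[background, stamp])  # higher zIndex renders on top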
set_caption(self, caption): self.caption = caption - def hasContent_(self): + def has__content(self): if ( self.RegionRef ): @@ -7990,15 +8791,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LayerType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LayerType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LayerType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LayerType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LayerType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LayerType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -8008,7 +8809,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.caption is not None and 'caption' not in already_processed: already_processed.add('caption') outfile.write(' caption=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.caption), input_name='caption')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LayerType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LayerType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -8016,7 +8817,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for RegionRef_ in self.RegionRef: namespaceprefix_ = self.RegionRef_nsprefix_ + ':' if (UseCapturedNS_ and self.RegionRef_nsprefix_) else '' RegionRef_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='RegionRef', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='LayerType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='LayerType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -8028,9 +8829,11 @@ def to_etree(self, parent_element=None, name_='LayerType', mapping_=None, nsmap_ if self.caption is not None: element.set('caption', self.gds_format_string(self.caption)) for RegionRef_ in self.RegionRef: - RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, nsmap_=nsmap_) + RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, 
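The new reverse_mapping_ parameter threaded through every to_etree() mirrors the existing mapping_ dict: mapping_ maps id(binding object) to the created lxml element, reverse_mapping_ maps the element back to the binding object, and both are filled recursively for all children. A round-trip sketch (constructor usage is illustrative; the module path is the one this patch touches):

    from ocrd_models.ocrd_page_generateds import LayersType, LayerType

    layers = LayersType(Layer=[LayerType(id='l0', zIndex=0)])
    mapping, reverse_mapping = {}, {}
    element = layers.to_etree(mapping_=mapping, reverse_mapping_=reverse_mapping)

    assert mapping[id(layers)] is element        # object -> element
    assert reverse_mapping[element] is layers    # element -> object
    for child in element:                        # children are registered too
        print(reverse_mapping[child].id)         # 'l0'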
node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8038,12 +8841,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -8056,7 +8859,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'caption' not in already_processed: already_processed.add('caption') self.caption = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'RegionRef': obj_ = RegionRefType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -8068,11 +8871,13 @@ def __hash__(self): class BaselineType(GeneratedsSuper): - """Confidence value (between 0 and 1)""" + """conf -- Confidence value (between 0 and 1) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required', 'name': 'points'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), ] subclass = None superclass = None @@ -8081,7 +8886,7 @@ def __init__(self, points=None, conf=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.points = _cast(None, points) self.points_nsprefix_ = "pc" self.conf = _cast(float, conf) @@ -8135,7 +8940,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( ): @@ -8157,23 +8962,23 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='BaselineType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='BaselineType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='BaselineType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='BaselineType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, 
outfile, level, already_processed, namespaceprefix_='', name_='BaselineType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='BaselineType'): if self.points is not None and 'points' not in already_processed: already_processed.add('points') outfile.write(' points=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.points), input_name='points')), )) if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='BaselineType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='BaselineType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='BaselineType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='BaselineType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -8184,6 +8989,8 @@ def to_etree(self, parent_element=None, name_='BaselineType', mapping_=None, nsm element.set('conf', self.gds_format_float(self.conf)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8191,12 +8998,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('points', node) if value is not None and 'points' not in already_processed: already_processed.add('points') @@ -8208,7 +9015,7 @@ def buildAttributes(self, node, attrs, already_processed): value = self.gds_parse_float(value, node, 'conf') self.conf = value self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -8216,9 +9023,12 @@ def __hash__(self): class RelationsType(GeneratedsSuper): - """Container for one-to-one relations between layout + """RelationsType -- + Container for one-to-one relations between layout objects (for example: DropCap - paragraph, caption - - image).""" + image). 
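BaselineType remains attribute-only (required points, optional conf between 0 and 1), so export() emits a self-closing element; conf is rendered through gds_format_float. A minimal usage sketch, assuming the regenerated module imports as before:

    import sys
    from ocrd_models.ocrd_page_generateds import BaselineType

    # Serializes as <Baseline ... points="0,10 100,10" conf="0.87"/>
    baseline = BaselineType(points='0,10 100,10', conf=0.87)
    baseline.export(sys.stdout, 0, name_='Baseline')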
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Relation', 'RelationType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '1', 'name': 'Relation', 'type': 'RelationType'}, None), @@ -8230,7 +9040,7 @@ def __init__(self, Relation=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if Relation is None: self.Relation = [] else: @@ -8261,7 +9071,7 @@ def insert_Relation_at(self, index, value): self.Relation.insert(index, value) def replace_Relation_at(self, index, value): self.Relation[index] = value - def hasContent_(self): + def has__content(self): if ( self.Relation ): @@ -8283,17 +9093,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RelationsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RelationsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RelationsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RelationsType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RelationsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RelationsType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RelationsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RelationsType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -8301,15 +9111,17 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Relation_ in self.Relation: namespaceprefix_ = self.Relation_nsprefix_ + ':' if (UseCapturedNS_ and self.Relation_nsprefix_) else '' Relation_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Relation', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='RelationsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RelationsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for Relation_ in self.Relation: - Relation_.to_etree(element, name_='Relation', mapping_=mapping_, nsmap_=nsmap_) + Relation_.to_etree(element, name_='Relation', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if 
reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8317,14 +9129,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Relation': obj_ = RelationType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -8336,7 +9148,8 @@ def __hash__(self): class RelationType(GeneratedsSuper): - """One-to-one relation between to layout object. Use 'link' + """RelationType -- + One-to-one relation between to layout object. Use 'link' for loose relations and 'join' for strong relations (where something is fragmented for instance). Examples for 'link': caption - image floating - @@ -8350,13 +9163,17 @@ class RelationType(GeneratedsSuper): pragraph is split across columns and the last word of the first paragraph DOES continue in the second paragraph) - For generic use""" + + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('type_', 'typeType1', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('SourceRegionRef', 'RegionRefType', 0, 0, {'maxOccurs': '1', 'minOccurs': '1', 'name': 'SourceRegionRef', 'type': 'RegionRefType'}, None), MemberSpec_('TargetRegionRef', 'RegionRefType', 0, 0, {'maxOccurs': '1', 'minOccurs': '1', 'name': 'TargetRegionRef', 'type': 'RegionRefType'}, None), @@ -8368,7 +9185,7 @@ def __init__(self, id=None, type_=None, custom=None, comments=None, Labels=None, self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -8435,7 +9252,20 @@ def get_comments(self): return self.comments def set_comments(self, comments): self.comments = comments - def hasContent_(self): + def validate_typeType1(self, value): + # Validate type typeType1, a restriction on string. 
+ if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['link', 'join'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on typeType1' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False + def has__content(self): if ( self.Labels or self.SourceRegionRef is not None or @@ -8459,15 +9289,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RelationType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RelationType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RelationType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RelationType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RelationType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RelationType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -8480,7 +9310,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RelationType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RelationType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -8494,7 +9324,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.TargetRegionRef is not None: namespaceprefix_ = self.TargetRegionRef_nsprefix_ + ':' if (UseCapturedNS_ and self.TargetRegionRef_nsprefix_) else '' self.TargetRegionRef.export(outfile, level, namespaceprefix_, namespacedef_='', name_='TargetRegionRef', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='RelationType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RelationType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = 
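The new validate_typeType1 enforces the schema enumeration ('link', 'join') for Relation/@type, but only by recording messages on the collector; it never raises. A sketch of surfacing a violation at parse time (GdsCollector_ and the factory()/build() entry points are standard generateDS output and assumed to be present):

    from lxml import etree
    from ocrd_models.ocrd_page_generateds import RelationType, GdsCollector_

    PC = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'
    node = etree.fromstring(
        '<Relation xmlns="%s" id="r0" type="merge"/>' % PC)  # not 'link'/'join'
    collector = GdsCollector_()
    relation = RelationType.factory()
    relation.build(node, gds_collector_=collector)
    for message in collector.get_messages():
        print(message)  # reports the enumeration restriction on typeType1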
etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -8508,15 +9338,17 @@ def to_etree(self, parent_element=None, name_='RelationType', mapping_=None, nsm if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.SourceRegionRef is not None: SourceRegionRef_ = self.SourceRegionRef - SourceRegionRef_.to_etree(element, name_='SourceRegionRef', mapping_=mapping_, nsmap_=nsmap_) + SourceRegionRef_.to_etree(element, name_='SourceRegionRef', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TargetRegionRef is not None: TargetRegionRef_ = self.TargetRegionRef - TargetRegionRef_.to_etree(element, name_='TargetRegionRef', mapping_=mapping_, nsmap_=nsmap_) + TargetRegionRef_.to_etree(element, name_='TargetRegionRef', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8524,12 +9356,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -8538,6 +9370,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'type' not in already_processed: already_processed.add('type') self.type_ = value + self.validate_typeType1(self.type_) # validate type typeType1 value = find_attr_value_('custom', node) if value is not None and 'custom' not in already_processed: already_processed.add('custom') @@ -8546,7 +9379,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Labels': obj_ = LabelsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -8568,49 +9401,69 @@ def __hash__(self): class TextStyleType(GeneratedsSuper): - """Monospace (fixed-pitch, non-proportional) or + """TextStyleType -- + Monospace (fixed-pitch, non-proportional) or proportional font. - For instance: Arial, Times New Roman. - Add more information if necessary - (e.g. blackletter, antiqua). - Serif or sans-serif typeface. - The size of the characters in points. - The x-height or corpus size refers to the distance - between the baseline and the mean line of - lower-case letters in a typeface. 
- The unit is assumed to be pixels. - The degree of space (in points) between - the characters in a string of text. - Text colour in RGB encoded format - (red value) + (256 x green value) + (65536 x blue value). - Background colour - Background colour in RGB encoded format - (red value) + (256 x green value) + (65536 x blue value). - Specifies whether the colour of the text appears - reversed against a background colour. - Line style details if "underlined" is TRUE""" + + * fontFamily -- + For instance: Arial, Times New Roman. + Add more information if necessary + (e.g. blackletter, antiqua). + + * serif -- + Serif or sans-serif typeface. + + * fontSize -- + The size of the characters in points. + + * xHeight -- + The x-height or corpus size refers to the distance + between the baseline and the mean line of + lower-case letters in a typeface. + The unit is assumed to be pixels. + + * kerning -- + The degree of space (in points) between + the characters in a string of text. + + * textColourRgb -- + Text colour in RGB encoded format + (red value) + (256 x green value) + (65536 x blue value). + + * bgColour -- Background colour + * bgColourRgb -- + Background colour in RGB encoded format + (red value) + (256 x green value) + (65536 x blue value). + + * reverseVideo -- + Specifies whether the colour of the text appears + reversed against a background colour. + + * underlineStyle -- Line style details if "underlined" is TRUE + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('fontFamily', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('serif', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('monospace', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('fontSize', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('xHeight', 'integer', 0, 1, {'use': 'optional'}), - MemberSpec_('kerning', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('textColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('textColourRgb', 'integer', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColourRgb', 'integer', 0, 1, {'use': 'optional'}), - MemberSpec_('reverseVideo', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('bold', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('italic', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('underlined', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('underlineStyle', 'pc:UnderlineStyleSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('subscript', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('superscript', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('strikethrough', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('smallCaps', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('letterSpaced', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('fontFamily', 'string', 0, 1, {'use': 'optional', 'name': 'fontFamily'}), + MemberSpec_('serif', 'boolean', 0, 1, {'use': 'optional', 'name': 'serif'}), + MemberSpec_('monospace', 'boolean', 0, 1, {'use': 'optional', 'name': 'monospace'}), + MemberSpec_('fontSize', 'float', 0, 1, {'use': 'optional', 'name': 'fontSize'}), + MemberSpec_('xHeight', 'integer', 0, 1, {'use': 'optional', 'name': 'xHeight'}), + MemberSpec_('kerning', 'int', 0, 1, {'use': 'optional', 'name': 'kerning'}), + MemberSpec_('textColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'textColour'}), + MemberSpec_('textColourRgb', 'integer', 0, 1, {'use': 'optional', 'name': 
'textColourRgb'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('bgColourRgb', 'integer', 0, 1, {'use': 'optional', 'name': 'bgColourRgb'}), + MemberSpec_('reverseVideo', 'boolean', 0, 1, {'use': 'optional', 'name': 'reverseVideo'}), + MemberSpec_('bold', 'boolean', 0, 1, {'use': 'optional', 'name': 'bold'}), + MemberSpec_('italic', 'boolean', 0, 1, {'use': 'optional', 'name': 'italic'}), + MemberSpec_('underlined', 'boolean', 0, 1, {'use': 'optional', 'name': 'underlined'}), + MemberSpec_('underlineStyle', 'pc:UnderlineStyleSimpleType', 0, 1, {'use': 'optional', 'name': 'underlineStyle'}), + MemberSpec_('subscript', 'boolean', 0, 1, {'use': 'optional', 'name': 'subscript'}), + MemberSpec_('superscript', 'boolean', 0, 1, {'use': 'optional', 'name': 'superscript'}), + MemberSpec_('strikethrough', 'boolean', 0, 1, {'use': 'optional', 'name': 'strikethrough'}), + MemberSpec_('smallCaps', 'boolean', 0, 1, {'use': 'optional', 'name': 'smallCaps'}), + MemberSpec_('letterSpaced', 'boolean', 0, 1, {'use': 'optional', 'name': 'letterSpaced'}), ] subclass = None superclass = None @@ -8619,7 +9472,7 @@ def __init__(self, fontFamily=None, serif=None, monospace=None, fontSize=None, x self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.fontFamily = _cast(None, fontFamily) self.fontFamily_nsprefix_ = "pc" self.serif = _cast(bool, serif) @@ -8781,7 +9634,7 @@ def validate_UnderlineStyleSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on UnderlineStyleSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( ): @@ -8803,14 +9656,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextStyleType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextStyleType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextStyleType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextStyleType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextStyleType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextStyleType'): if self.fontFamily is not None and 'fontFamily' not in already_processed: already_processed.add('fontFamily') outfile.write(' fontFamily=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.fontFamily), input_name='fontFamily')), )) @@ -8871,9 +9724,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.letterSpaced is not None and 'letterSpaced' not in already_processed: already_processed.add('letterSpaced') outfile.write(' letterSpaced="%s"' % 
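Besides the docstring restructuring (one "* attribute -- description" bullet per schema attribute), every MemberSpec_ now carries an explicit 'name' key, which makes the member metadata uniformly introspectable. A sketch (get_name()/get_data_type() are the usual MemberSpec_ accessors in generateDS output; treat them as an assumption):

    from ocrd_models.ocrd_page_generateds import TextStyleType

    for spec in TextStyleType.member_data_items_:
        # e.g. fontFamily string, serif boolean, ..., letterSpaced boolean
        print(spec.get_name(), spec.get_data_type())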
self.gds_format_boolean(self.letterSpaced, input_name='letterSpaced')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextStyleType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextStyleType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='TextStyleType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='TextStyleType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -8920,6 +9773,8 @@ def to_etree(self, parent_element=None, name_='TextStyleType', mapping_=None, ns element.set('letterSpaced', self.gds_format_boolean(self.letterSpaced)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8927,12 +9782,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('fontFamily', node) if value is not None and 'fontFamily' not in already_processed: already_processed.add('fontFamily') @@ -9072,7 +9927,7 @@ def buildAttributes(self, node, attrs, already_processed): self.letterSpaced = False else: raise_parse_error(node, 'Bad boolean attribute') - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -9080,15 +9935,27 @@ def __hash__(self): class RegionType(GeneratedsSuper): - """For generic use + """custom -- For generic use + continuation -- Is this region a continuation of another region - (in previous column or page, for example)?""" + (in previous column or page, for example)? + + * AlternativeImage -- + Alternative region images + (e.g. black-and-white). + + * Labels -- Semantic labels / tags + * Roles -- + Roles the region takes + (e.g. in context of a parent region). 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), @@ -9116,7 +9983,7 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.custom = _cast(None, custom) @@ -9417,7 +10284,7 @@ def set_continuation(self, continuation): self.continuation = continuation def get_extensiontype_(self): return self.extensiontype_ def set_extensiontype_(self, extensiontype_): self.extensiontype_ = extensiontype_ - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Coords is not None or @@ -9457,15 +10324,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -9486,7 +10353,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' outfile.write(' xsi:type="%s%s"' % (imported_ns_type_prefix_, self.extensiontype_)) else: outfile.write(' xsi:type="%s"' % self.extensiontype_) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionType', fromsubclass_=False, 
pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -9548,7 +10415,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for CustomRegion_ in self.CustomRegion: namespaceprefix_ = self.CustomRegion_nsprefix_ + ':' if (UseCapturedNS_ and self.CustomRegion_nsprefix_) else '' CustomRegion_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='CustomRegion', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='RegionType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -9564,48 +10431,50 @@ def to_etree(self, parent_element=None, name_='RegionType', mapping_=None, nsmap if self.continuation is not None: element.set('continuation', self.gds_format_boolean(self.continuation)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Roles is not None: Roles_ = self.Roles - Roles_.to_etree(element, name_='Roles', mapping_=mapping_, nsmap_=nsmap_) + Roles_.to_etree(element, name_='Roles', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextRegion_ in self.TextRegion: - TextRegion_.to_etree(element, name_='TextRegion', mapping_=mapping_, nsmap_=nsmap_) + TextRegion_.to_etree(element, name_='TextRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ImageRegion_ in self.ImageRegion: - ImageRegion_.to_etree(element, name_='ImageRegion', mapping_=mapping_, nsmap_=nsmap_) + ImageRegion_.to_etree(element, name_='ImageRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for LineDrawingRegion_ in self.LineDrawingRegion: - LineDrawingRegion_.to_etree(element, name_='LineDrawingRegion', mapping_=mapping_, nsmap_=nsmap_) + LineDrawingRegion_.to_etree(element, name_='LineDrawingRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for GraphicRegion_ in self.GraphicRegion: - GraphicRegion_.to_etree(element, name_='GraphicRegion', mapping_=mapping_, nsmap_=nsmap_) + GraphicRegion_.to_etree(element, name_='GraphicRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TableRegion_ in self.TableRegion: - 
TableRegion_.to_etree(element, name_='TableRegion', mapping_=mapping_, nsmap_=nsmap_) + TableRegion_.to_etree(element, name_='TableRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ChartRegion_ in self.ChartRegion: - ChartRegion_.to_etree(element, name_='ChartRegion', mapping_=mapping_, nsmap_=nsmap_) + ChartRegion_.to_etree(element, name_='ChartRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for SeparatorRegion_ in self.SeparatorRegion: - SeparatorRegion_.to_etree(element, name_='SeparatorRegion', mapping_=mapping_, nsmap_=nsmap_) + SeparatorRegion_.to_etree(element, name_='SeparatorRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MathsRegion_ in self.MathsRegion: - MathsRegion_.to_etree(element, name_='MathsRegion', mapping_=mapping_, nsmap_=nsmap_) + MathsRegion_.to_etree(element, name_='MathsRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ChemRegion_ in self.ChemRegion: - ChemRegion_.to_etree(element, name_='ChemRegion', mapping_=mapping_, nsmap_=nsmap_) + ChemRegion_.to_etree(element, name_='ChemRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MusicRegion_ in self.MusicRegion: - MusicRegion_.to_etree(element, name_='MusicRegion', mapping_=mapping_, nsmap_=nsmap_) + MusicRegion_.to_etree(element, name_='MusicRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for AdvertRegion_ in self.AdvertRegion: - AdvertRegion_.to_etree(element, name_='AdvertRegion', mapping_=mapping_, nsmap_=nsmap_) + AdvertRegion_.to_etree(element, name_='AdvertRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for NoiseRegion_ in self.NoiseRegion: - NoiseRegion_.to_etree(element, name_='NoiseRegion', mapping_=mapping_, nsmap_=nsmap_) + NoiseRegion_.to_etree(element, name_='NoiseRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnknownRegion_ in self.UnknownRegion: - UnknownRegion_.to_etree(element, name_='UnknownRegion', mapping_=mapping_, nsmap_=nsmap_) + UnknownRegion_.to_etree(element, name_='UnknownRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for CustomRegion_ in self.CustomRegion: - CustomRegion_.to_etree(element, name_='CustomRegion', mapping_=mapping_, nsmap_=nsmap_) + CustomRegion_.to_etree(element, name_='CustomRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -9613,12 +10482,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -9644,7 +10513,7 @@ def buildAttributes(self, node, attrs, 
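The parse side mirrors the export side: build() now delegates to _buildAttributes() and _buildChildren(), which still dispatch on the child tag name. A round-trip sketch for a small fragment (illustrative values only):

    from lxml import etree
    from ocrd_models.ocrd_page_generateds import LayersType

    PC = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'
    node = etree.fromstring(
        '<Layers xmlns="%s">'
        '<Layer id="l0" zIndex="0"><RegionRef regionRef="r0"/></Layer>'
        '</Layers>' % PC)
    layers = LayersType.factory()
    layers.build(node)  # internally _buildAttributes + _buildChildren
    print(layers.Layer[0].zIndex, layers.Layer[0].RegionRef[0].regionRef)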
already_processed): if value is not None and 'xsi:type' not in already_processed: already_processed.add('xsi:type') self.extensiontype_ = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -9791,12 +10660,14 @@ def set_Coords(self, Coords): class AlternativeImageType(GeneratedsSuper): - """Confidence value (between 0 and 1)""" + """conf -- Confidence value (between 0 and 1) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('filename', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('filename', 'string', 0, 0, {'use': 'required', 'name': 'filename'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), ] subclass = None superclass = None @@ -9805,7 +10676,7 @@ def __init__(self, filename=None, comments=None, conf=None, gds_collector_=None, self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.filename = _cast(None, filename) self.filename_nsprefix_ = "pc" self.comments = _cast(None, comments) @@ -9854,7 +10725,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( ): @@ -9876,14 +10747,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AlternativeImageType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AlternativeImageType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AlternativeImageType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AlternativeImageType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='AlternativeImageType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='AlternativeImageType'): if self.filename is not None and 'filename' not in already_processed: already_processed.add('filename') outfile.write(' filename=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.filename), input_name='filename')), )) @@ -9893,9 +10764,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' 
conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AlternativeImageType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AlternativeImageType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='AlternativeImageType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='AlternativeImageType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -9908,6 +10779,8 @@ def to_etree(self, parent_element=None, name_='AlternativeImageType', mapping_=N element.set('conf', self.gds_format_float(self.conf)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -9915,12 +10788,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('filename', node) if value is not None and 'filename' not in already_processed: already_processed.add('filename') @@ -9935,7 +10808,7 @@ def buildAttributes(self, node, attrs, already_processed): value = self.gds_parse_float(value, node, 'conf') self.conf = value self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -9943,8 +10816,11 @@ def __hash__(self): class GraphemesType(GeneratedsSuper): - """Container for graphemes, grapheme groups and - non-printing characters.""" + """GraphemesType -- + Container for graphemes, grapheme groups and + non-printing characters. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Grapheme', 'GraphemeType', 1, 0, {'name': 'Grapheme', 'type': 'GraphemeType'}, 8), @@ -9958,7 +10834,7 @@ def __init__(self, Grapheme=None, NonPrintingChar=None, GraphemeGroup=None, gds_ self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if Grapheme is None: self.Grapheme = [] else: @@ -10019,7 +10895,7 @@ def insert_GraphemeGroup_at(self, index, value): self.GraphemeGroup.insert(index, value) def replace_GraphemeGroup_at(self, index, value): self.GraphemeGroup[index] = value - def hasContent_(self): + def has__content(self): if ( self.Grapheme or self.NonPrintingChar or @@ -10043,17 +10919,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemesType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemesType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemesType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemesType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemesType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemesType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemesType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemesType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -10067,19 +10943,21 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for GraphemeGroup_ in self.GraphemeGroup: namespaceprefix_ = self.GraphemeGroup_nsprefix_ + ':' if (UseCapturedNS_ and self.GraphemeGroup_nsprefix_) else '' GraphemeGroup_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='GraphemeGroup', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GraphemesType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GraphemesType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for Grapheme_ in self.Grapheme: - Grapheme_.to_etree(element, name_='Grapheme', mapping_=mapping_, nsmap_=nsmap_) + Grapheme_.to_etree(element, name_='Grapheme', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for NonPrintingChar_ in 
self.NonPrintingChar: - NonPrintingChar_.to_etree(element, name_='NonPrintingChar', mapping_=mapping_, nsmap_=nsmap_) + NonPrintingChar_.to_etree(element, name_='NonPrintingChar', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for GraphemeGroup_ in self.GraphemeGroup: - GraphemeGroup_.to_etree(element, name_='GraphemeGroup', mapping_=mapping_, nsmap_=nsmap_) + GraphemeGroup_.to_etree(element, name_='GraphemeGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10087,14 +10965,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Grapheme': obj_ = GraphemeType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -10116,20 +10994,29 @@ def __hash__(self): class GraphemeBaseType(GeneratedsSuper): - """Base type for graphemes, grapheme groups and non-printing characters. - Order index of grapheme, group, or non-printing character - within the parent container (graphemes or glyph or grapheme group). - Type of character represented by the - grapheme, group, or non-printing character element. - For generic useFor generic use""" + """GraphemeBaseType -- + Base type for graphemes, grapheme groups and non-printing characters. + + * index -- + Order index of grapheme, group, or non-printing character + within the parent container (graphemes or glyph or grapheme group). + + * charType -- + Type of character represented by the + grapheme, group, or non-printing character element. 
+ + * custom -- For generic use + * comments -- For generic use + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('ligature', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('charType', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('index', 'indexType2', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('ligature', 'boolean', 0, 1, {'use': 'optional', 'name': 'ligature'}), + MemberSpec_('charType', 'charTypeType', 0, 1, {'use': 'optional', 'name': 'charType'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('TextEquiv', 'TextEquivType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'TextEquiv', 'type': 'TextEquivType'}, None), ] subclass = None @@ -10139,7 +11026,7 @@ def __init__(self, id=None, index=None, ligature=None, charType=None, custom=Non self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.index = _cast(int, index) @@ -10209,7 +11096,31 @@ def set_comments(self, comments): self.comments = comments def get_extensiontype_(self): return self.extensiontype_ def set_extensiontype_(self, extensiontype_): self.extensiontype_ = extensiontype_ - def hasContent_(self): + def validate_indexType2(self, value): + # Validate type indexType2, a restriction on int. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, int): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (int)' % {"value": value, "lineno": lineno, }) + return False + if value < 0: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd minInclusive restriction on indexType2' % {"value": value, "lineno": lineno} ) + result = False + def validate_charTypeType(self, value): + # Validate type charTypeType, a restriction on string. 
+ if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['base', 'combining'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on charTypeType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False + def has__content(self): if ( self.TextEquiv ): @@ -10231,15 +11142,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeBaseType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeBaseType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeBaseType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeBaseType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeBaseType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeBaseType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -10266,7 +11177,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' outfile.write(' xsi:type="%s%s"' % (imported_ns_type_prefix_, self.extensiontype_)) else: outfile.write(' xsi:type="%s"' % self.extensiontype_) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeBaseType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeBaseType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -10274,7 +11185,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for TextEquiv_ in self.TextEquiv: namespaceprefix_ = self.TextEquiv_nsprefix_ + ':' if (UseCapturedNS_ and self.TextEquiv_nsprefix_) else '' TextEquiv_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='TextEquiv', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GraphemeBaseType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GraphemeBaseType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -10294,9 +11205,11 @@ 
def to_etree(self, parent_element=None, name_='GraphemeBaseType', mapping_=None, if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10304,12 +11217,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -10318,6 +11231,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'index' not in already_processed: already_processed.add('index') self.index = self.gds_parse_integer(value, node, 'index') + self.validate_indexType2(self.index) # validate type indexType2 value = find_attr_value_('ligature', node) if value is not None and 'ligature' not in already_processed: already_processed.add('ligature') @@ -10331,6 +11245,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'charType' not in already_processed: already_processed.add('charType') self.charType = value + self.validate_charTypeType(self.charType) # validate type charTypeType value = find_attr_value_('custom', node) if value is not None and 'custom' not in already_processed: already_processed.add('custom') @@ -10343,7 +11258,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'xsi:type' not in already_processed: already_processed.add('xsi:type') self.extensiontype_ = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'TextEquiv': obj_ = TextEquivType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -10355,9 +11270,12 @@ def __hash__(self): class GraphemeType(GraphemeBaseType): - """Represents a sub-element of a glyph. + """GraphemeType -- + Represents a sub-element of a glyph. Smallest graphical unit that can be - assigned a Unicode code point.""" + assigned a Unicode code point. 
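The hunks above show the signature change that runs through every generated to_etree(): in addition to the existing mapping_ dict (filled with id(object) -> etree element), callers can now pass a reverse_mapping_ dict that gets filled with the inverse relation, etree element -> generateds object. A minimal usage sketch follows; the ocrd_models.ocrd_page import path and the parseString helper are assumptions based on common OCR-D conventions, not something this patch shows:

    # Sketch only: build both directions of the object/element correspondence.
    from ocrd_models.ocrd_page import parseString  # assumed re-export of the generated parser

    with open('page.xml', 'rb') as f:
        pcgts = parseString(f.read(), silence=True)  # silence=True suppresses the echo to stdout
    mapping, reverse_mapping = {}, {}
    root = pcgts.to_etree(mapping_=mapping, reverse_mapping_=reverse_mapping)
    # mapping:         id(generateds object) -> etree element
    # reverse_mapping: etree element         -> generateds object
    ns = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}'
    for element in root.iter(ns + 'Grapheme'):
        obj = reverse_mapping[element]  # navigate back from the XML node to the model object
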
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), @@ -10369,8 +11287,8 @@ def __init__(self, id=None, index=None, ligature=None, charType=None, custom=Non self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(GraphemeType, self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("GraphemeType"), self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) self.Coords = Coords self.Coords_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -10392,10 +11310,10 @@ def get_Coords(self): return self.Coords def set_Coords(self, Coords): self.Coords = Coords - def hasContent_(self): + def has__content(self): if ( self.Coords is not None or - super(GraphemeType, self).hasContent_() + super(GraphemeType, self).has__content() ): return True else: @@ -10415,18 +11333,18 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeType'): - super(GraphemeType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeType') - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeType', fromsubclass_=False, pretty_print=True): - super(GraphemeType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeType'): + super(GraphemeType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeType') + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeType', fromsubclass_=False, pretty_print=True): + super(GraphemeType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) if pretty_print: eol_ = '\n' else: @@ -10434,13 +11352,15 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.Coords is not None: namespaceprefix_ = self.Coords_nsprefix_ + ':' if (UseCapturedNS_ and self.Coords_nsprefix_) else '' self.Coords.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Coords', pretty_print=pretty_print) - def 
to_etree(self, parent_element=None, name_='GraphemeType', mapping_=None, nsmap_=None): - element = super(GraphemeType, self).to_etree(parent_element, name_, mapping_) + def to_etree(self, parent_element=None, name_='GraphemeType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(GraphemeType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10448,30 +11368,33 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): - super(GraphemeType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildAttributes(self, node, attrs, already_processed): + super(GraphemeType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Coords': obj_ = CoordsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) self.Coords = obj_ obj_.original_tagname_ = 'Coords' - super(GraphemeType, self).buildChildren(child_, node, nodeName_, True) + super(GraphemeType, self)._buildChildren(child_, node, nodeName_, True) def __hash__(self): return hash(self.id) # end class GraphemeType class NonPrintingCharType(GraphemeBaseType): - """A glyph component without visual representation + """NonPrintingCharType -- + A glyph component without visual representation but with Unicode code point. Non-visual / non-printing / control character. - Part of grapheme container (of glyph) or grapheme sub group.""" + Part of grapheme container (of glyph) or grapheme sub group. 
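Two mechanical patterns in these hunks deserve a note. First, every super(SomeType, self).__init__(...) becomes super(globals().get("SomeType"), self).__init__(...), apparently so the call keeps working when the generated class names are rebound, for instance through the CurrentSubclassModule_/getSubclassFromModule_ mechanism visible in the factory() methods. Second, and more important for downstream code, the hook methods lose their public spelling: hasContent_ becomes has__content, and exportAttributes/exportChildren/buildAttributes/buildChildren become _exportAttributes/_exportChildren/_buildAttributes/_buildChildren. Subclasses that override the old names will silently stop being called, because the generated build() and export() now dispatch to the underscored names. An illustrative subclass (not part of this patch) showing the renamed hook:

    # Illustrative subclass: the override must use the new underscored name.
    class CountingGraphemeGroup(GraphemeGroupType):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.child_count = 0
        def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
            # Overriding 'buildChildren' (the old name) would no longer have any effect.
            self.child_count += 1
            super()._buildChildren(child_, node, nodeName_, fromsubclass_,
                                   gds_collector_=gds_collector_)
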
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ ] @@ -10482,8 +11405,8 @@ def __init__(self, id=None, index=None, ligature=None, charType=None, custom=Non self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(NonPrintingCharType, self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("NonPrintingCharType"), self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) def factory(*args_, **kwargs_): if CurrentSubclassModule_ is not None: subclass = getSubclassFromModule_( @@ -10499,9 +11422,9 @@ def get_ns_prefix_(self): return self.ns_prefix_ def set_ns_prefix_(self, ns_prefix): self.ns_prefix_ = ns_prefix - def hasContent_(self): + def has__content(self): if ( - super(NonPrintingCharType, self).hasContent_() + super(NonPrintingCharType, self).has__content() ): return True else: @@ -10521,22 +11444,24 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NonPrintingCharType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NonPrintingCharType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='NonPrintingCharType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='NonPrintingCharType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='NonPrintingCharType'): - super(NonPrintingCharType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NonPrintingCharType') - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='NonPrintingCharType', fromsubclass_=False, pretty_print=True): - super(NonPrintingCharType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='NonPrintingCharType', mapping_=None, nsmap_=None): - element = super(NonPrintingCharType, self).to_etree(parent_element, name_, mapping_) + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='NonPrintingCharType'): + super(NonPrintingCharType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NonPrintingCharType') + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='NonPrintingCharType', fromsubclass_=False, pretty_print=True): + super(NonPrintingCharType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='NonPrintingCharType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(NonPrintingCharType, 
self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10544,15 +11469,15 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): - super(NonPrintingCharType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(NonPrintingCharType, self).buildChildren(child_, node, nodeName_, True) + def _buildAttributes(self, node, attrs, already_processed): + super(NonPrintingCharType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(NonPrintingCharType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -10572,8 +11497,8 @@ def __init__(self, id=None, index=None, ligature=None, charType=None, custom=Non self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(GraphemeGroupType, self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("GraphemeGroupType"), self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) if Grapheme is None: self.Grapheme = [] else: @@ -10619,11 +11544,11 @@ def insert_NonPrintingChar_at(self, index, value): self.NonPrintingChar.insert(index, value) def replace_NonPrintingChar_at(self, index, value): self.NonPrintingChar[index] = value - def hasContent_(self): + def has__content(self): if ( self.Grapheme or self.NonPrintingChar or - super(GraphemeGroupType, self).hasContent_() + super(GraphemeGroupType, self).has__content() ): return True else: @@ -10643,18 +11568,18 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeGroupType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeGroupType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeGroupType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeGroupType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeGroupType'): 
- super(GraphemeGroupType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeGroupType') - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeGroupType', fromsubclass_=False, pretty_print=True): - super(GraphemeGroupType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeGroupType'): + super(GraphemeGroupType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeGroupType') + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeGroupType', fromsubclass_=False, pretty_print=True): + super(GraphemeGroupType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) if pretty_print: eol_ = '\n' else: @@ -10665,14 +11590,16 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for NonPrintingChar_ in self.NonPrintingChar: namespaceprefix_ = self.NonPrintingChar_nsprefix_ + ':' if (UseCapturedNS_ and self.NonPrintingChar_nsprefix_) else '' NonPrintingChar_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='NonPrintingChar', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GraphemeGroupType', mapping_=None, nsmap_=None): - element = super(GraphemeGroupType, self).to_etree(parent_element, name_, mapping_) + def to_etree(self, parent_element=None, name_='GraphemeGroupType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(GraphemeGroupType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) for Grapheme_ in self.Grapheme: - Grapheme_.to_etree(element, name_='Grapheme', mapping_=mapping_, nsmap_=nsmap_) + Grapheme_.to_etree(element, name_='Grapheme', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for NonPrintingChar_ in self.NonPrintingChar: - NonPrintingChar_.to_etree(element, name_='NonPrintingChar', mapping_=mapping_, nsmap_=nsmap_) + NonPrintingChar_.to_etree(element, name_='NonPrintingChar', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10680,14 +11607,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): - super(GraphemeGroupType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildAttributes(self, node, attrs, already_processed): + super(GraphemeGroupType, self)._buildAttributes(node, 
attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Grapheme': obj_ = GraphemeType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -10698,14 +11625,16 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.build(child_, gds_collector_=gds_collector_) self.NonPrintingChar.append(obj_) obj_.original_tagname_ = 'NonPrintingChar' - super(GraphemeGroupType, self).buildChildren(child_, node, nodeName_, True) + super(GraphemeGroupType, self)._buildChildren(child_, node, nodeName_, True) def __hash__(self): return hash(self.id) # end class GraphemeGroupType class UserDefinedType(GeneratedsSuper): - """Container for user-defined attributes""" + """UserDefinedType -- Container for user-defined attributes + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('UserAttribute', 'UserAttributeType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '1', 'name': 'UserAttribute', 'type': 'UserAttributeType'}, None), @@ -10717,7 +11646,7 @@ def __init__(self, UserAttribute=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if UserAttribute is None: self.UserAttribute = [] else: @@ -10748,7 +11677,7 @@ def insert_UserAttribute_at(self, index, value): self.UserAttribute.insert(index, value) def replace_UserAttribute_at(self, index, value): self.UserAttribute[index] = value - def hasContent_(self): + def has__content(self): if ( self.UserAttribute ): @@ -10770,17 +11699,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UserDefinedType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UserDefinedType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UserDefinedType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UserDefinedType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UserDefinedType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UserDefinedType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UserDefinedType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UserDefinedType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -10788,15 +11717,17 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UserAttribute_ in self.UserAttribute: namespaceprefix_ = self.UserAttribute_nsprefix_ + ':' if 
(UseCapturedNS_ and self.UserAttribute_nsprefix_) else '' UserAttribute_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserAttribute', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='UserDefinedType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='UserDefinedType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for UserAttribute_ in self.UserAttribute: - UserAttribute_.to_etree(element, name_='UserAttribute', mapping_=mapping_, nsmap_=nsmap_) + UserAttribute_.to_etree(element, name_='UserAttribute', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10804,14 +11735,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserAttribute': obj_ = UserAttributeType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -10823,13 +11754,15 @@ def __hash__(self): class UserAttributeType(GeneratedsSuper): - """Structured custom data defined by name, type and value.""" + """UserAttributeType -- Structured custom data defined by name, type and value. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('name', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('description', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('value', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('name', 'string', 0, 1, {'use': 'optional', 'name': 'name'}), + MemberSpec_('description', 'string', 0, 1, {'use': 'optional', 'name': 'description'}), + MemberSpec_('type_', 'typeType3', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('value', 'string', 0, 1, {'use': 'optional', 'name': 'value'}), ] subclass = None superclass = None @@ -10838,7 +11771,7 @@ def __init__(self, name=None, description=None, type_=None, value=None, gds_coll self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.name = _cast(None, name) self.name_nsprefix_ = "pc" self.description = _cast(None, description) @@ -10878,7 +11811,20 @@ def get_value(self): return self.value def set_value(self, value): self.value = value - def hasContent_(self): + def validate_typeType3(self, value): + # Validate type typeType3, a restriction on string. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['xsd:string', 'xsd:integer', 'xsd:boolean', 'xsd:float'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on typeType3' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False + def has__content(self): if ( ): @@ -10900,14 +11846,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UserAttributeType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UserAttributeType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UserAttributeType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UserAttributeType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UserAttributeType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UserAttributeType'): if self.name is not None and 'name' not in already_processed: already_processed.add('name') outfile.write(' name=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.name), input_name='name')), )) @@ -10920,9 +11866,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.value is not None and 'value' not in already_processed: already_processed.add('value') 
outfile.write(' value=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.value), input_name='value')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UserAttributeType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UserAttributeType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='UserAttributeType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='UserAttributeType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -10937,6 +11883,8 @@ def to_etree(self, parent_element=None, name_='UserAttributeType', mapping_=None element.set('value', self.gds_format_string(self.value)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10944,12 +11892,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('name', node) if value is not None and 'name' not in already_processed: already_processed.add('name') @@ -10962,11 +11910,12 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'type' not in already_processed: already_processed.add('type') self.type_ = value + self.validate_typeType3(self.type_) # validate type typeType3 value = find_attr_value_('value', node) if value is not None and 'value' not in already_processed: already_processed.add('value') self.value = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -10974,17 +11923,21 @@ def __hash__(self): class TableCellRoleType(GeneratedsSuper): - """Cell position in table starting with row 0Cell position in table - starting with column 0Number of rows the cell spans (optional; default - is 1)Number of columns the cell spans (optional; default is 1) - Is the cell a column or row header?""" + """rowIndex -- Cell position in table starting with row 0 + columnIndex -- Cell position in table starting with column 0 + rowSpan -- Number of rows the cell spans (optional; default is 1) + colSpan -- Number of columns the cell spans (optional; default is 1) + header -- + Is the cell a column or row header? 
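Like validate_indexType2 and validate_charTypeType earlier, the new validate_typeType3 above never raises: during build() it records findings on the shared gds_collector_, returning False early only when the raw value has the wrong base type. Note that the trailing result = False is assigned but never returned (this mirrors upstream generateDS output), so callers should inspect the collector rather than the return value. A sketch, assuming the GdsCollector_ helper class that generateDS emits into the same module:

    # Sketch: collect validation messages while building from a parsed node.
    from lxml import etree

    xml = (b'<UserAttribute '
           b'xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" '
           b'name="foo" type="xsd:date" value="bar"/>')
    collector = GdsCollector_()  # assumed: the collector class generated alongside these types
    attr = UserAttributeType.factory()
    attr.build(etree.fromstring(xml), gds_collector_=collector)
    for message in collector.get_messages():
        print(message)  # reports that 'xsd:date' violates the typeType3 enumeration
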
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('rowIndex', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('columnIndex', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('rowSpan', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('colSpan', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('header', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('rowIndex', 'int', 0, 0, {'use': 'required', 'name': 'rowIndex'}), + MemberSpec_('columnIndex', 'int', 0, 0, {'use': 'required', 'name': 'columnIndex'}), + MemberSpec_('rowSpan', 'int', 0, 1, {'use': 'optional', 'name': 'rowSpan'}), + MemberSpec_('colSpan', 'int', 0, 1, {'use': 'optional', 'name': 'colSpan'}), + MemberSpec_('header', 'boolean', 0, 1, {'use': 'optional', 'name': 'header'}), ] subclass = None superclass = None @@ -10993,7 +11946,7 @@ def __init__(self, rowIndex=None, columnIndex=None, rowSpan=None, colSpan=None, self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.rowIndex = _cast(int, rowIndex) self.rowIndex_nsprefix_ = "pc" self.columnIndex = _cast(int, columnIndex) @@ -11039,7 +11992,7 @@ def get_header(self): return self.header def set_header(self, header): self.header = header - def hasContent_(self): + def has__content(self): if ( ): @@ -11061,14 +12014,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableCellRoleType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableCellRoleType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TableCellRoleType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TableCellRoleType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TableCellRoleType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TableCellRoleType'): if self.rowIndex is not None and 'rowIndex' not in already_processed: already_processed.add('rowIndex') outfile.write(' rowIndex="%s"' % self.gds_format_integer(self.rowIndex, input_name='rowIndex')) @@ -11084,9 +12037,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.header is not None and 'header' not in already_processed: already_processed.add('header') outfile.write(' header="%s"' % self.gds_format_boolean(self.header, input_name='header')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TableCellRoleType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TableCellRoleType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='TableCellRoleType', mapping_=None, 
nsmap_=None): + def to_etree(self, parent_element=None, name_='TableCellRoleType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -11103,6 +12056,8 @@ def to_etree(self, parent_element=None, name_='TableCellRoleType', mapping_=None element.set('header', self.gds_format_boolean(self.header)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11110,12 +12065,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('rowIndex', node) if value is not None and 'rowIndex' not in already_processed: already_processed.add('rowIndex') @@ -11141,7 +12096,7 @@ def buildAttributes(self, node, attrs, already_processed): self.header = False else: raise_parse_error(node, 'Bad boolean attribute') - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -11149,6 +12104,11 @@ def __hash__(self): class RolesType(GeneratedsSuper): + """TableCellRole -- + Data for a region that takes on the role + of a table cell within a parent table region. 
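The restored docstring above spells out what a TableCellRole carries: the position (rowIndex/columnIndex), the optional spans, and the header flag of a region that acts as a table cell inside a parent table region. For orientation, a construction sketch using the generated constructors and setters; the region variable and its set_Roles call stand in for any region object and are not shown in this patch:

    # Sketch: attach a table-cell role to an existing region object.
    cell_role = TableCellRoleType(rowIndex=0, columnIndex=0,
                                  rowSpan=1, colSpan=2, header=True)
    region.set_Roles(RolesType(TableCellRole=cell_role))  # 'region' assumed, e.g. a TextRegionType
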
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('TableCellRole', 'TableCellRoleType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'TableCellRole', 'type': 'TableCellRoleType'}, None), @@ -11160,7 +12120,7 @@ def __init__(self, TableCellRole=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.TableCellRole = TableCellRole self.TableCellRole_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -11182,7 +12142,7 @@ def get_TableCellRole(self): return self.TableCellRole def set_TableCellRole(self, TableCellRole): self.TableCellRole = TableCellRole - def hasContent_(self): + def has__content(self): if ( self.TableCellRole is not None ): @@ -11204,17 +12164,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RolesType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RolesType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RolesType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RolesType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RolesType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RolesType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RolesType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RolesType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -11222,16 +12182,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.TableCellRole is not None: namespaceprefix_ = self.TableCellRole_nsprefix_ + ':' if (UseCapturedNS_ and self.TableCellRole_nsprefix_) else '' self.TableCellRole.export(outfile, level, namespaceprefix_, namespacedef_='', name_='TableCellRole', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='RolesType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RolesType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) if self.TableCellRole is not None: TableCellRole_ = self.TableCellRole - TableCellRole_.to_etree(element, name_='TableCellRole', mapping_=mapping_, nsmap_=nsmap_) + TableCellRole_.to_etree(element, name_='TableCellRole', 
mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11239,14 +12201,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'TableCellRole': obj_ = TableCellRoleType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -11258,14 +12220,19 @@ def __hash__(self): class CustomRegionType(RegionType): - """Regions containing content that is not covered + """CustomRegionType -- + Regions containing content that is not covered by the default types (text, graphic, image, line drawing, chart, table, separator, maths, map, music, chem, advert, noise, unknown). - Information on the type of content represented by this region""" + + * type -- + Information on the type of content represented by this region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('type_', 'string', 0, 1, {'use': 'optional', 'name': 'type_'}), ] subclass = None superclass = RegionType @@ -11274,8 +12241,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(CustomRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("CustomRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.type_ = _cast(None, type_) self.type__nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -11297,14 +12264,14 @@ def get_type(self): return self.type_ def set_type(self, type_): self.type_ = type_ - def hasContent_(self): + def has__content(self): if ( - super(CustomRegionType, self).hasContent_() + super(CustomRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='CustomRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='CustomRegionType', pretty_print=True):
         imported_ns_def_ = GenerateDSNamespaceDefs_.get('CustomRegionType')
         if imported_ns_def_ is not None:
             namespacedef_ = imported_ns_def_
@@ -11319,27 +12286,29 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='C
         showIndent(outfile, level, pretty_print)
         outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', ))
         already_processed = set()
-        self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CustomRegionType')
-        if self.hasContent_():
+        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CustomRegionType')
+        if self.has__content():
             outfile.write('>%s' % (eol_, ))
-            self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='CustomRegionType', pretty_print=pretty_print)
+            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='CustomRegionType', pretty_print=pretty_print)
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CustomRegionType'):
-        super(CustomRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CustomRegionType')
+    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CustomRegionType'):
+        super(CustomRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CustomRegionType')
         if self.type_ is not None and 'type_' not in already_processed:
             already_processed.add('type_')
             outfile.write(' type=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.type_), input_name='type')), ))
-    def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='CustomRegionType', fromsubclass_=False, pretty_print=True):
-        super(CustomRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
-    def to_etree(self, parent_element=None, name_='CustomRegionType', mapping_=None, nsmap_=None):
-        element = super(CustomRegionType, self).to_etree(parent_element, name_, mapping_)
+    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='CustomRegionType', fromsubclass_=False, pretty_print=True):
+        super(CustomRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
+    def to_etree(self, parent_element=None, name_='CustomRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
+        element = super(CustomRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
         if self.type_ is not None:
             element.set('type', self.gds_format_string(self.type_))
         if mapping_ is not None:
             mapping_[id(self)] = element
+        if reverse_mapping_ is not None:
+            reverse_mapping_[element] = self
         return element
     def build(self, node, gds_collector_=None):
         self.gds_collector_ = gds_collector_
@@ -11347,19 +12316,19 @@ def build(self, node, gds_collector_=None):
             self.gds_elementtree_node_ = node
         already_processed = set()
         self.ns_prefix_ = node.prefix
-        self.buildAttributes(node, node.attrib, already_processed)
+        self._buildAttributes(node, node.attrib, already_processed)
         for child in node:
             nodeName_ = Tag_pattern_.match(child.tag).groups()[-1]
-            self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
+            self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
         return self
-    def buildAttributes(self, node, attrs, already_processed):
+    def _buildAttributes(self, node, attrs, already_processed):
         value = find_attr_value_('type', node)
         if value is not None and 'type' not in already_processed:
             already_processed.add('type')
             self.type_ = value
-        super(CustomRegionType, self).buildAttributes(node, attrs, already_processed)
-    def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
-        super(CustomRegionType, self).buildChildren(child_, node, nodeName_, True)
+        super(CustomRegionType, self)._buildAttributes(node, attrs, already_processed)
+    def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
+        super(CustomRegionType, self)._buildChildren(child_, node, nodeName_, True)
         pass
     def __hash__(self):
         return hash(self.id)
@@ -11367,7 +12336,10 @@ def __hash__(self):
 
 
 class UnknownRegionType(RegionType):
-    """To be used if the region type cannot be ascertained."""
+    """UnknownRegionType --
+    To be used if the region type cannot be ascertained.
+
+    """
     __hash__ = GeneratedsSuper.__hash__
     member_data_items_ = [
     ]
@@ -11378,8 +12350,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter
         self.gds_elementtree_node_ = None
         self.original_tagname_ = None
         self.parent_object_ = kwargs_.get('parent_object_')
-        self.ns_prefix_ = None
-        super(UnknownRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
+        self.ns_prefix_ = "pc"
+        super(globals().get("UnknownRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
     def factory(*args_, **kwargs_):
         if CurrentSubclassModule_ is not None:
             subclass = getSubclassFromModule_(
@@ -11395,14 +12367,14 @@ def get_ns_prefix_(self):
         return self.ns_prefix_
     def set_ns_prefix_(self, ns_prefix):
         self.ns_prefix_ = ns_prefix
-    def hasContent_(self):
+    def has__content(self):
         if (
-            super(UnknownRegionType, self).hasContent_()
+            super(UnknownRegionType, self).has__content()
         ):
             return True
         else:
             return False
-    def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='UnknownRegionType', pretty_print=True):
+    def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnknownRegionType', pretty_print=True):
         imported_ns_def_ = GenerateDSNamespaceDefs_.get('UnknownRegionType')
         if imported_ns_def_ is not None:
             namespacedef_ = imported_ns_def_
@@ -11417,22 +12389,24 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='U
         showIndent(outfile, level, pretty_print)
         outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', ))
         already_processed = set()
-        self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnknownRegionType')
-        if self.hasContent_():
+        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnknownRegionType')
+        if self.has__content():
             outfile.write('>%s' % (eol_, ))
-            self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnknownRegionType', pretty_print=pretty_print)
+            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnknownRegionType', pretty_print=pretty_print)
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnknownRegionType'):
-        super(UnknownRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnknownRegionType')
-    def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='UnknownRegionType', fromsubclass_=False, pretty_print=True):
-        super(UnknownRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
-    def to_etree(self, parent_element=None, name_='UnknownRegionType', mapping_=None, nsmap_=None):
-        element = super(UnknownRegionType, self).to_etree(parent_element, name_, mapping_)
+    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnknownRegionType'):
+        super(UnknownRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnknownRegionType')
+    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnknownRegionType', fromsubclass_=False, pretty_print=True):
+        super(UnknownRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
+    def to_etree(self, parent_element=None, name_='UnknownRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
+        element = super(UnknownRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
         if mapping_ is not None:
             mapping_[id(self)] = element
+        if reverse_mapping_ is not None:
+            reverse_mapping_[element] = self
         return element
     def build(self, node, gds_collector_=None):
         self.gds_collector_ = gds_collector_
@@ -11440,15 +12414,15 @@ def build(self, node, gds_collector_=None):
             self.gds_elementtree_node_ = node
         already_processed = set()
         self.ns_prefix_ = node.prefix
-        self.buildAttributes(node, node.attrib, already_processed)
+        self._buildAttributes(node, node.attrib, already_processed)
         for child in node:
             nodeName_ = Tag_pattern_.match(child.tag).groups()[-1]
-            self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
+            self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
         return self
-    def buildAttributes(self, node, attrs, already_processed):
-        super(UnknownRegionType, self).buildAttributes(node, attrs, already_processed)
-    def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
-        super(UnknownRegionType, self).buildChildren(child_, node, nodeName_, True)
+    def _buildAttributes(self, node, attrs, already_processed):
+        super(UnknownRegionType, self)._buildAttributes(node, attrs, already_processed)
+    def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
+        super(UnknownRegionType, self)._buildChildren(child_, node, nodeName_, True)
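# The hunks above rename the generateDS hook methods on every generated
# class: exportAttributes/exportChildren/buildAttributes/buildChildren
# become _exportAttributes/_exportChildren/_buildAttributes/_buildChildren,
# and hasContent_() becomes has__content(). Downstream code that called the
# old public spellings has to migrate accordingly. A minimal sketch
# (assuming the generated classes stay re-exported from ocrd_models.ocrd_page):
from ocrd_models.ocrd_page import UnknownRegionType

region = UnknownRegionType(id='r0')
# pre-regeneration spelling: region.hasContent_()
if not region.has__content():
    print('region %s has no child content' % region.id)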
         pass
     def __hash__(self):
         return hash(self.id)
@@ -11456,9 +12430,12 @@ def __hash__(self):
 
 
 class NoiseRegionType(RegionType):
-    """Noise regions are regions where no real data lies, only
+    """NoiseRegionType --
+    Noise regions are regions where no real data lies, only
     false data created by artifacts on the document or
-    scanner noise."""
+    scanner noise.
+
+    """
     __hash__ = GeneratedsSuper.__hash__
     member_data_items_ = [
     ]
     subclass = None
     superclass = RegionType
@@ -11469,8 +12446,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter
         self.gds_elementtree_node_ = None
         self.original_tagname_ = None
         self.parent_object_ = kwargs_.get('parent_object_')
-        self.ns_prefix_ = None
-        super(NoiseRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
+        self.ns_prefix_ = "pc"
+        super(globals().get("NoiseRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
     def factory(*args_, **kwargs_):
         if CurrentSubclassModule_ is not None:
             subclass = getSubclassFromModule_(
@@ -11486,14 +12463,14 @@ def get_ns_prefix_(self):
         return self.ns_prefix_
     def set_ns_prefix_(self, ns_prefix):
         self.ns_prefix_ = ns_prefix
-    def hasContent_(self):
+    def has__content(self):
         if (
-            super(NoiseRegionType, self).hasContent_()
+            super(NoiseRegionType, self).has__content()
         ):
             return True
         else:
             return False
-    def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='NoiseRegionType', pretty_print=True):
+    def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='NoiseRegionType', pretty_print=True):
         imported_ns_def_ = GenerateDSNamespaceDefs_.get('NoiseRegionType')
         if imported_ns_def_ is not None:
             namespacedef_ = imported_ns_def_
@@ -11508,22 +12485,24 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='N
         showIndent(outfile, level, pretty_print)
         outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', ))
         already_processed = set()
-        self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NoiseRegionType')
-        if self.hasContent_():
+        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NoiseRegionType')
+        if self.has__content():
             outfile.write('>%s' % (eol_, ))
-            self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='NoiseRegionType', pretty_print=pretty_print)
+            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='NoiseRegionType', pretty_print=pretty_print)
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='NoiseRegionType'):
-        super(NoiseRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NoiseRegionType')
-    def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='NoiseRegionType', fromsubclass_=False, pretty_print=True):
-        super(NoiseRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
-    def to_etree(self, parent_element=None, name_='NoiseRegionType', mapping_=None, nsmap_=None):
-        element = super(NoiseRegionType, self).to_etree(parent_element, name_, mapping_)
+    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='NoiseRegionType'):
+        super(NoiseRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NoiseRegionType')
+    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='NoiseRegionType', fromsubclass_=False, pretty_print=True):
+        super(NoiseRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
+    def to_etree(self, parent_element=None, name_='NoiseRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
+        element = super(NoiseRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
         if mapping_ is not None:
             mapping_[id(self)] = element
+        if reverse_mapping_ is not None:
+            reverse_mapping_[element] = self
         return element
     def build(self, node, gds_collector_=None):
         self.gds_collector_ = gds_collector_
@@ -11531,15 +12510,15 @@ def build(self, node, gds_collector_=None):
             self.gds_elementtree_node_ = node
         already_processed = set()
         self.ns_prefix_ = node.prefix
-        self.buildAttributes(node, node.attrib, already_processed)
+        self._buildAttributes(node, node.attrib, already_processed)
         for child in node:
             nodeName_ = Tag_pattern_.match(child.tag).groups()[-1]
-            self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
+            self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
         return self
-    def buildAttributes(self, node, attrs, already_processed):
-        super(NoiseRegionType, self).buildAttributes(node, attrs, already_processed)
-    def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
-        super(NoiseRegionType, self).buildChildren(child_, node, nodeName_, True)
+    def _buildAttributes(self, node, attrs, already_processed):
+        super(NoiseRegionType, self)._buildAttributes(node, attrs, already_processed)
+    def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
+        super(NoiseRegionType, self)._buildChildren(child_, node, nodeName_, True)
         pass
     def __hash__(self):
         return hash(self.id)
@@ -11547,17 +12526,24 @@ def __hash__(self):
 
 
 class AdvertRegionType(RegionType):
-    """Regions containing advertisements.
-    The angle the rectangle encapsulating a region
-    has to be rotated in clockwise direction
-    in order to correct the present skew
-    (negative values indicate anti-clockwise rotation).
-    Range: -179.999,180
-    The background colour of the region"""
+    """AdvertRegionType --
+    Regions containing advertisements.
+
+    * orientation --
+      The angle the rectangle encapsulating a region
+      has to be rotated in clockwise direction
+      in order to correct the present skew
+      (negative values indicate anti-clockwise rotation).
+      Range: -179.999,180
+
+    * bgColour --
+      The background colour of the region
+
+    """
     __hash__ = GeneratedsSuper.__hash__
     member_data_items_ = [
-        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}),
-        MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}),
+        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
+        MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}),
     ]
     subclass = None
     superclass = RegionType
@@ -11566,8 +12552,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter
         self.gds_elementtree_node_ = None
         self.original_tagname_ = None
         self.parent_object_ = kwargs_.get('parent_object_')
-        self.ns_prefix_ = None
-        super(AdvertRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
+        self.ns_prefix_ = "pc"
+        super(globals().get("AdvertRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
         self.orientation = _cast(float, orientation)
         self.orientation_nsprefix_ = "pc"
         self.bgColour = _cast(None, bgColour)
@@ -11608,14 +12594,14 @@ def validate_ColourSimpleType(self, value):
                 lineno = self.gds_get_node_lineno_()
                 self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} )
                 result = False
-    def hasContent_(self):
+    def has__content(self):
         if (
-            super(AdvertRegionType, self).hasContent_()
+            super(AdvertRegionType, self).has__content()
         ):
             return True
         else:
             return False
-    def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='AdvertRegionType', pretty_print=True):
+    def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AdvertRegionType', pretty_print=True):
         imported_ns_def_ = GenerateDSNamespaceDefs_.get('AdvertRegionType')
         if imported_ns_def_ is not None:
             namespacedef_ = imported_ns_def_
@@ -11630,32 +12616,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='A
         showIndent(outfile, level, pretty_print)
         outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', ))
         already_processed = set()
-        self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType')
-        if self.hasContent_():
+        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType')
+        if self.has__content():
             outfile.write('>%s' % (eol_, ))
-            self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AdvertRegionType', pretty_print=pretty_print)
+            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AdvertRegionType', pretty_print=pretty_print)
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level, already_processed,
namespaceprefix_='', name_='AdvertRegionType'): - super(AdvertRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='AdvertRegionType'): + super(AdvertRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) if self.bgColour is not None and 'bgColour' not in already_processed: already_processed.add('bgColour') outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='AdvertRegionType', fromsubclass_=False, pretty_print=True): - super(AdvertRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='AdvertRegionType', mapping_=None, nsmap_=None): - element = super(AdvertRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AdvertRegionType', fromsubclass_=False, pretty_print=True): + super(AdvertRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='AdvertRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(AdvertRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.bgColour is not None: element.set('bgColour', self.gds_format_string(self.bgColour)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11663,12 +12651,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -11679,9 +12667,9 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('bgColour') self.bgColour = value self.validate_ColourSimpleType(self.bgColour) # validate type ColourSimpleType - super(AdvertRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(AdvertRegionType, 
self).buildChildren(child_, node, nodeName_, True) + super(AdvertRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(AdvertRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -11700,17 +12688,24 @@ def set_orientation(self, orientation): class MusicRegionType(RegionType): - """Regions containing musical notations. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The background colour of the region""" + """MusicRegionType -- + Regions containing musical notations. + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * bgColour -- + The background colour of the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), ] subclass = None superclass = RegionType @@ -11719,8 +12714,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(MusicRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("MusicRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.bgColour = _cast(None, bgColour) @@ -11761,14 +12756,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(MusicRegionType, self).hasContent_() + super(MusicRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MusicRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MusicRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('MusicRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -11783,32 
+12778,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='M
         showIndent(outfile, level, pretty_print)
         outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', ))
         already_processed = set()
-        self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType')
-        if self.hasContent_():
+        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType')
+        if self.has__content():
             outfile.write('>%s' % (eol_, ))
-            self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MusicRegionType', pretty_print=pretty_print)
+            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MusicRegionType', pretty_print=pretty_print)
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MusicRegionType'):
-        super(MusicRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType')
+    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MusicRegionType'):
+        super(MusicRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType')
         if self.orientation is not None and 'orientation' not in already_processed:
             already_processed.add('orientation')
             outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation'))
         if self.bgColour is not None and 'bgColour' not in already_processed:
             already_processed.add('bgColour')
             outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), ))
-    def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MusicRegionType', fromsubclass_=False, pretty_print=True):
-        super(MusicRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
-    def to_etree(self, parent_element=None, name_='MusicRegionType', mapping_=None, nsmap_=None):
-        element = super(MusicRegionType, self).to_etree(parent_element, name_, mapping_)
+    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MusicRegionType', fromsubclass_=False, pretty_print=True):
+        super(MusicRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
+    def to_etree(self, parent_element=None, name_='MusicRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
+        element = super(MusicRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
         if self.orientation is not None:
             element.set('orientation', self.gds_format_float(self.orientation))
         if self.bgColour is not None:
             element.set('bgColour', self.gds_format_string(self.bgColour))
         if mapping_ is not None:
             mapping_[id(self)] = element
+        if reverse_mapping_ is not None:
+            reverse_mapping_[element] = self
         return element
     def build(self, node, gds_collector_=None):
         self.gds_collector_ = gds_collector_
@@ -11816,12 +12813,12 @@ def build(self, node, gds_collector_=None):
             self.gds_elementtree_node_ = node
         already_processed = set()
         self.ns_prefix_ = node.prefix
-        self.buildAttributes(node, node.attrib, already_processed)
+        self._buildAttributes(node, node.attrib, already_processed)
         for child in node:
             nodeName_ = Tag_pattern_.match(child.tag).groups()[-1]
-            self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
+            self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
         return self
-    def buildAttributes(self, node, attrs, already_processed):
+    def _buildAttributes(self, node, attrs, already_processed):
         value = find_attr_value_('orientation', node)
         if value is not None and 'orientation' not in already_processed:
             already_processed.add('orientation')
@@ -11832,9 +12829,9 @@ def buildAttributes(self, node, attrs, already_processed):
             already_processed.add('bgColour')
             self.bgColour = value
             self.validate_ColourSimpleType(self.bgColour)    # validate type ColourSimpleType
-        super(MusicRegionType, self).buildAttributes(node, attrs, already_processed)
-    def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
-        super(MusicRegionType, self).buildChildren(child_, node, nodeName_, True)
+        super(MusicRegionType, self)._buildAttributes(node, attrs, already_processed)
+    def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
+        super(MusicRegionType, self)._buildChildren(child_, node, nodeName_, True)
         pass
     def __hash__(self):
         return hash(self.id)
@@ -11853,16 +12850,21 @@ def set_orientation(self, orientation):
 
 
 class MapRegionType(RegionType):
-    """Regions containing maps.
-    The angle the rectangle encapsulating a
-    region has to be rotated in clockwise
-    direction in order to correct the present
-    skew (negative values indicate
-    anti-clockwise rotation). Range:
-    -179.999,180"""
+    """MapRegionType --
+    Regions containing maps.
+
+    * orientation --
+      The angle the rectangle encapsulating a
+      region has to be rotated in clockwise
+      direction in order to correct the present
+      skew (negative values indicate
+      anti-clockwise rotation). Range:
+      -179.999,180
+
+    """
     __hash__ = GeneratedsSuper.__hash__
     member_data_items_ = [
-        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}),
+        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
    ]
     subclass = None
     superclass = RegionType
@@ -11871,8 +12873,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter
         self.gds_elementtree_node_ = None
         self.original_tagname_ = None
         self.parent_object_ = kwargs_.get('parent_object_')
-        self.ns_prefix_ = None
-        super(MapRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
+        self.ns_prefix_ = "pc"
+        super(globals().get("MapRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
         self.orientation = _cast(float, orientation)
         self.orientation_nsprefix_ = "pc"
     def factory(*args_, **kwargs_):
@@ -11894,14 +12896,14 @@ def get_orientation(self):
         return self.orientation
     def set_orientation(self, orientation):
         self.orientation = orientation
-    def hasContent_(self):
+    def has__content(self):
         if (
-            super(MapRegionType, self).hasContent_()
+            super(MapRegionType, self).has__content()
         ):
             return True
         else:
             return False
-    def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MapRegionType', pretty_print=True):
+    def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MapRegionType', pretty_print=True):
         imported_ns_def_ = GenerateDSNamespaceDefs_.get('MapRegionType')
         if imported_ns_def_ is not None:
             namespacedef_ = imported_ns_def_
@@ -11916,27 +12918,29 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='M
         showIndent(outfile, level, pretty_print)
         outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', ))
         already_processed = set()
-        self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MapRegionType')
-        if self.hasContent_():
+        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MapRegionType')
+        if self.has__content():
             outfile.write('>%s' % (eol_, ))
-            self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MapRegionType', pretty_print=pretty_print)
+            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MapRegionType', pretty_print=pretty_print)
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MapRegionType'):
-        super(MapRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MapRegionType')
+    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MapRegionType'):
+        super(MapRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MapRegionType')
         if self.orientation is not
None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MapRegionType', fromsubclass_=False, pretty_print=True): - super(MapRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='MapRegionType', mapping_=None, nsmap_=None): - element = super(MapRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MapRegionType', fromsubclass_=False, pretty_print=True): + super(MapRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='MapRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(MapRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11944,20 +12948,20 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') value = self.gds_parse_float(value, node, 'orientation') self.orientation = value - super(MapRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(MapRegionType, self).buildChildren(child_, node, nodeName_, True) + super(MapRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(MapRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -11976,18 +12980,25 @@ def set_orientation(self, orientation): class ChemRegionType(RegionType): - """Regions containing chemical formulas. - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). Range: - -179.999,180 - The background colour of the region""" + """ChemRegionType -- + Regions containing chemical formulas. 
+ + * orientation -- + The angle the rectangle encapsulating a + region has to be rotated in clockwise + direction in order to correct the present + skew (negative values indicate + anti-clockwise rotation). Range: + -179.999,180 + + * bgColour -- + The background colour of the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), ] subclass = None superclass = RegionType @@ -11996,8 +13007,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(ChemRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("ChemRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.bgColour = _cast(None, bgColour) @@ -12038,14 +13049,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(ChemRegionType, self).hasContent_() + super(ChemRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ChemRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChemRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('ChemRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -12060,32 +13071,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='C showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChemRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChemRegionType', pretty_print=pretty_print) 
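# Note the new default namespacedef_ in the export() signatures throughout
# this patch: a region exported stand-alone now carries the PAGE 2019-07-15
# namespace declaration. A usage sketch (assuming ocrd_models.ocrd_page
# re-exports the generated ChemRegionType; the attribute order in the
# output comment is illustrative only):
import io
from ocrd_models.ocrd_page import ChemRegionType

region = ChemRegionType(id='r1', orientation=0.5)
buf = io.StringIO()
region.export(buf, 0, namespaceprefix_='pc:')
# roughly: <pc:ChemRegionType xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" id="r1" orientation="0.5"/>
print(buf.getvalue())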
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChemRegionType'):
-        super(ChemRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType')
+    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChemRegionType'):
+        super(ChemRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType')
         if self.orientation is not None and 'orientation' not in already_processed:
             already_processed.add('orientation')
             outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation'))
         if self.bgColour is not None and 'bgColour' not in already_processed:
             already_processed.add('bgColour')
             outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), ))
-    def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ChemRegionType', fromsubclass_=False, pretty_print=True):
-        super(ChemRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
-    def to_etree(self, parent_element=None, name_='ChemRegionType', mapping_=None, nsmap_=None):
-        element = super(ChemRegionType, self).to_etree(parent_element, name_, mapping_)
+    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChemRegionType', fromsubclass_=False, pretty_print=True):
+        super(ChemRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
+    def to_etree(self, parent_element=None, name_='ChemRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
+        element = super(ChemRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
         if self.orientation is not None:
             element.set('orientation', self.gds_format_float(self.orientation))
         if self.bgColour is not None:
             element.set('bgColour', self.gds_format_string(self.bgColour))
         if mapping_ is not None:
             mapping_[id(self)] = element
+        if reverse_mapping_ is not None:
+            reverse_mapping_[element] = self
         return element
     def build(self, node, gds_collector_=None):
         self.gds_collector_ = gds_collector_
@@ -12093,12 +13106,12 @@ def build(self, node, gds_collector_=None):
             self.gds_elementtree_node_ = node
         already_processed = set()
         self.ns_prefix_ = node.prefix
-        self.buildAttributes(node, node.attrib, already_processed)
+        self._buildAttributes(node, node.attrib, already_processed)
         for child in node:
             nodeName_ = Tag_pattern_.match(child.tag).groups()[-1]
-            self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
+            self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
         return self
-    def buildAttributes(self, node, attrs, already_processed):
+    def _buildAttributes(self, node, attrs, already_processed):
         value = find_attr_value_('orientation', node)
         if value is not None and 'orientation' not in already_processed:
             already_processed.add('orientation')
@@ -12109,9 +13122,9 @@ def buildAttributes(self, node, attrs, already_processed):
             already_processed.add('bgColour')
             self.bgColour = value
             self.validate_ColourSimpleType(self.bgColour)    # validate type ColourSimpleType
-        super(ChemRegionType,
self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(ChemRegionType, self).buildChildren(child_, node, nodeName_, True) + super(ChemRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(ChemRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -12130,18 +13143,25 @@ def set_orientation(self, orientation): class MathsRegionType(RegionType): - """Regions containing equations and mathematical symbols + """MathsRegionType -- + Regions containing equations and mathematical symbols should be marked as maths regions. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The background colour of the region""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * bgColour -- + The background colour of the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), ] subclass = None superclass = RegionType @@ -12150,8 +13170,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(MathsRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("MathsRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.bgColour = _cast(None, bgColour) @@ -12192,14 +13212,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(MathsRegionType, self).hasContent_() + super(MathsRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MathsRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MathsRegionType', pretty_print=True):
         imported_ns_def_ = GenerateDSNamespaceDefs_.get('MathsRegionType')
         if imported_ns_def_ is not None:
             namespacedef_ = imported_ns_def_
@@ -12214,32 +13234,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='M
         showIndent(outfile, level, pretty_print)
         outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', ))
         already_processed = set()
-        self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType')
-        if self.hasContent_():
+        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType')
+        if self.has__content():
             outfile.write('>%s' % (eol_, ))
-            self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MathsRegionType', pretty_print=pretty_print)
+            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MathsRegionType', pretty_print=pretty_print)
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MathsRegionType'):
-        super(MathsRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType')
+    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MathsRegionType'):
+        super(MathsRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType')
         if self.orientation is not None and 'orientation' not in already_processed:
             already_processed.add('orientation')
             outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation'))
         if self.bgColour is not None and 'bgColour' not in already_processed:
             already_processed.add('bgColour')
             outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), ))
-    def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MathsRegionType', fromsubclass_=False, pretty_print=True):
-        super(MathsRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
-    def to_etree(self, parent_element=None, name_='MathsRegionType', mapping_=None, nsmap_=None):
-        element = super(MathsRegionType, self).to_etree(parent_element, name_, mapping_)
+    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MathsRegionType', fromsubclass_=False, pretty_print=True):
+        super(MathsRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)
+    def to_etree(self, parent_element=None, name_='MathsRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
+        element = super(MathsRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
         if self.orientation is not None:
             element.set('orientation', self.gds_format_float(self.orientation))
         if self.bgColour is not None:
             element.set('bgColour', self.gds_format_string(self.bgColour))
         if mapping_ is not None:
             mapping_[id(self)] = element
+        if reverse_mapping_ is not None:
+            reverse_mapping_[element] = self
         return element
     def build(self, node,
gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12247,12 +13269,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12263,9 +13285,9 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('bgColour') self.bgColour = value self.validate_ColourSimpleType(self.bgColour) # validate type ColourSimpleType - super(MathsRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(MathsRegionType, self).buildChildren(child_, node, nodeName_, True) + super(MathsRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(MathsRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -12284,19 +13306,26 @@ def set_orientation(self, orientation): class SeparatorRegionType(RegionType): - """Separators are lines that lie between columns and + """SeparatorRegionType -- + Separators are lines that lie between columns and paragraphs and can be used to logically separate different articles from each other. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The colour of the separator""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). 
+      Range: -179.999,180
+
+    * colour --
+      The colour of the separator
+
+    """
     __hash__ = GeneratedsSuper.__hash__
     member_data_items_ = [
-        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}),
-        MemberSpec_('colour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}),
+        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
+        MemberSpec_('colour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'colour'}),
     ]
     subclass = None
     superclass = RegionType
@@ -12305,8 +13334,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter
         self.gds_elementtree_node_ = None
         self.original_tagname_ = None
         self.parent_object_ = kwargs_.get('parent_object_')
-        self.ns_prefix_ = None
-        super(SeparatorRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
+        self.ns_prefix_ = "pc"
+        super(globals().get("SeparatorRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
         self.orientation = _cast(float, orientation)
         self.orientation_nsprefix_ = "pc"
         self.colour = _cast(None, colour)
@@ -12347,14 +13376,14 @@ def validate_ColourSimpleType(self, value):
                 lineno = self.gds_get_node_lineno_()
                 self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} )
                 result = False
-    def hasContent_(self):
+    def has__content(self):
         if (
-            super(SeparatorRegionType, self).hasContent_()
+            super(SeparatorRegionType, self).has__content()
         ):
             return True
         else:
             return False
-    def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='SeparatorRegionType', pretty_print=True):
+    def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='SeparatorRegionType', pretty_print=True):
         imported_ns_def_ = GenerateDSNamespaceDefs_.get('SeparatorRegionType')
         if imported_ns_def_ is not None:
             namespacedef_ = imported_ns_def_
@@ -12369,32 +13398,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='S
         showIndent(outfile, level, pretty_print)
         outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', ))
         already_processed = set()
-        self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType')
-        if self.hasContent_():
+        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType')
+        if self.has__content():
             outfile.write('>%s' % (eol_, ))
-            self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='SeparatorRegionType', pretty_print=pretty_print)
+            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='SeparatorRegionType', pretty_print=pretty_print)
             showIndent(outfile, level, pretty_print)
             outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
         else:
             outfile.write('/>%s' % (eol_, ))
-    def exportAttributes(self, outfile, level,
already_processed, namespaceprefix_='', name_='SeparatorRegionType'): - super(SeparatorRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='SeparatorRegionType'): + super(SeparatorRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) if self.colour is not None and 'colour' not in already_processed: already_processed.add('colour') outfile.write(' colour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.colour), input_name='colour')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='SeparatorRegionType', fromsubclass_=False, pretty_print=True): - super(SeparatorRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='SeparatorRegionType', mapping_=None, nsmap_=None): - element = super(SeparatorRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='SeparatorRegionType', fromsubclass_=False, pretty_print=True): + super(SeparatorRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='SeparatorRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(SeparatorRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.colour is not None: element.set('colour', self.gds_format_string(self.colour)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12402,12 +13433,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12418,9 +13449,9 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('colour') self.colour = value self.validate_ColourSimpleType(self.colour) # validate type ColourSimpleType - super(SeparatorRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, 
gds_collector_=None): - super(SeparatorRegionType, self).buildChildren(child_, node, nodeName_, True) + super(SeparatorRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(SeparatorRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -12439,26 +13470,39 @@ def set_orientation(self, orientation): class ChartRegionType(RegionType): - """Regions containing charts or graphs of any type, should + """ChartRegionType -- + Regions containing charts or graphs of any type, should be marked as chart regions. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The type of chart in the region - An approximation of the number of colours - used in the region - The background colour of the region - Specifies whether the region also contains - text""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * type -- + The type of chart in the region + + * numColours -- + An approximation of the number of colours + used in the region + + * bgColour -- + The background colour of the region + + * embText -- + Specifies whether the region also contains + text + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:ChartTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('numColours', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('type_', 'pc:ChartTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('numColours', 'int', 0, 1, {'use': 'optional', 'name': 'numColours'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), ] subclass = None superclass = RegionType @@ -12467,8 +13511,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(ChartRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("ChartRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -12540,14 
+13584,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(ChartRegionType, self).hasContent_() + super(ChartRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ChartRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChartRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('ChartRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -12562,16 +13606,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='C showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChartRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChartRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChartRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChartRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChartRegionType'): - super(ChartRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChartRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChartRegionType'): + super(ChartRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChartRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -12587,10 +13631,10 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ChartRegionType', fromsubclass_=False, pretty_print=True): - super(ChartRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='ChartRegionType', mapping_=None, nsmap_=None): - element = super(ChartRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChartRegionType', fromsubclass_=False, pretty_print=True): + 
super(ChartRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='ChartRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(ChartRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.type_ is not None: @@ -12603,6 +13647,8 @@ def to_etree(self, parent_element=None, name_='ChartRegionType', mapping_=None, element.set('embText', self.gds_format_boolean(self.embText)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12610,12 +13656,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12644,9 +13690,9 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(ChartRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(ChartRegionType, self).buildChildren(child_, node, nodeName_, True) + super(ChartRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(ChartRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -12665,30 +13711,49 @@ def set_orientation(self, orientation): class TableRegionType(RegionType): - """Tabular data in any form is represented with a table + """TableRegionType -- + Tabular data in any form is represented with a table region. Rows and columns may or may not have separator lines; these lines are not separator regions. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The number of rows present in the table - The number of columns present in the table - The colour of the lines used in the region - The background colour of the region - Specifies the presence of line separators - Specifies whether the region also contains - text""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). 
+ Range: -179.999,180 + + * rows -- + The number of rows present in the table + + * columns -- + The number of columns present in the table + + * lineColour -- + The colour of the lines used in the region + + * bgColour -- + The background colour of the region + + * lineSeparators -- + Specifies the presence of line separators + + * embText -- + Specifies whether the region also contains + text + + * Grid -- Table grid (visible or virtual grid lines) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('rows', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('columns', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('lineColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('lineSeparators', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('rows', 'int', 0, 1, {'use': 'optional', 'name': 'rows'}), + MemberSpec_('columns', 'int', 0, 1, {'use': 'optional', 'name': 'columns'}), + MemberSpec_('lineColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'lineColour'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('lineSeparators', 'boolean', 0, 1, {'use': 'optional', 'name': 'lineSeparators'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), MemberSpec_('Grid', 'GridType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'Grid', 'type': 'GridType'}, None), ] subclass = None @@ -12698,8 +13763,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(TableRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("TableRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.rows = _cast(int, rows) @@ -12776,15 +13841,15 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.Grid is not None or - super(TableRegionType, self).hasContent_() + super(TableRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='TableRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TableRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('TableRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -12799,16 +13864,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='T showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TableRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TableRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TableRegionType'): - super(TableRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TableRegionType'): + super(TableRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -12830,8 +13895,8 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='TableRegionType', fromsubclass_=False, pretty_print=True): - super(TableRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TableRegionType', fromsubclass_=False, pretty_print=True): + super(TableRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) if pretty_print: eol_ = '\n' else: @@ -12839,8 +13904,8 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', if self.Grid is not None: namespaceprefix_ = self.Grid_nsprefix_ + ':' if (UseCapturedNS_ and self.Grid_nsprefix_) else '' self.Grid.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Grid', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='TableRegionType', mapping_=None, nsmap_=None): - element = super(TableRegionType, self).to_etree(parent_element, name_, mapping_) + def to_etree(self, parent_element=None, name_='TableRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = 
super(TableRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.rows is not None: @@ -12857,9 +13922,11 @@ def to_etree(self, parent_element=None, name_='TableRegionType', mapping_=None, element.set('embText', self.gds_format_boolean(self.embText)) if self.Grid is not None: Grid_ = self.Grid - Grid_.to_etree(element, name_='Grid', mapping_=mapping_, nsmap_=nsmap_) + Grid_.to_etree(element, name_='Grid', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12867,12 +13934,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12914,14 +13981,14 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(TableRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(TableRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Grid': obj_ = GridType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) self.Grid = obj_ obj_.original_tagname_ = 'Grid' - super(TableRegionType, self).buildChildren(child_, node, nodeName_, True) + super(TableRegionType, self)._buildChildren(child_, node, nodeName_, True) def __hash__(self): return hash(self.id) def set_orientation(self, orientation): @@ -12939,24 +14006,35 @@ def set_orientation(self, orientation): class GraphicRegionType(RegionType): - """Regions containing simple graphics, such as a company + """GraphicRegionType -- + Regions containing simple graphics, such as a company logo, should be marked as graphic regions. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The type of graphic in the region - An approximation of the number of colours - used in the region - Specifies whether the region also contains - text.""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). 
+ Range: -179.999,180 + + * type -- + The type of graphic in the region + + * numColours -- + An approximation of the number of colours + used in the region + + * embText -- + Specifies whether the region also contains + text. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GraphicsTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('numColours', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('type_', 'pc:GraphicsTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('numColours', 'int', 0, 1, {'use': 'optional', 'name': 'numColours'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), ] subclass = None superclass = RegionType @@ -12965,8 +14043,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(GraphicRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("GraphicRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -13019,14 +14097,14 @@ def validate_GraphicsTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GraphicsTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(GraphicRegionType, self).hasContent_() + super(GraphicRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='GraphicRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphicRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('GraphicRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -13041,16 +14119,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='G showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphicRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphicRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, 
)) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphicRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphicRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphicRegionType'): - super(GraphicRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphicRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphicRegionType'): + super(GraphicRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphicRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -13063,10 +14141,10 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='GraphicRegionType', fromsubclass_=False, pretty_print=True): - super(GraphicRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GraphicRegionType', mapping_=None, nsmap_=None): - element = super(GraphicRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphicRegionType', fromsubclass_=False, pretty_print=True): + super(GraphicRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='GraphicRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(GraphicRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.type_ is not None: @@ -13077,6 +14155,8 @@ def to_etree(self, parent_element=None, name_='GraphicRegionType', mapping_=None element.set('embText', self.gds_format_boolean(self.embText)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -13084,12 +14164,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, 
already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -13113,9 +14193,9 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(GraphicRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(GraphicRegionType, self).buildChildren(child_, node, nodeName_, True) + super(GraphicRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(GraphicRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -13134,23 +14214,34 @@ def set_orientation(self, orientation): class LineDrawingRegionType(RegionType): - """A line drawing is a single colour illustration without + """LineDrawingRegionType -- + A line drawing is a single colour illustration without solid areas. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The pen (foreground) colour of the region - The background colour of the region - Specifies whether the region also contains - text""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * penColour -- + The pen (foreground) colour of the region + + * bgColour -- + The background colour of the region + + * embText -- + Specifies whether the region also contains + text + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('penColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('penColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'penColour'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), ] subclass = None superclass = RegionType @@ -13159,8 +14250,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(LineDrawingRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("LineDrawingRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, 
GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.penColour = _cast(None, penColour) @@ -13213,14 +14304,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(LineDrawingRegionType, self).hasContent_() + super(LineDrawingRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='LineDrawingRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LineDrawingRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('LineDrawingRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -13235,16 +14326,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='L showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LineDrawingRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LineDrawingRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LineDrawingRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LineDrawingRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LineDrawingRegionType'): - super(LineDrawingRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LineDrawingRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LineDrawingRegionType'): + super(LineDrawingRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LineDrawingRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -13257,10 +14348,10 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='LineDrawingRegionType', fromsubclass_=False, pretty_print=True): - super(LineDrawingRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def 
to_etree(self, parent_element=None, name_='LineDrawingRegionType', mapping_=None, nsmap_=None): - element = super(LineDrawingRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LineDrawingRegionType', fromsubclass_=False, pretty_print=True): + super(LineDrawingRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='LineDrawingRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(LineDrawingRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.penColour is not None: @@ -13271,6 +14362,8 @@ def to_etree(self, parent_element=None, name_='LineDrawingRegionType', mapping_= element.set('embText', self.gds_format_boolean(self.embText)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -13278,12 +14371,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -13308,9 +14401,9 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(LineDrawingRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(LineDrawingRegionType, self).buildChildren(child_, node, nodeName_, True) + super(LineDrawingRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(LineDrawingRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -13329,23 +14422,34 @@ def set_orientation(self, orientation): class ImageRegionType(RegionType): - """An image is considered to be more intricate and complex + """ImageRegionType -- + An image is considered to be more intricate and complex than a graphic. These can be photos or drawings. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). 
- Range: -179.999,180 - The colour bit depth required for the region - The background colour of the region - Specifies whether the region also contains - text""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * colourDepth -- + The colour bit depth required for the region + + * bgColour -- + The background colour of the region + + * embText -- + Specifies whether the region also contains + text + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('colourDepth', 'pc:ColourDepthSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('colourDepth', 'pc:ColourDepthSimpleType', 0, 1, {'use': 'optional', 'name': 'colourDepth'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), ] subclass = None superclass = RegionType @@ -13354,8 +14458,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(ImageRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("ImageRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.colourDepth = _cast(None, colourDepth) @@ -13421,14 +14525,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(ImageRegionType, self).hasContent_() + super(ImageRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ImageRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ImageRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('ImageRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -13443,16 +14547,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='I showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % 
(namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ImageRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ImageRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ImageRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ImageRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ImageRegionType'): - super(ImageRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ImageRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ImageRegionType'): + super(ImageRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ImageRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -13465,10 +14569,10 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ImageRegionType', fromsubclass_=False, pretty_print=True): - super(ImageRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='ImageRegionType', mapping_=None, nsmap_=None): - element = super(ImageRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ImageRegionType', fromsubclass_=False, pretty_print=True): + super(ImageRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='ImageRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(ImageRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.colourDepth is not None: @@ -13479,6 +14583,8 @@ def to_etree(self, parent_element=None, name_='ImageRegionType', mapping_=None, element.set('embText', self.gds_format_boolean(self.embText)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -13486,12 +14592,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, 
already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -13516,9 +14622,9 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(ImageRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(ImageRegionType, self).buildChildren(child_, node, nodeName_, True) + super(ImageRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(ImageRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -13537,52 +14643,92 @@ def set_orientation(self, orientation): class TextRegionType(RegionType): - """Pure text is represented as a text region. This includes + """TextRegionType -- + Pure text is represented as a text region. This includes drop capitals, but practically ornate text may be considered as a graphic. - The angle the rectangle encapsulating the region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - (The rotated image can be further referenced - via “AlternativeImage”.) - Range: -179.999,180 - The nature of the text in the region - The degree of space in points between the lines of - text (line spacing) - The direction in which text within lines - should be read (order of words and characters), - in addition to “textLineOrder”. - The order of text lines within the block, - in addition to “readingDirection”. - The angle the baseline of text within the region - has to be rotated (relative to the rectangle - encapsulating the region) in clockwise direction - in order to correct the present skew, - in addition to “orientation” - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - Defines whether a region of text is indented or not - Text align - The primary language used in the region - The secondary language used in the region - The primary script used in the region - The secondary script used in the region""" + + * orientation -- + The angle the rectangle encapsulating the region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + (The rotated image can be further referenced + via + “ + AlternativeImage + ” + .) + Range: -179.999,180 + + * type -- + The nature of the text in the region + + * leading -- + The degree of space in points between the lines of + text (line spacing) + + * readingDirection -- + The direction in which text within lines + should be read (order of words and characters), + in addition to + “ + textLineOrder + ” + . + + * textLineOrder -- + The order of text lines within the block, + in addition to + “ + readingDirection + ” + . 
+ + * readingOrientation -- + The angle the baseline of text within the region + has to be rotated (relative to the rectangle + encapsulating the region) in clockwise direction + in order to correct the present skew, + in addition to + “ + orientation + ” + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * indented -- + Defines whether a region of text is indented or not + + * align -- Text align + * primaryLanguage -- + The primary language used in the region + + * secondaryLanguage -- + The secondary language used in the region + + * primaryScript -- + The primary script used in the region + + * secondaryScript -- + The secondary script used in the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:TextTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('leading', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('textLineOrder', 'pc:TextLineOrderSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('readingOrientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('indented', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('align', 'pc:AlignSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('type_', 'pc:TextTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('leading', 'int', 0, 1, {'use': 'optional', 'name': 'leading'}), + MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional', 'name': 'readingDirection'}), + MemberSpec_('textLineOrder', 'pc:TextLineOrderSimpleType', 0, 1, {'use': 'optional', 'name': 'textLineOrder'}), + MemberSpec_('readingOrientation', 'float', 0, 1, {'use': 'optional', 'name': 'readingOrientation'}), + MemberSpec_('indented', 'boolean', 0, 1, {'use': 'optional', 'name': 'indented'}), + MemberSpec_('align', 'pc:AlignSimpleType', 0, 1, {'use': 'optional', 'name': 'align'}), + MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryLanguage'}), + MemberSpec_('secondaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryLanguage'}), + MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryScript'}), + MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryScript'}), + MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional', 'name': 'production'}), MemberSpec_('TextLine', 'TextLineType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'TextLine', 'type': 'TextLineType'}, None), MemberSpec_('TextEquiv', 'TextEquivType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'TextEquiv', 'type': 'TextEquivType'}, None), MemberSpec_('TextStyle', 'TextStyleType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'TextStyle', 'type': 'TextStyleType'}, None), @@ -13594,8 
+14740,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(TextRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("TextRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -13816,17 +14962,17 @@ def validate_ProductionSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ProductionSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.TextLine or self.TextEquiv or self.TextStyle is not None or - super(TextRegionType, self).hasContent_() + super(TextRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='TextRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('TextRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -13841,16 +14987,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='T showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextRegionType'): - super(TextRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextRegionType'): + super(TextRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextRegionType') if self.orientation is not None and 'orientation' not in already_processed: 
already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -13890,8 +15036,8 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.production is not None and 'production' not in already_processed: already_processed.add('production') outfile.write(' production=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.production), input_name='production')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='TextRegionType', fromsubclass_=False, pretty_print=True): - super(TextRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextRegionType', fromsubclass_=False, pretty_print=True): + super(TextRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) if pretty_print: eol_ = '\n' else: @@ -13905,8 +15051,8 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', if self.TextStyle is not None: namespaceprefix_ = self.TextStyle_nsprefix_ + ':' if (UseCapturedNS_ and self.TextStyle_nsprefix_) else '' self.TextStyle.export(outfile, level, namespaceprefix_, namespacedef_='', name_='TextStyle', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='TextRegionType', mapping_=None, nsmap_=None): - element = super(TextRegionType, self).to_etree(parent_element, name_, mapping_) + def to_etree(self, parent_element=None, name_='TextRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(TextRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.type_ is not None: @@ -13934,14 +15080,16 @@ def to_etree(self, parent_element=None, name_='TextRegionType', mapping_=None, n if self.production is not None: element.set('production', self.gds_format_string(self.production)) for TextLine_ in self.TextLine: - TextLine_.to_etree(element, name_='TextLine', mapping_=mapping_, nsmap_=nsmap_) + TextLine_.to_etree(element, name_='TextLine', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -13949,12 +15097,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = 
Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -14023,8 +15171,8 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('production') self.production = value self.validate_ProductionSimpleType(self.production) # validate type ProductionSimpleType - super(TextRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(TextRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'TextLine': obj_ = TextLineType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -14040,7 +15188,7 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.build(child_, gds_collector_=gds_collector_) self.TextStyle = obj_ obj_.original_tagname_ = 'TextStyle' - super(TextRegionType, self).buildChildren(child_, node, nodeName_, True) + super(TextRegionType, self)._buildChildren(child_, node, nodeName_, True) def __hash__(self): return hash(self.id) def set_orientation(self, orientation): @@ -14057,6 +15205,11 @@ def set_orientation(self, orientation): # end class TextRegionType +# +# End data representation classes. +# + + GDSClassesMapping = { 'PcGts': PcGtsType, } @@ -14074,9 +15227,10 @@ def usage(): def get_root_tag(node): tag = Tag_pattern_.match(node.tag).groups()[-1] - rootClass = GDSClassesMapping.get(tag) + prefix_tag = TagNamePrefix + tag + rootClass = GDSClassesMapping.get(prefix_tag) if rootClass is None: - rootClass = globals().get(tag) + rootClass = globals().get(prefix_tag) return tag, rootClass @@ -14130,7 +15284,7 @@ def parse(inFileName, silence=False, print_warnings=True): def parseEtree(inFileName, silence=False, print_warnings=True, - mapping=None, nsmap=None): + mapping=None, reverse_mapping=None, nsmap=None): parser = None doc = parsexml_(inFileName, parser) gds_collector = GdsCollector_() @@ -14141,12 +15295,15 @@ def parseEtree(inFileName, silence=False, print_warnings=True, rootClass = PcGts rootObj = rootClass.factory() rootObj.build(rootNode, gds_collector_=gds_collector) - # Enable Python to collect the space used by the DOM. if mapping is None: mapping = {} + if reverse_mapping is None: + reverse_mapping = {} rootElement = rootObj.to_etree( - None, name_=rootTag, mapping_=mapping, nsmap_=nsmap) - reverse_mapping = rootObj.gds_reverse_node_mapping(mapping) + None, name_=rootTag, mapping_=mapping, + reverse_mapping_=reverse_mapping, nsmap_=nsmap) + reverse_node_mapping = rootObj.gds_reverse_node_mapping(mapping) + # Enable Python to collect the space used by the DOM. 
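+    # (the reverse_mapping dict passed in above receives the exact
+    #  element-to-object mapping during to_etree; gds_reverse_node_mapping
+    #  is only computed for the legacy return value below)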
if not SaveElementTreeNode: doc = None rootNode = None @@ -14163,7 +15320,7 @@ def parseEtree(inFileName, silence=False, print_warnings=True, len(gds_collector.get_messages()), )) gds_collector.write_messages(sys.stderr) sys.stderr.write(separator) - return rootObj, rootElement, mapping, reverse_mapping + return rootObj, rootElement, mapping, reverse_node_mapping def parseString(inString, silence=False, print_warnings=True): @@ -14247,6 +15404,224 @@ def main(): RenameMappings_ = { } +# +# Mapping of namespaces to types defined in them +# and the file in which each is defined. +# simpleTypes are marked "ST" and complexTypes "CT". +NamespaceToDefMappings_ = {'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15': [('ColourSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ReadingDirectionSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('TextLineOrderSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('TextTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('PageTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ConfSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('LanguageSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ScriptSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ColourDepthSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('GraphicsTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ChartTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('PointsType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ProductionSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('AlignSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('GroupTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('TextDataTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('UnderlineStyleSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('PcGtsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MetadataType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MetadataItemType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LabelsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LabelType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('PageType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TextRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('CoordsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TextLineType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('WordType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GlyphType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TextEquivType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('ImageRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LineDrawingRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphicRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TableRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GridType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GridPointsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('ChartRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('SeparatorRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MathsRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('ChemRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MapRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MusicRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('AdvertRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('NoiseRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + 
('UnknownRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('CustomRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('PrintSpaceType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('ReadingOrderType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RegionRefIndexedType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('OrderedGroupIndexedType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('UnorderedGroupIndexedType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RegionRefType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('OrderedGroupType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('UnorderedGroupType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('BorderType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LayersType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LayerType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('BaselineType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RelationsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RelationType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TextStyleType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('AlternativeImageType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphemesType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphemeBaseType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphemeType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('NonPrintingCharType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphemeGroupType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('UserDefinedType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('UserAttributeType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TableCellRoleType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RolesType', + 'src/ocrd_validators/page.xsd', + 'CT')]} + __all__ = [ "AdvertRegionType", "AlternativeImageType", diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py index fe22dd89a..9cec0b30a 100644 --- a/src/ocrd_page_user_methods.py +++ b/src/ocrd_page_user_methods.py @@ -104,7 +104,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'clear_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'extend_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'sort_AllIndexed'), - _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'exportChildren', 'exportChildren_GroupType'), + _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'exportChildren', '_exportChildren_GroupType'), _add_method(r'^(UnorderedGroupType|UnorderedGroupIndexedType)$', 'get_UnorderedGroupChildren'), _add_method(r'^(PcGtsType|PageType)$', 'id'), _add_method(r'^(PageType)$', 'get_AllRegions'), diff --git a/src/ocrd_page_user_methods/exportChildren_GroupType.py b/src/ocrd_page_user_methods/_exportChildren_GroupType.py similarity index 65% rename from src/ocrd_page_user_methods/exportChildren_GroupType.py rename to src/ocrd_page_user_methods/_exportChildren_GroupType.py index 924ee6314..9dea9c422 100644 --- a/src/ocrd_page_user_methods/exportChildren_GroupType.py +++ b/src/ocrd_page_user_methods/_exportChildren_GroupType.py @@ -1,9 +1,14 @@ # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring -def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, 
pretty_print=True): # pylint: disable=unused-argument,too-many-arguments - namespaceprefix_ = 'pc:' +def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments + if pretty_print: + eol_ = '\n' + else: + eol_ = '' if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] def replaceWithRRI(group): @@ -21,4 +26,4 @@ def replaceWithRRI(group): else: cleaned.append(entry) for entry in cleaned: - entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + entry.export(outfile, level, entry.ns_prefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) From 3e214cab44cadc307062bc8d4c501863f7f64408 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 01:03:43 +0200 Subject: [PATCH 204/249] tests: make sure ocrd_utils.config gets reset whenever changing it globally --- tests/processor/test_processor.py | 12 ++++++------ tests/test_decorators.py | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 1497927a0..33a954881 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -17,7 +17,7 @@ ) from tests.test_mets_server import fixture_start_mets_server -from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging +from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging, config from ocrd.resolver import Resolver from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile from ocrd.processor.helpers import get_processor @@ -39,6 +39,10 @@ def setUp(self): self.workspace = self.resolver.workspace_from_url('mets.xml') self.addCleanup(stack.pop_all().close) + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_incomplete_processor(self): proc = IncompleteProcessor(None) proc.input_file_grp = 'OCR-D-IMG' @@ -230,7 +234,6 @@ def test_run_output_legacy(self): def test_run_output_missing(self): ws = self.workspace - from ocrd_utils import config # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MISSING_OUTPUT = 'SKIP' @@ -240,6 +243,7 @@ def test_run_output_missing(self): # only half succeed assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) // 2 config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutputFailures, workspace=ws, input_file_grp="OCR-D-IMG", @@ -262,7 +266,6 @@ def test_run_output_missing(self): def test_run_output_timeout(self): ws = self.workspace - from ocrd_utils import config # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MISSING_OUTPUT = 'ABORT' @@ -286,7 +289,6 
@@ def test_run_output_overwrite(self): ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') - from ocrd_utils import config config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') config.OCRD_EXISTING_OUTPUT = 'ABORT' @@ -422,7 +424,6 @@ def ocrd_tool(self): def test_run_output_metsserver(start_mets_server): mets_server_url, ws = start_mets_server - from ocrd_utils import config # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 run_processor(DummyProcessorWithOutputSleep, workspace=ws, @@ -451,7 +452,6 @@ def test_run_output_metsserver(start_mets_server): @pytest.mark.timeout(4) def test_run_output_parallel(start_mets_server): mets_server_url, ws = start_mets_server - from ocrd_utils import config # do not raise for single-page timeout config.OCRD_PROCESSING_PAGE_TIMEOUT = -1 # do not raise for number of failures: diff --git a/tests/test_decorators.py b/tests/test_decorators.py index df8d6422b..c36577020 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -15,7 +15,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor, ) # pylint: disable=protected-access -from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files +from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files, config @click.command() @ocrd_cli_options @@ -45,6 +45,10 @@ def setUp(self): super().setUp() disableLogging() + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_minimal(self): exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) print(out, err) From c1b92c8fbe30b86804f7012467e2fdde10e16b80 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 23:52:08 +0200 Subject: [PATCH 205/249] ocrd_modelfactory.page_from_file: set OcrdPage.revmap to actual reverse mapping --- src/ocrd_modelfactory/__init__.py | 8 +++++++- src/ocrd_models/ocrd_page.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index 828949fe9..3f7d675f8 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -101,5 +101,11 @@ def page_from_file(input_file, **kwargs) -> OcrdPage: if input_file.mimetype.startswith('image'): return page_from_image(input_file) if input_file.mimetype == MIMETYPE_PAGE: - return OcrdPage(*parseEtree(input_file.local_filename, silence=True)) + revmap = {} + # the old/default gds.reverse_node_mapping is useless + # since 2.39.4, we can actually get the exact reverse mapping for perfect round-trip + # but awkwardly, we have to pass the dict in for that + page = OcrdPage(*parseEtree(input_file.local_filename, reverse_mapping=revmap, silence=True)) + page.revmap = revmap + return page raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype) diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 6a8ea4586..b491d402a 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -2,7 +2,7 @@ API to PAGE-XML, generated with generateDS from XML schema. 
""" from io import StringIO -from typing import Dict, Union +from typing import Dict, Union, Any from lxml import etree as ET from elementpath import XPath2Parser, XPathContext @@ -191,7 +191,7 @@ def __init__( pcgts : PcGtsType, etree : ET._Element, mapping : Dict[str, ET._Element], - revmap : Dict[ET._Element, str], + revmap : Dict[ET._Element, Any], ): self._pcgts = pcgts self.etree = etree From c549c42aef193589262863ba0566bfb311a40080 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:37 +0200 Subject: [PATCH 206/249] OcrdPage: add PageType.get_ReadingOrderGroups() --- src/ocrd_page_user_methods.py | 1 + .../get_ReadingOrderGroups.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 src/ocrd_page_user_methods/get_ReadingOrderGroups.py diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py index 8a2332e6e..fe22dd89a 100644 --- a/src/ocrd_page_user_methods.py +++ b/src/ocrd_page_user_methods.py @@ -116,6 +116,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(PageType)$', 'set_Border'), _add_method(r'^(CoordsType)$', 'set_points'), _add_method(r'^(PageType)$', 'get_AllTextLines'), + _add_method(r'^(PageType)$', 'get_ReadingOrderGroups'), # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType: _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'), ) diff --git a/src/ocrd_page_user_methods/get_ReadingOrderGroups.py b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py new file mode 100644 index 000000000..e7d6c02b7 --- /dev/null +++ b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py @@ -0,0 +1,33 @@ +def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. 
segment `@id`) to its referring group object (i.e. one of
+
+    \b
+    - :py:class:`.RegionRefType`
+    - :py:class:`.RegionRefIndexedType`
+    - :py:class:`.OrderedGroupType`
+    - :py:class:`.OrderedGroupIndexedType`
+    - :py:class:`.UnorderedGroupType`
+    - :py:class:`.UnorderedGroupIndexedType`
+    """
+    def get_groupdict(group):
+        regionrefs = list()
+        if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
+            regionrefs = (group.get_RegionRefIndexed() +
+                          group.get_OrderedGroupIndexed() +
+                          group.get_UnorderedGroupIndexed())
+        if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
+            regionrefs = (group.get_RegionRef() +
+                          group.get_OrderedGroup() +
+                          group.get_UnorderedGroup())
+        refdict = {}
+        for elem in regionrefs:
+            refdict[elem.get_regionRef()] = elem
+            if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
+                refdict = {**refdict, **get_groupdict(elem)}
+        return refdict
+    ro = self.get_ReadingOrder()
+    if ro is None:
+        return {}
+    return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup())

From 9c36854513dd799b9d91d4691a2435ad91b7ad44 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Sun, 15 Sep 2024 23:53:29 +0200
Subject: [PATCH 207/249] ocrd_page.to_xml: also allow non-root nodes

---
 src/ocrd_models/ocrd_page.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py
index b491d402a..046606100 100644
--- a/src/ocrd_models/ocrd_page.py
+++ b/src/ocrd_models/ocrd_page.py
@@ -223,11 +223,15 @@ def to_xml(el, skip_declaration=False) -> str:
     # XXX remove potential empty ReadingOrder
     if hasattr(el, 'prune_ReadingOrder'):
         el.prune_ReadingOrder()
+    if hasattr(el, 'original_tagname_'):
+        name = el.original_tagname_ or 'PcGts'
+    else:
+        name = 'PcGts'
     sio = StringIO()
     el.export(
         outfile=sio,
         level=0,
-        name_='PcGts',
+        name_=name,
         namespaceprefix_='pc:',
         namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
             NAMESPACES['page'],

From 53b880f0cdf5166bd6101de95213504791082858 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Sat, 7 Sep 2024 14:25:58 +0200
Subject: [PATCH 208/249] update OcrdPage from generateds

---
 src/ocrd_models/ocrd_page_generateds.py | 55 ++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py
index 6fef4c863..f2b7c0551 100644
--- a/src/ocrd_models/ocrd_page_generateds.py
+++ b/src/ocrd_models/ocrd_page_generateds.py
@@ -2,30 +2,28 @@
 # -*- coding: utf-8 -*-
 #
-# Generated Wed Nov 3 12:30:32 2021 by generateDS.py version 2.35.20.
-# Python 3.6.9 (default, Jan 26 2021, 15:33:00) [GCC 8.4.0]
+# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20.
+# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0] # # Command line options: # ('-f', '') # ('--root-element', 'PcGts') -# ('-o', 'ocrd_models/ocrd_models/ocrd_page_generateds.py') +# ('-o', 'src/ocrd_models/ocrd_page_generateds.py') # ('--silence', '') # ('--export', 'write etree') # ('--disable-generatedssuper-lookup', '') -# ('--user-methods', 'ocrd_models/ocrd_page_user_methods.py') +# ('--user-methods', 'src/ocrd_page_user_methods.py') # # Command line arguments: -# ocrd_validators/ocrd_validators/page.xsd +# src/ocrd_validators/page.xsd # # Command line: -# /home/kba/monorepo/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" ocrd_validators/ocrd_validators/page.xsd +# /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd # # Current working directory (os.getcwd()): # core # -# type: ignore - from itertools import zip_longest import os import sys @@ -223,7 +221,7 @@ def gds_validate_integer_list( try: int(value) except (TypeError, ValueError): - raise_parse_error(node, 'Requires sequence of integer values') + raise_parse_error(node, 'Requires sequence of integer valuess') return values def gds_format_float(self, input_data, input_name=''): return ('%.15f' % input_data).rstrip('0') @@ -1230,9 +1228,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True): """ Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document. @@ -3116,9 +3115,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring def _region_class(self, x): # pylint: disable=unused-argument return x.__class__.__name__.replace('RegionType', '') @@ -3314,6 +3314,39 @@ def get_AllTextLines(self, region_order='document', respect_textline_order=True) ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines)) return ret + def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. 
segment `@id`) to its referring group object (i.e. one of
+
+        \b
+        - :py:class:`.RegionRefType`
+        - :py:class:`.RegionRefIndexedType`
+        - :py:class:`.OrderedGroupType`
+        - :py:class:`.OrderedGroupIndexedType`
+        - :py:class:`.UnorderedGroupType`
+        - :py:class:`.UnorderedGroupIndexedType`
+        """
+        def get_groupdict(group):
+            regionrefs = list()
+            if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
+                regionrefs = (group.get_RegionRefIndexed() +
+                              group.get_OrderedGroupIndexed() +
+                              group.get_UnorderedGroupIndexed())
+            if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
+                regionrefs = (group.get_RegionRef() +
+                              group.get_OrderedGroup() +
+                              group.get_UnorderedGroup())
+            refdict = {}
+            for elem in regionrefs:
+                refdict[elem.get_regionRef()] = elem
+                if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
+                    refdict = {**refdict, **get_groupdict(elem)}
+            return refdict
+        ro = self.get_ReadingOrder()
+        if ro is None:
+            return {}
+        return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup())
     def set_orientation(self, orientation):
         """
         Set deskewing angle to given `orientation` number.

From c70748153f62f64a27a70561a1a8232d34028a59 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Sun, 15 Sep 2024 23:54:01 +0200
Subject: [PATCH 209/249] ocrd-filter: simplify further

---
 .../processor/builtin/filter_processor.py | 38 ++++++++++++-------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py
index 10b5572c3..c81517b0e 100644
--- a/src/ocrd/processor/builtin/filter_processor.py
+++ b/src/ocrd/processor/builtin/filter_processor.py
@@ -8,6 +8,27 @@ from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 from ocrd_models import OcrdPage

+_SEGTYPES = [
+    "NoiseRegion",
+    "LineDrawingRegion",
+    "AdvertRegion",
+    "ImageRegion",
+    "ChartRegion",
+    "MusicRegion",
+    "GraphicRegion",
+    "UnknownRegion",
+    "CustomRegion",
+    "SeparatorRegion",
+    "MathsRegion",
+    "TextRegion",
+    "MapRegion",
+    "ChemRegion",
+    "TableRegion",
+    "TextLine",
+    "Word",
+    "Glyph"
+]
+
 class FilterProcessor(Processor):
     def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
         """
@@ -31,18 +52,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional
         """
         pcgts = input_pcgts[0]
         result = OcrdPageResult(pcgts)
-        nodes = [node.attrib['id']
-                 for node in pcgts.xpath(self.parameter['select'])
-                 if 'id' in node.attrib]
+        nodes = pcgts.xpath(self.parameter['select'])
         # get PAGE objects from matching etree nodes
-        # FIXME: this should be easier (OcrdPage should have id lookup mechanism)
-        regions = pcgts.get_Page().get_AllRegions()
-        textregions = [region for region in regions if region.original_tagname_ == 'TextRegion']
-        lines = [line for region in textregions for line in region.get_TextLine() or []]
-        words = [word for line in lines for word in line.get_Word() or []]
-        glyphs = [glyph for word in words for glyph in word.get_Glyph() or []]
-        segments = [segment for segment in regions + lines + words + glyphs
-                    if segment.id in nodes]
+        # but allow only hierarchy segments
+        segments = [segment for segment in map(pcgts.revmap.get, nodes)
+                    if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
         if not(len(segments)):
            self.logger.info("no matches")
            return result
@@ -50,8 +64,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional
        if self.parameter['plot']:
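            # plot mode: fetch the page image and its coordinate transform up
            # front, so each matched segment can be visualized below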
page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id) for segment in segments: - node = pcgts.mapping[id(segment)] - assert isinstance(node, etree._Element) segtype = segment.original_tagname_ self.logger.info("matched %s segment %s", segtype, segment.id) parent = segment.parent_object_ From 687b06f90784fcf9eac510ecc3442ea8d8c08bb3 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 16 Sep 2024 13:29:26 +0200 Subject: [PATCH 210/249] :package: v3.0.0b5 --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ VERSION | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ec12c893..bbb91c078 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,35 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b5] - 2024-09-16 + +TODO + - update OcrdPage from generateds (HEAD -> new-processor-api, bertsky/new-processor-api) + - OcrdPage: add PageType.get_ReadingOrderGroups() + - tests: make sure ocrd_utils.config gets reset whenever changing it globally + - test processors: no need for 'force' kwarg anymore + - Processor / Workspace.add_file: always force if config.OCRD_EXISTING_OUTPUT==OVERWRITE + - lib.bash input-files: do not try to validate tasks here (impossible to get right with required parameters, and now covered by wrapped Processor.verify() already) + - lib.bash input-files: pass on --mets-server-url, --overwrite, and parameters (necessary for required params) + - lib.bash: fix errexit + - run_processor: be robust if ocrd_tool is missing steps + - Processor.verify: check output fileGrps as well (or OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP or disjoint --page-id) + - Processor.process_workspace(): do not show NotImplementedError context if fallback process() raises anything itself + - ocrd.cli.validate tasks: pass on --mets-server-url, too + - ocrd.cli.bashlib input-files: pass on --mets-server-url, too + - ocrd.cli.workspace server: add 'reload' and 'save' + - ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) + - METS Server: also export+delegate physical_pages + - processor CLI: delegate --resolve-resource, too + - ocrd.cli.ocrd-tool resolve-resource: fix (forgot to print result) + - PcGts.Page.id / make_xml_id: replace '/' with '_' + - Processor.process_page_file / OcrdPageResultImage: allow None instead of AlternativeImageType + - Processor.verify: revert 5819c816 (we still have no defaults in json loaded from v2) + - typing, extend docs + - test_processor: add test for force (overwrite) w/ METS Server + - test_mets_server: add test for force (overwrite) + - OcrdMetsServer.add_file: pass on 'force' kwarg, too + ## [3.0.0b4] - 2024-09-02 Fixed: diff --git a/VERSION b/VERSION index 9414e1270..09fb39d26 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b4 +3.0.0b5 From a43098e9ee01a15a753ace19a8eddcdff4849352 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:27:50 +0200 Subject: [PATCH 211/249] :memo: improve b5 changelog --- CHANGELOG.md | 53 ++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbb91c078..abbfd5a4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,32 +7,31 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## [3.0.0b5] - 2024-09-16 -TODO - - update OcrdPage from generateds (HEAD -> new-processor-api, bertsky/new-processor-api) - - OcrdPage: add PageType.get_ReadingOrderGroups() - - tests: make sure ocrd_utils.config gets reset whenever changing it globally - - test processors: no need for 'force' kwarg anymore - - Processor / Workspace.add_file: always force if config.OCRD_EXISTING_OUTPUT==OVERWRITE - - lib.bash input-files: do not try to validate tasks here (impossible to get right with required parameters, and now covered by wrapped Processor.verify() already) - - lib.bash input-files: pass on --mets-server-url, --overwrite, and parameters (necessary for required params) - - lib.bash: fix errexit - - run_processor: be robust if ocrd_tool is missing steps - - Processor.verify: check output fileGrps as well (or OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP or disjoint --page-id) - - Processor.process_workspace(): do not show NotImplementedError context if fallback process() raises anything itself - - ocrd.cli.validate tasks: pass on --mets-server-url, too - - ocrd.cli.bashlib input-files: pass on --mets-server-url, too - - ocrd.cli.workspace server: add 'reload' and 'save' - - ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) - - METS Server: also export+delegate physical_pages - - processor CLI: delegate --resolve-resource, too - - ocrd.cli.ocrd-tool resolve-resource: fix (forgot to print result) - - PcGts.Page.id / make_xml_id: replace '/' with '_' - - Processor.process_page_file / OcrdPageResultImage: allow None instead of AlternativeImageType - - Processor.verify: revert 5819c816 (we still have no defaults in json loaded from v2) - - typing, extend docs - - test_processor: add test for force (overwrite) w/ METS Server - - test_mets_server: add test for force (overwrite) - - OcrdMetsServer.add_file: pass on 'force' kwarg, too +Fixed: + - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` + - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url` + - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters + - `lib.bash`: fix `errexit` handling + - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result + +Changed: + - :fire: `Processor` / `Workspace.add_file`: always `force` if `OCRD_EXISTING_OUTPUT==OVERWRITE` + - :fire: `Processor.verify`: revert 3.0.0b1 enforcing cardinality checks (stay backwards compatible) + - :fire: `Processor.verify`: check output fileGrps, too + (must not exist unless `OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP` or disjoint `--page-id` range) + - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) + - `run_processor`: be robust if `ocrd_tool` is missing `steps` + - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + +Added: + - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` + - METS Server: export and delegate `physical_pages` + - processor CLI: delegate `--resolve-resource`, too + - `Processor.process_page_file` / `OcrdPageResultImage`: allow `None` besides `AlternativeImageType` ## [3.0.0b4] - 2024-09-02 @@ -2288,6 +2287,8 @@ Fixed Initial Release +[3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4 +[3.0.0b4]: 
../../compare/v3.0.0b4..v3.0.0b3 [3.0.0b3]: ../../compare/v3.0.0b3..v3.0.0b2 [3.0.0b2]: ../../compare/v3.0.0b2..v3.0.0b1 [3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 From d2cb0fb663c15c6179bbcf05477051f3d7737149 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:55:41 +0200 Subject: [PATCH 212/249] ocrd.cli.workspace: assert non-server in cmds mutating METS --- src/ocrd/cli/workspace.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ca4e8629d..05b37b6bc 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -149,7 +149,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -185,7 +186,8 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, @@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) @@ -524,6 +528,8 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). """ + assert not ctx.mets_server_url, \ + f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) @@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) 
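    Note that this command modifies the METS, so it cannot be used while a
    METS Server is serving the same workspace (see the assertion below).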
""" + assert not ctx.mets_server_url, \ + f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( @@ -762,6 +772,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: + assert not ctx.mets_server_url, \ + f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() @@ -800,6 +812,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) + assert not ctx.mets_server_url, \ + f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( From 8c566d76fce9940626e358370a31abc7ca5322e6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:35:07 +0200 Subject: [PATCH 213/249] OcrdMets.get_physical_pages: cover return_divs w/o for_fileIds for_pageIds --- src/ocrd_models/ocrd_mets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index c3fb11f60..de068567e 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -599,7 +599,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] From dd62418e55fae345d0613d4813432cb0d25ec135 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:37:03 +0200 Subject: [PATCH 214/249] ocrd.cli.workspace: use physical_pages if possible, fix default output_field --- src/ocrd/cli/workspace.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ca4e8629d..4baab8f93 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -673,19 +673,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page will be interpreted as a regular expression.) 
""" workspace = ctx.workspace() - find_kwargs = {} - if page_id_range and 'ID' in output_field: - find_kwargs['pageId'] = page_id_range - page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId}) ret = [] - - if output_field == ['ID']: - ret = [[x] for x in page_ids] - else: - for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)): + if page_id_range or list(output_field) != ['ID']: + for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)): ret.append([]) for k in output_field: ret[i].append(page_div.get(k, 'None')) + else: + for page_id in workspace.mets.physical_pages: + ret.append([page_id]) if numeric_range: start, end = map(int, numeric_range.split('..')) From 1cfa6e309ca4591f55864e13bdecc7806646262e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:44:45 +0200 Subject: [PATCH 215/249] Processor.process_page_file: avoid process_page_pcgts() if OCRD_EXISTING_OUTPUT!=OVERWRITE --- src/ocrd/processor/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 26ea532d1..28cbaf726 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -618,6 +618,12 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: # not PAGE and not an image to generate PAGE for self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + # short-cut avoiding useless computation: + raise FileExistsError( + f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set" + ) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in result.images: image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' From f678dca0e42b66d5742209ffb692103fa7f15528 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:35:07 +0200 Subject: [PATCH 216/249] OcrdMets.get_physical_pages: cover return_divs w/o for_fileIds for_pageIds --- src/ocrd_models/ocrd_mets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index c3fb11f60..de068567e 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -599,7 +599,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] From 9064db01380cfca0327320cfcfa7c0fd02e2cb21 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:37:03 +0200 Subject: [PATCH 217/249] ocrd.cli.workspace: use physical_pages if possible, fix default output_field --- src/ocrd/cli/workspace.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git 
a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 05b37b6bc..77797b303 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -683,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page will be interpreted as a regular expression.) """ workspace = ctx.workspace() - find_kwargs = {} - if page_id_range and 'ID' in output_field: - find_kwargs['pageId'] = page_id_range - page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId}) ret = [] - - if output_field == ['ID']: - ret = [[x] for x in page_ids] - else: - for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)): + if page_id_range or list(output_field) != ['ID']: + for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)): ret.append([]) for k in output_field: ret[i].append(page_div.get(k, 'None')) + else: + for page_id in workspace.mets.physical_pages: + ret.append([page_id]) if numeric_range: start, end = map(int, numeric_range.split('..')) From 9530fcd346357d23f6e914534f87436c206fa038 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:44:45 +0200 Subject: [PATCH 218/249] Processor.process_page_file: avoid process_page_pcgts() if OCRD_EXISTING_OUTPUT!=OVERWRITE --- src/ocrd/processor/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 26ea532d1..28cbaf726 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -618,6 +618,12 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: # not PAGE and not an image to generate PAGE for self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + # short-cut avoiding useless computation: + raise FileExistsError( + f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set" + ) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in result.images: image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' From 31a8474e884812eae614916fcb3e878aa443995a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Oct 2024 16:34:39 +0000 Subject: [PATCH 219/249] ocrd_utils.initLogging: also add handler to root logger (to be consistent with file config and prevent imported libraries from initing logging first), but disable propagation for ocrd loggers (to avoid duplication) --- src/ocrd_utils/logging.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 181805118..dfac74988 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -48,6 +48,7 @@ # These are the loggers we add handlers to ROOT_OCRD_LOGGERS = [ + '', 'ocrd', 'ocrd_network' ] @@ -191,7 +192,10 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) for logger_name in ROOT_OCRD_LOGGERS: - logging.getLogger(logger_name).addHandler(ocrd_handler) + logger = logging.getLogger(logger_name) + logger.addHandler(ocrd_handler) + if logger_name: + logger.propagate = 
False # avoid duplication (from root handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True @@ -210,7 +214,7 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS + ['']: + for logger_name in ROOT_OCRD_LOGGERS: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) for logger_name in LOGGING_DEFAULTS: From d7049b1bffb185723124028882ac0e5d88bfabba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 10 Oct 2024 01:03:46 +0000 Subject: [PATCH 220/249] CLI decorator: only import ocrd_network when needed --- src/ocrd/decorators/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index f52a13575..f659bf58a 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -13,7 +13,6 @@ redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator -from ocrd_network import ProcessingWorker, ProcessorServer, AgentType from ..resolver import Resolver from ..processor.base import ResourceNotFoundError, run_processor @@ -23,8 +22,6 @@ from .ocrd_cli_options import ocrd_cli_options from .mets_find_options import mets_find_options -SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] - def ocrd_cli_wrap_processor( processorClass, @@ -88,11 +85,9 @@ def ocrd_cli_wrap_processor( if list_resources: processor.list_resources() sys.exit() - if subcommand: + if subcommand or address or queue or database: # Used for checking/starting network agents for the WebAPI architecture check_and_run_network_agent(processorClass, subcommand, address, database, queue) - elif address or queue or database: - raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") # from here: single-run processing context initLogging() @@ -162,6 +157,11 @@ def goexit(): def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): """ """ + from ocrd_network import ProcessingWorker, ProcessorServer, AgentType + SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] + + if not subcommand: + raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") if subcommand not in SUBCOMMANDS: raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}") From a9d49c1df906af98f618dbf99b01b2fb9900452b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 10 Oct 2024 14:28:41 +0000 Subject: [PATCH 221/249] =?UTF-8?q?Processor=20w/=20OCRD=5FMAX=5FPARALLEL?= =?UTF-8?q?=5FPAGES:=20ThreadPoolExecutor=E2=86=92ProcessPoolExecutor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/ocrd/processor/base.py | 174 ++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 79 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 28cbaf726..8ea53246d 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,7 +23,8 @@ import io import weakref from frozendict import frozendict -from concurrent.futures import ThreadPoolExecutor, TimeoutError +from concurrent.futures import 
ProcessPoolExecutor, TimeoutError +import multiprocessing as mp from click import wrap_text from deprecated import deprecated @@ -465,11 +466,7 @@ def process_workspace(self, workspace: Workspace) -> None: self.workspace = workspace self.verify() try: - nr_succeeded = 0 - nr_skipped = 0 - nr_copied = 0 - - # set up multithreading + # set up multitasking max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES: self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers) @@ -481,80 +478,17 @@ def process_workspace(self, workspace: Workspace) -> None: if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - executor = ThreadPoolExecutor( - max_workers=max_workers or 1, - thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" - ) - self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - tasks = {} - - for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) - page_id = next(input_file.pageId - for input_file in input_file_tuple - if input_file) - self._base_logger.info(f"preparing page {page_id}") - for i, input_file in enumerate(input_file_tuple): - if input_file is None: - # file/page not found in this file grp - continue - input_files[i] = input_file - if not self.download: - continue - try: - input_files[i] = self.workspace.download_file(input_file) - except (ValueError, FileNotFoundError, HTTPError) as e: - self._base_logger.error(repr(e)) - self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") - # process page - tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - self._base_logger.debug("submitted %d processing tasks", len(tasks)) - - for task in tasks: - # wait for results, handle errors - page_id, input_files = tasks[task] - # FIXME: differentiate error cases in various ways: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise - try: - self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - task.result(timeout=max_seconds or None) - nr_succeeded += 1 - # exclude NotImplementedError, so we can try process() below - except NotImplementedError: - raise - # handle input failures separately - except FileExistsError as err: - if config.OCRD_EXISTING_OUTPUT == 'ABORT': - raise err - if config.OCRD_EXISTING_OUTPUT == 'SKIP': - continue - if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': - # too late here, must not happen - raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") - # broad coverage of output failures (including TimeoutError) - except (Exception, TimeoutError) as err: - # FIXME: add re-usable/actionable logging - if config.OCRD_MISSING_OUTPUT == 'ABORT': - self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - raise err - self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - if config.OCRD_MISSING_OUTPUT == 'SKIP': - nr_skipped += 1 - continue - if config.OCRD_MISSING_OUTPUT == 'COPY': - 
self._copy_page_file(input_files[0]) - nr_copied += 1 - else: - desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) - raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") - if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with skipped output ({nr_skipped})") - if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with fallback output ({nr_skipped})") - executor.shutdown() + with ProcessPoolExecutor( + max_workers=max_workers or 1, + # only forking method avoids pickling + mp_context=mp.get_context('fork'), + # share processor instance as global to avoid pickling + initializer=_page_worker_set_ctxt, + initargs=(self,), + ) as executor: + self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) + self._process_workspace_run(executor, max_workers, max_seconds) except NotImplementedError: # fall back to deprecated method @@ -564,6 +498,80 @@ def process_workspace(self, workspace: Workspace) -> None: # suppress the NotImplementedError context raise err from None + def _process_workspace_run(self, executor, max_workers, max_seconds): + nr_succeeded = 0 + nr_skipped = 0 + nr_copied = 0 + + tasks = {} + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + self._base_logger.info(f"preparing page {page_id}") + for i, input_file in enumerate(input_file_tuple): + if input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except (ValueError, FileNotFoundError, HTTPError) as e: + self._base_logger.error(repr(e)) + self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) + tasks[executor.submit(_page_worker, *input_files)] = (page_id, input_files) + self._base_logger.debug("submitted %d processing tasks", len(tasks)) + + for task in tasks: + # wait for results, handle errors + page_id, input_files = tasks[task] + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) + task.result(timeout=max_seconds or None) + nr_succeeded += 1 + # exclude NotImplementedError, so we can try process() below + except NotImplementedError: + raise + # handle input failures separately + except FileExistsError as err: + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + continue + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # broad coverage of output failures (including TimeoutError) + except (Exception, TimeoutError) as err: + # FIXME: add re-usable/actionable logging + if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page 
{page_id}: {str(err) or err.__class__.__name__}") + raise err + self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + if config.OCRD_MISSING_OUTPUT == 'SKIP': + nr_skipped += 1 + continue + if config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + nr_copied += 1 + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + + if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with skipped output ({nr_skipped})") + if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with fallback output ({nr_skipped})") + def _copy_page_file(self, input_file : OcrdFileType) -> None: """ Copy the given ``input_file`` of the :py:data:`workspace`, @@ -940,6 +948,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ifts.append(tuple(ifiles)) return ifts +_page_worker_processor = None +def _page_worker_set_ctxt(processor): + global _page_worker_processor + _page_worker_processor = processor + +def _page_worker(*input_files): + _page_worker_processor.process_page_file(*input_files) + def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): """Generate a string describing the full CLI of this processor including params. From 588c91df826951d29b24f1e1677cced3a55b2153 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Oct 2024 08:44:56 +0000 Subject: [PATCH 222/249] Processor.process_workspace: apply timeout on process_page_file worker itself (rather than future query) --- src/ocrd/processor/base.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8ea53246d..ce6b3e494 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -25,6 +25,8 @@ from frozendict import frozendict from concurrent.futures import ProcessPoolExecutor, TimeoutError import multiprocessing as mp +from threading import Timer +from _thread import interrupt_main from click import wrap_text from deprecated import deprecated @@ -524,7 +526,7 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") # process page #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - tasks[executor.submit(_page_worker, *input_files)] = (page_id, input_files) + tasks[executor.submit(_page_worker, max_seconds, *input_files)] = (page_id, input_files) self._base_logger.debug("submitted %d processing tasks", len(tasks)) for task in tasks: @@ -536,7 +538,12 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): # - persistent (data) error → skip / dummy / raise try: self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - task.result(timeout=max_seconds or None) + # timeout kwarg on future is useless: it only raises TimeoutError here, + # but does not stop the running process/thread, and executor offers nothing + # to that effect: + # task.result(timeout=max_seconds or None) + # so we instead apply the timeout within the worker function + task.result() nr_succeeded += 1 # exclude NotImplementedError, so we can try process() below except NotImplementedError: 
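(Editor's note, not part of the patch: since a running `concurrent.futures` task cannot be cancelled through its future, this patch raises the timeout inside the worker's own main thread via `threading.Timer` plus `_thread.interrupt_main`. A minimal standalone sketch of that mechanism, assuming an interruptible payload; `run_with_timeout` is a made-up name:)

    from threading import Timer
    from _thread import interrupt_main
    import time

    def run_with_timeout(fn, timeout, *args):
        # fire a KeyboardInterrupt in the main thread once the deadline passes
        timer = Timer(timeout, interrupt_main)
        timer.start()
        try:
            return fn(*args)
        except KeyboardInterrupt:
            raise TimeoutError()
        finally:
            timer.cancel()

    try:
        run_with_timeout(time.sleep, 1.0, 5)
    except TimeoutError:
        print("timed out as expected")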
@@ -551,7 +558,7 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): # too late here, must not happen raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") # broad coverage of output failures (including TimeoutError) - except (Exception, TimeoutError) as err: + except Exception as err: # FIXME: add re-usable/actionable logging if config.OCRD_MISSING_OUTPUT == 'ABORT': self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") @@ -953,8 +960,21 @@ def _page_worker_set_ctxt(processor): global _page_worker_processor _page_worker_processor = processor -def _page_worker(*input_files): - _page_worker_processor.process_page_file(*input_files) +def _page_worker(timeout, *input_files): + page_id = next((file.pageId for file in input_files + if hasattr(file, 'pageId')), "") + if timeout > 0: + timer = Timer(timeout, interrupt_main) + timer.start() + try: + _page_worker_processor.process_page_file(*input_files) + _page_worker_processor.logger.debug("page worker completed for page %s", page_id) + except KeyboardInterrupt: + _page_worker_processor.logger.debug("page worker timed out for page %s", page_id) + raise TimeoutError() + finally: + if timeout > 0: + timer.cancel() From d126bdce4ef81c148c1bae4718d000082f863704 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Oct 2024 08:46:21 +0000 Subject: [PATCH 223/249] Processor w/ OCRD_MAX_PARALLEL_PAGES: concurrent.futures→loky MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + src/ocrd/processor/base.py | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index e78c18661..05d4e9aa4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ httpx>=0.22.0 importlib_metadata ; python_version < '3.8' importlib_resources ; python_version < '3.10' jsonschema>=4 +loky lxml memory-profiler >= 0.58.0 # XXX explicitly do not restrict the numpy version because different diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index ce6b3e494..b6a41d6b5 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,7 +23,9 @@ import io import weakref from frozendict import frozendict -from concurrent.futures import ProcessPoolExecutor, TimeoutError +# concurrent.futures is buggy in py38, +# this is where the fixes came from: +from loky import ProcessPoolExecutor import multiprocessing as mp from threading import Timer from _thread import interrupt_main @@ -481,16 +483,19 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - with ProcessPoolExecutor( - max_workers=max_workers or 1, - # only forking method avoids pickling - mp_context=mp.get_context('fork'), - # share processor instance as global to avoid pickling - initializer=_page_worker_set_ctxt, - initargs=(self,), - ) as executor: + executor = ProcessPoolExecutor( + max_workers=max_workers or 1, + # only forking method avoids pickling + context=mp.get_context('fork'), + # share processor instance as global to avoid pickling + initializer=_page_worker_set_ctxt, + initargs=(self,), + ) + try:
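(Editor's note, illustration only: the fork-plus-initializer pattern adopted here, handing an unpicklable object to each worker through a module-level global instead of a queue, reduced to a self-contained example. `_shared`, `_init` and `_work` are made-up names; `context=`, `initializer=` and `initargs=` are used exactly as the patch itself uses loky's stdlib-compatible API.)

    import multiprocessing as mp
    from loky import ProcessPoolExecutor

    _shared = None

    def _init(obj):
        # runs once per worker; under the 'fork' start method the object is
        # inherited through copy-on-write memory rather than pickled
        global _shared
        _shared = obj

    def _work(x):
        return _shared['factor'] * x

    executor = ProcessPoolExecutor(max_workers=2,
                                   context=mp.get_context('fork'),
                                   initializer=_init, initargs=({'factor': 3},))
    print([executor.submit(_work, x).result() for x in range(4)])  # [0, 3, 6, 9]
    executor.shutdown()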
self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) self._process_workspace_run(executor, max_workers, max_seconds) + finally: + executor.shutdown(kill_workers=True) except NotImplementedError: # fall back to deprecated method From afa7f30a6bf212fece28ebc354da726a658ba121 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Oct 2024 00:23:06 +0000 Subject: [PATCH 224/249] Processor w/o OCRD_MAX_PARALLEL_PAGES: dummy instead of executor --- src/ocrd/processor/base.py | 46 ++++++++++++++++++++++++++++--- tests/processor/test_processor.py | 1 - 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index b6a41d6b5..7ff271eca 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -483,7 +483,29 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - executor = ProcessPoolExecutor( + class DummyExecutor: + """ + Mimics some of ProcessPoolExecutor but runs everything + immediately in this process. + """ + class DummyFuture: + def __init__(self, fn, *args, **kwargs): + self.fn = fn + self.args = args + self.kwargs = kwargs + def result(self): + return self.fn(*self.args, **self.kwargs) + def __init__(self, initializer=None, initargs=(), **kwargs): + initializer(*initargs) + def shutdown(self, **kwargs): + pass + def submit(self, fn, *args, **kwargs): + return DummyExecutor.DummyFuture(fn, *args, **kwargs) + if max_workers > 1: + executor_cls = ProcessPoolExecutor + else: + executor_cls = DummyExecutor + executor = executor_cls( max_workers=max_workers or 1, # only forking method avoids pickling context=mp.get_context('fork'), @@ -493,7 +515,7 @@ def process_workspace(self, workspace: Workspace) -> None: ) try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - self._process_workspace_run(executor, max_workers, max_seconds) + self._process_workspace_run(executor, max_seconds) finally: executor.shutdown(kill_workers=True) @@ -505,7 +527,7 @@ def process_workspace(self, workspace: Workspace) -> None: # suppress the NotImplementedError context raise err from None - def _process_workspace_run(self, executor, max_workers, max_seconds): + def _process_workspace_run(self, executor, max_seconds): nr_succeeded = 0 nr_skipped = 0 nr_copied = 0 @@ -961,11 +983,27 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): return ifts _page_worker_processor = None +""" +This global binding for the processor is required to avoid +squeezing the processor through a mp.Queue (which is impossible +due to unpicklable attributes like .workspace.mets._tree anyway) +when calling Processor.process_page_file as page worker processes +in Processor.process_workspace. Forking allows inheriting global +objects, and with the METS Server we do not mutate the local +processor instance anyway. +""" def _page_worker_set_ctxt(processor): + """ + Overwrites `ocrd.processor.base._page_worker_processor` instance + for sharing with subprocesses in ProcessPoolExecutor initializer. + """ global _page_worker_processor _page_worker_processor = processor - def _page_worker(timeout, *input_files): + """ + Wraps a `Processor.process_page_file` call as payload (call target) + of the ProcessPoolExecutor workers, but also enforces the given timeout. 
+ """ page_id = next((file.pageId for file in input_files if hasattr(file, 'pageId')), "") if timeout > 0: diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 33a954881..5844cb877 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -277,7 +277,6 @@ def test_run_output_timeout(self): assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 - from concurrent.futures import TimeoutError with pytest.raises(TimeoutError) as exc: run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", From 58217018d8bcd85df5dc4e3e03eb62a0d9255690 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Oct 2024 01:27:58 +0000 Subject: [PATCH 225/249] ocrd.process.profile logger: account for subprocess CPU time, too --- src/ocrd/processor/helpers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 2cbbbd97e..757f7ac04 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -2,6 +2,7 @@ Helper methods for running and documenting processors """ from time import perf_counter, process_time +from os import times from functools import lru_cache import json import inspect @@ -94,6 +95,7 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() + t0_os = times() if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']): backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel @@ -123,7 +125,13 @@ def run_processor( t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu - logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % ( + t1_os = times() + # add CPU time from child processes (page worker etc) + t1_cpu += t1_os.children_user - t0_os.children_user + t1_cpu += t1_os.children_system - t0_os.children_system + logProfile.info( + "Executing processor '%s' took %fs (wall) %fs (CPU)( " + "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']", ocrd_tool['executable'], t1_wall, t1_cpu, @@ -131,7 +139,7 @@ def run_processor( processor.output_file_grp or '', json.dumps(processor.parameter) or '', processor.page_id or '' - )) + ) workspace.mets.add_agent( name=name, _type='OTHER', From 53b1854e139f66e3061d2e4feae5411c9b8d092a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 21 Oct 2024 12:47:33 +0000 Subject: [PATCH 226/249] Processor.process_workspace: improve reporting, raise early if too many failures already (rate will be too low) --- src/ocrd/processor/base.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 7ff271eca..46b07c716 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -22,6 +22,7 @@ import tarfile import io import weakref +from collections import defaultdict from frozendict import frozendict # concurrent.futures is buggy in py38, # this is where the fixes came from: @@ -528,9 +529,10 @@ def submit(self, fn, *args, **kwargs): raise err from None def _process_workspace_run(self, executor, 
max_seconds): + # aggregate info for logging: nr_succeeded = 0 - nr_skipped = 0 - nr_copied = 0 + nr_failed = 0 + nr_errors = defaultdict(int) # count causes tasks = {} for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): @@ -572,8 +574,8 @@ def _process_workspace_run(self, executor, max_seconds): # so we instead apply the timeout within the worker function task.result() nr_succeeded += 1 - # exclude NotImplementedError, so we can try process() below except NotImplementedError: + # exclude NotImplementedError, so we can try process() below raise # handle input failures separately except FileExistsError as err: @@ -587,24 +589,35 @@ def _process_workspace_run(self, executor, max_seconds): # broad coverage of output failures (including TimeoutError) except Exception as err: # FIXME: add re-usable/actionable logging + nr_errors[err.__class__.__name__] += 1 + nr_failed += 1 if config.OCRD_MISSING_OUTPUT == 'ABORT': self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") raise err self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'SKIP': - nr_skipped += 1 + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with skipped output ({nr_failed} of {nr_failed+nr_succeeded})") continue if config.OCRD_MISSING_OUTPUT == 'COPY': + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with fallback-copied output ({nr_failed} of {nr_failed+nr_succeeded})") self._copy_page_file(input_files[0]) - nr_copied += 1 else: desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") - if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with skipped output ({nr_skipped})") - if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with fallback output ({nr_skipped})") + if nr_failed > 0: + nr_all = nr_succeeded + nr_failed + if config.OCRD_MISSING_OUTPUT == 'SKIP': + reason = "skipped" + if config.OCRD_MISSING_OUTPUT == 'COPY': + reason = "fallback-copied" + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") + self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(dict(nr_errors))) def _copy_page_file(self, input_file : OcrdFileType) -> None: """ From 4d66e3702dfdd1063307ab09c33126ddc2f930a2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 23 Oct 2024 22:12:57 +0000 Subject: [PATCH 227/249] Processor: refactor process_workspace into overridable subfuncs --- repo/spec | 2 +- src/ocrd/processor/base.py | 299 +++++++++++++++++++++++++------------ 2 files changed, 201 insertions(+), 100 deletions(-) diff --git a/repo/spec b/repo/spec index df2a07e3f..506b33936 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 +Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 
46b07c716..85a0dea21 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,7 +16,7 @@ import os from os import getcwd from pathlib import Path -from typing import Any, List, Optional, Union, get_args +from typing import Any, Dict, List, Optional, Tuple, Union, get_args import sys import inspect import tarfile @@ -26,7 +26,7 @@ from frozendict import frozendict # concurrent.futures is buggy in py38, # this is where the fixes came from: -from loky import ProcessPoolExecutor +from loky import Future, ProcessPoolExecutor import multiprocessing as mp from threading import Timer from _thread import interrupt_main @@ -111,6 +111,31 @@ def __init__(self, fileGrp, pageId, mimetype): f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) +class DummyFuture: + """ + Mimics some of `concurrent.futures.Future` but runs immediately. + """ + def __init__(self, fn, *args, **kwargs): + self.fn = fn + self.args = args + self.kwargs = kwargs + def result(self): + return self.fn(*self.args, **self.kwargs) +class DummyExecutor: + """ + Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs + everything immediately in this process. + """ + def __init__(self, initializer=None, initargs=(), **kwargs): + initializer(*initargs) + def shutdown(self, **kwargs): + pass + def submit(self, fn, *args, **kwargs) -> DummyFuture: + return DummyFuture(fn, *args, **kwargs) + +TFuture = Union[DummyFuture, Future] +TExecutor = Union[DummyExecutor, ProcessPoolExecutor] + class Processor(): """ A processor is a tool that implements the uniform OCR-D @@ -462,6 +487,9 @@ def process_workspace(self, workspace: Workspace) -> None: for the given :py:data:`page_id` (or all pages) under the given :py:data:`parameter`. + Delegates to :py:meth:`.process_workspace_submit_tasks` + and :py:meth:`.process_workspace_handle_tasks`. + (This will iterate over pages and files, calling :py:meth:`.process_page_file` and handling exceptions. It should be overridden by subclasses to handle cases @@ -484,24 +512,6 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - class DummyExecutor: - """ - Mimics some of ProcessPoolExecutor but runs everything - immediately in this process. 
- """ - class DummyFuture: - def __init__(self, fn, *args, **kwargs): - self.fn = fn - self.args = args - self.kwargs = kwargs - def result(self): - return self.fn(*self.args, **self.kwargs) - def __init__(self, initializer=None, initargs=(), **kwargs): - initializer(*initargs) - def shutdown(self, **kwargs): - pass - def submit(self, fn, *args, **kwargs): - return DummyExecutor.DummyFuture(fn, *args, **kwargs) if max_workers > 1: executor_cls = ProcessPoolExecutor else: @@ -516,7 +526,8 @@ def submit(self, fn, *args, **kwargs): ) try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - self._process_workspace_run(executor, max_seconds) + tasks = self.process_workspace_submit_tasks(executor, max_seconds) + stats = self.process_workspace_handle_tasks(tasks) finally: executor.shutdown(kill_workers=True) @@ -528,96 +539,186 @@ def submit(self, fn, *args, **kwargs): # suppress the NotImplementedError context raise err from None - def _process_workspace_run(self, executor, max_seconds): - # aggregate info for logging: - nr_succeeded = 0 - nr_failed = 0 - nr_errors = defaultdict(int) # count causes - + def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]: + """ + Look up all input files of the given ``workspace`` + from the given :py:data:`input_file_grp` + for the given :py:data:`page_id` (or all pages), + and schedules calling :py:meth:`.process_page_file` + on them for each page via `executor` (enforcing + a per-page time limit of `max_seconds`). + + When running with `OCRD_MAX_PARALLEL_PAGES>1` and + the workspace via METS Server, the executor will fork + this many worker parallel subprocesses each processing + one page at a time. (Interprocess communication is + done via task and result queues.) + + Otherwise, tasks are run sequentially in the + current process. + + Delegates to :py:meth:`.zip_input_files` to get + the input files for each page, and then calls + :py:meth:`.process_workspace_submit_page_task`. + + Returns a dict mapping the per-page tasks + (i.e. futures submitted to the executor) + to their corresponding pageId and input files. 
+ """ tasks = {} for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) - page_id = next(input_file.pageId - for input_file in input_file_tuple - if input_file) - self._base_logger.info(f"preparing page {page_id}") - for i, input_file in enumerate(input_file_tuple): - if input_file is None: - # file/page not found in this file grp - continue - input_files[i] = input_file - if not self.download: - continue - try: - input_files[i] = self.workspace.download_file(input_file) - except (ValueError, FileNotFoundError, HTTPError) as e: - self._base_logger.error(repr(e)) - self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") - # process page - #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - tasks[executor.submit(_page_worker, max_seconds, *input_files)] = (page_id, input_files) + task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple) + tasks[task] = (page_id, input_files) self._base_logger.debug("submitted %d processing tasks", len(tasks)) + return tasks + def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]: + """ + Ensure all input files for a single page are + downloaded to the workspace, then schedule + :py:meth:`.process_process_file` to be run on + them via `executor` (enforcing a per-page time + limit of `max_seconds`). + + Delegates to :py:meth:`.process_page_file` + (wrapped in :py:func:`_page_worker` to share + the processor instance across forked processes). + + \b + Returns a tuple of: + - the scheduled future object, + - the corresponding pageId, + - the corresponding input files. + """ + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + self._base_logger.info(f"preparing page {page_id}") + for i, input_file in enumerate(input_file_tuple): + if input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except (ValueError, FileNotFoundError, HTTPError) as e: + self._base_logger.error(repr(e)) + self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + #executor.submit(self.process_page_file, *input_files) + return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files + + def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]: + """ + Look up scheduled per-page futures one by one, + handle errors (exceptions) and gather results. + + \b + Enforces policies configured by the following + environment variables: + - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) + - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy) + - `OCRD_MAX_MISSING_OUTPUTS` (abort after all). + + \b + Returns a tuple of: + - the number of successfully processed pages + - the number of failed (i.e. skipped or copied) pages + - a dict of the type and corresponding number of exceptions seen + - the number of total requested pages (i.e. success+fail+existing). + + Delegates to :py:meth:`.process_workspace_handle_page_task` + for each page. 
+ """ + # aggregate info for logging: + nr_succeeded = 0 + nr_failed = 0 + nr_errors = defaultdict(int) # count causes + if config.OCRD_MISSING_OUTPUT == 'SKIP': + reason = "skipped" + elif config.OCRD_MISSING_OUTPUT == 'COPY': + reason = "fallback-copied" for task in tasks: # wait for results, handle errors page_id, input_files = tasks[task] - # FIXME: differentiate error cases in various ways: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise - try: - self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - # timeout kwarg on future is useless: it only raises TimeoutError here, - # but does not stop the running process/thread, and executor offers nothing - # to that effect: - # task.result(timeout=max_seconds or None) - # so we instead apply the timeout within the worker function - task.result() - nr_succeeded += 1 - except NotImplementedError: - # exclude NotImplementedError, so we can try process() below - raise - # handle input failures separately - except FileExistsError as err: - if config.OCRD_EXISTING_OUTPUT == 'ABORT': - raise err - if config.OCRD_EXISTING_OUTPUT == 'SKIP': - continue - if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': - # too late here, must not happen - raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") - # broad coverage of output failures (including TimeoutError) - except Exception as err: - # FIXME: add re-usable/actionable logging - nr_errors[err.__class__.__name__] += 1 + result = self.process_workspace_handle_page_task(page_id, input_files, task) + if isinstance(result, Exception): + nr_errors[result.__class__.__name__] += 1 nr_failed += 1 - if config.OCRD_MISSING_OUTPUT == 'ABORT': - self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - raise err - self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - if config.OCRD_MISSING_OUTPUT == 'SKIP': - if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: - # already irredeemably many failures, stop short - raise Exception(f"too many failures with skipped output ({nr_failed} of {nr_failed+nr_succeeded})") - continue - if config.OCRD_MISSING_OUTPUT == 'COPY': - if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: - # already irredeemably many failures, stop short - raise Exception(f"too many failures with fallback-copied output ({nr_failed} of {nr_failed+nr_succeeded})") - self._copy_page_file(input_files[0]) - else: - desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) - raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") - + # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded})") + elif result: + nr_succeeded += 1 + # else skipped - already exists + nr_errors = dict(nr_errors) if nr_failed > 0: nr_all = nr_succeeded + nr_failed - if config.OCRD_MISSING_OUTPUT == 'SKIP': - reason = "skipped" - if config.OCRD_MISSING_OUTPUT == 'COPY': - reason = "fallback-copied" if 
config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") - self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(dict(nr_errors))) + self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) + return nr_succeeded, nr_failed, nr_errors, len(tasks) + + def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]: + """ + \b + Await a single page result and handle errors (exceptions), + enforcing policies configured by the following + environment variables: + - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) + - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy) + - `OCRD_MAX_MISSING_OUTPUTS` (abort after all). + + \b + Returns + - true in case of success + - false in case the output already exists + - the exception in case of failure + """ + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id) + # timeout kwarg on future is useless: it only raises TimeoutError here, + # but does not stop the running process/thread, and executor itself + # offers nothing to that effect: + # task.result(timeout=max_seconds or None) + # so we instead applied the timeout within the worker function + task.result() + return True + except NotImplementedError: + # exclude NotImplementedError, so we can try process() below + raise + # handle input failures separately + except FileExistsError as err: + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + return False + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # broad coverage of output failures (including TimeoutError) + except Exception as err: + # FIXME: add re-usable/actionable logging + if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + raise err + self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + if config.OCRD_MISSING_OUTPUT == 'SKIP': + pass + elif config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + return err def _copy_page_file(self, input_file : OcrdFileType) -> None: """ From 71d6d496fdc42bdc9c7b338b1ce78d593b36555d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 20:31:18 +0000 Subject: [PATCH 228/249] Processor.process_workspace_handle_page_task: do not handle sigint --- src/ocrd/processor/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 85a0dea21..297b34647 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -704,6 +704,8 @@ def process_workspace_handle_page_task(self, page_id : str, input_files : List[O if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': # too late here, must not happen raise
Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + except KeyboardInterrupt: + raise # broad coverage of output failures (including TimeoutError) except Exception as err: # FIXME: add re-usable/actionable logging @@ -1113,6 +1115,7 @@ def _page_worker_set_ctxt(processor): """ global _page_worker_processor _page_worker_processor = processor + def _page_worker(timeout, *input_files): """ Wraps a `Processor.process_page_file` call as payload (call target) From d2d5290a0fb789979b1ce29690f9e93f64c61c1f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 20:32:22 +0000 Subject: [PATCH 229/249] Processor.process_workspace_handle_tasks: log nr of ignored exceptions in the end --- src/ocrd/processor/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 297b34647..87e6731df 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -529,7 +529,7 @@ def process_workspace(self, workspace: Workspace) -> None: tasks = self.process_workspace_submit_tasks(executor, max_seconds) stats = self.process_workspace_handle_tasks(tasks) finally: - executor.shutdown(kill_workers=True) + executor.shutdown(kill_workers=True, wait=False) except NotImplementedError: # fall back to deprecated method @@ -651,7 +651,8 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: # already irredeemably many failures, stop short - raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded})") + nr_errors = dict(nr_errors) + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})") elif result: nr_succeeded += 1 # else skipped - already exists @@ -659,8 +660,8 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O if nr_failed > 0: nr_all = nr_succeeded + nr_failed if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") - self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})") + self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) return nr_succeeded, nr_failed, nr_errors, len(tasks) def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]: From 7d1503ebc40d4bd03d6c6e6a9813e8d6279a70a0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 22:47:18 +0100 Subject: [PATCH 230/249] :package: v3.0.0b6 --- CHANGELOG.md | 23 +++++++++++++++++++++++ VERSION | 2 +- repo/spec | 2 +- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index abbfd5a4d..da422654b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [3.0.0b6] - 2024-10-30 + +Fixed: + - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` + +Changed: + - :fire: `ocrd_utils.initLogging`: also add handler to root logger (as in file config), + but disable message propagation to avoid duplication + - only import `ocrd_network` in `src/ocrd/decorators/__init__.py` once needed + - `Processor.process_page_file`: skip computing `process_page_pcgts` if output already exists, + but `OCRD_EXISTING_OUTPUT!=OVERWRITE` + - :fire: `OCRD_MAX_PARALLEL_PAGES>1`: switch from multithreading to multiprocessing, depend on + `loky` instead of stdlib `concurrent.futures` + - `OCRD_PROCESSING_PAGE_TIMEOUT>0`: actually enforce timeout within worker + - `OCRD_MAX_MISSING_OUTPUTS>0`: abort early if too many failures already, prospectively + - `Processor.process_workspace`: split up into overridable sub-methods: + - `process_workspace_submit_tasks` (iterate input file group and schedule page tasks) + - `process_workspace_submit_page_task` (download input files and submit single page task) + - `process_workspace_handle_tasks` (monitor page tasks and aggregate results) + - `process_workspace_handle_page_task` (await single page task and handle errors) + + ## [3.0.0b5] - 2024-09-16 Fixed: @@ -2287,6 +2309,7 @@ Fixed Initial Release +[3.0.0b6]: ../../compare/v3.0.0b6..v3.0.0b5 [3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4 [3.0.0b4]: ../../compare/v3.0.0b4..v3.0.0b3 [3.0.0b3]: ../../compare/v3.0.0b3..v3.0.0b2 diff --git a/VERSION b/VERSION index 09fb39d26..43662e8c2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b5 +3.0.0b6 diff --git a/repo/spec b/repo/spec index 506b33936..df2a07e3f 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 +Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 From 08a631ccc89401724caf32b3211529abc0a13382 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:27:25 +0000 Subject: [PATCH 231/249] tests: prevent side effects from ocrd_logging --- tests/base.py | 2 -- tests/cli/test_log.py | 11 +++++-- tests/processor/test_processor.py | 32 +++++++++++++------- tests/test_decorators.py | 17 +++++------ tests/test_logging.py | 6 ++++ tests/test_logging_conf.py | 49 +++++++++++++------------------ tests/test_mets_server.py | 28 ++++++++++++------ 7 files changed, 83 insertions(+), 62 deletions(-) diff --git a/tests/base.py b/tests/base.py index 53f393e08..9eb1f20db 100644 --- a/tests/base.py +++ b/tests/base.py @@ -26,8 +26,6 @@ class TestCase(VanillaTestCase): def setUp(self): chdir(dirname(realpath(__file__)) + '/..') - disableLogging() - initLogging(builtin_only=True) class CapturingTestCase(TestCase): """ diff --git a/tests/cli/test_log.py b/tests/cli/test_log.py index c63d78c31..3d81e8266 100644 --- a/tests/cli/test_log.py +++ b/tests/cli/test_log.py @@ -6,8 +6,8 @@ from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory from ocrd.decorators import ocrd_loglevel -from ocrd_utils import setOverrideLogLevel, logging, disableLogging -import logging as python_logging +from ocrd_utils import disableLogging, initLogging +import logging @click.group() @ocrd_loglevel @@ -18,14 +18,19 @@ def mock_ocrd_cli(log_level): class TestLogCli(TestCase): def _get_log_output(self, *args): - disableLogging() code, out, err = self.invoke_cli(mock_ocrd_cli, args) print({'code': code, 'out': out, 'err': err}) return err + def setUp(self): + super().setUp() + initLogging() + def tearDown(self): if 
'OCRD_TOOL_NAME' in ENV: del(ENV['OCRD_TOOL_NAME']) + super().tearDown() + disableLogging() def test_loglevel(self): assert 'DEBUG ocrd.log_cli - foo' not in self._get_log_output('log', 'debug', 'foo') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5844cb877..06c129c3c 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -27,21 +27,21 @@ class TestProcessor(TestCase): + def run(self, result=None): + with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir: + with pushd_popd(workdir): + self.resolver = Resolver() + self.workspace = self.resolver.workspace_from_url('mets.xml') + super().run(result=result) + def setUp(self): super().setUp() - # make sure we get an isolated temporary copy of the testdata each time - # as long as we are not using pytest but unittest, we need to manage contexts - # (enterContext is only supported starting with py311) - with ExitStack() as stack: - self.resolver = Resolver() - self.workdir = stack.enter_context(copy_of_directory(assets.path_to('SBB0000F29300010000/data'))) - stack.enter_context(pushd_popd(self.workdir)) - self.workspace = self.resolver.workspace_from_url('mets.xml') - self.addCleanup(stack.pop_all().close) + initLogging() def tearDown(self): super().tearDown() config.reset_defaults() + disableLogging() def test_incomplete_processor(self): proc = IncompleteProcessor(None) @@ -423,6 +423,7 @@ def ocrd_tool(self): def test_run_output_metsserver(start_mets_server): mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 run_processor(DummyProcessorWithOutputSleep, workspace=ws, @@ -446,22 +447,33 @@ def test_run_output_metsserver(start_mets_server): parameter={"sleep": 0}, mets_server_url=mets_server_url) assert "already exists" in str(exc.value) + config.reset_defaults() # 2s (+ 2s tolerance) instead of 3*3s (+ 2s tolerance) -@pytest.mark.timeout(4) +# fixme: pytest-timeout does not shut down / finalize the fixture properly +# (regardless of method or func_only), so the next test in the suite +# does not execute ("previous item was not torn down properly") +# so we must instead wait for completion and assert on the time spent... 
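(Editor's note: the workaround described in the fixme above, factored into a hypothetical helper for illustration; it is not part of the test suite.)

    from time import perf_counter

    def assert_runs_within(limit, fn, *args, **kwargs):
        # substitute for pytest-timeout: let the call complete, then bound its duration
        start = perf_counter()
        result = fn(*args, **kwargs)
        elapsed = perf_counter() - start
        assert elapsed < limit, f"took {elapsed:.1f}s (limit {limit}s)"
        return result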
+#@pytest.mark.timeout(timeout=4, func_only=True, method="signal") def test_run_output_parallel(start_mets_server): + import time mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 # do not raise for single-page timeout config.OCRD_PROCESSING_PAGE_TIMEOUT = -1 # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MAX_PARALLEL_PAGES = 3 + start_time = time.time() run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 2}, mets_server_url=mets_server_url) + run_time = time.time() - start_time + assert run_time < 3, f"run_processor took {run_time}s" assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.reset_defaults() if __name__ == "__main__": main(__file__) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index c36577020..561fdc762 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -41,22 +41,20 @@ def cli_dummy_processor(*args, **kwargs): class TestDecorators(TestCase): - def setUp(self): - super().setUp() - disableLogging() - def tearDown(self): super().tearDown() config.reset_defaults() + disableLogging() def test_minimal(self): - exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) - print(out, err) - assert not exit_code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) + assert not code, (out, err) def test_loglevel_invalid(self): - code, _, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) - assert code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) + assert code, (out, err) import click if int(click.__version__[0]) < 8: assert 'invalid choice: foo' in err @@ -67,7 +65,6 @@ def test_loglevel_override(self): if get_logging_config_files(): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging - disableLogging() assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() diff --git a/tests/test_logging.py b/tests/test_logging.py index c2b6913b1..091fc25be 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -26,16 +26,22 @@ class TestLogging(TestCase): def setUp(self): pass # do not chdir + def tearDown(self): + super().tearDown() + disableLogging() + def test_loglevel_inheritance(self): initLogging(builtin_only=True) ocrd_logger = logging.getLogger('ocrd') assert ocrd_logger.getEffectiveLevel() == logging.INFO some_logger = getLogger('ocrd.foo') + assert some_logger.level == logging.NOTSET assert some_logger.getEffectiveLevel() == logging.INFO setOverrideLogLevel('ERROR') assert ocrd_logger.getEffectiveLevel() == logging.ERROR assert some_logger.getEffectiveLevel() == logging.ERROR another_logger = getLogger('ocrd.bar') + assert another_logger.level == logging.NOTSET assert another_logger.getEffectiveLevel() == logging.ERROR def test_getLevelName(self): diff --git a/tests/test_logging_conf.py b/tests/test_logging_conf.py index f8e0e9e89..071767410 100644 --- a/tests/test_logging_conf.py +++ b/tests/test_logging_conf.py @@ -21,74 +21,67 @@ # sys.path.append(os.path.dirname(os.path.realpath(__file__)) + '/../ocrd') TEST_ROOT = pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent -def resetLogging(): - 
disableLogging() - initLogging() - - @pytest.fixture(name="logging_conf") -def _fixture_logging_conf(tmpdir): +def _fixture_logging_conf(tmpdir, capfd): path_logging_conf_orig = os.path.join( str(TEST_ROOT), 'src', 'ocrd_utils', 'ocrd_logging.conf') path_logging_conf_dest = os.path.join(str(tmpdir), 'ocrd_logging.conf') shutil.copy(path_logging_conf_orig, path_logging_conf_dest) - return str(tmpdir) + with pushd_popd(tmpdir): + with capfd.disabled(): + initLogging() + yield str(tmpdir) + disableLogging() -def test_configured_dateformat(logging_conf, capsys): +def test_configured_dateformat(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and produces desired record format""" # arrange - with pushd_popd(logging_conf): - resetLogging() - test_logger = getLogger('') + test_logger = getLogger('ocrd') - # act - test_logger.info("test logger initialized") + # act + test_logger.info("test logger initialized") - log_info_output = capsys.readouterr().err - must_not_match = r"^\d{4}-\d{2}-\d{2}.*" - assert not re.match(must_not_match, log_info_output) - match_pattern = r"^\d{2}:\d{2}:\d{2}.*" - assert re.match(match_pattern, log_info_output) + log_info_output = capfd.readouterr().err + must_not_match = r"^\d{4}-\d{2}-\d{2}.*" + assert not re.match(must_not_match, log_info_output) + match_pattern = r"^\d{2}:\d{2}:\d{2}.*" + assert re.match(match_pattern, log_info_output), log_info_output -def test_configured_tensorflow_logger_present(logging_conf, capsys): +def test_configured_tensorflow_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger tensorflow""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('tensorflow') # act info logger_under_test.info("tensorflow logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("tensorflow has error") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output -def test_configured_shapely_logger_present(logging_conf, capsys): +def test_configured_shapely_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger shapely.geos""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('shapely.geos') # act info logger_under_test.info("shapely.geos logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("shapely alert") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output if __name__ == '__main__': diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index dc94d6c56..3bb96535c 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,20 +22,17 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel, disableLogging, getLogger TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] -initLogging() -setOverrideLogLevel(10) - @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: - tmpdir = str(tmpdir) - def 
_start_mets_server(*args, **kwargs): - mets_server = OcrdMetsServer(*args, **kwargs) - mets_server.startup() + initLogging() + #setOverrideLogLevel(10) + logger = getLogger('ocrd') + tmpdir = str(tmpdir) mets_server_url = request.param if mets_server_url == TRANSPORTS[0]: @@ -47,13 +44,26 @@ def _start_mets_server(*args, **kwargs): copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) workspace = Workspace(Resolver(), tmpdir) - p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) + class MetsServerProcess(Process): + def __init__(self, *args, **kwargs): + self.server = OcrdMetsServer(*args, **kwargs) + super().__init__() + def run(self): + self.server.startup() + def terminate(self): + self.server.workspace.save_mets() + super().terminate() + p = MetsServerProcess(workspace=workspace, url=request.param) p.start() + logger.info("started METS Server") sleep(1) # sleep to start up server workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) yield mets_server_url, workspace_server p.terminate() + p.join() + logger.info("terminated METS Server") rmtree(tmpdir, ignore_errors=True) + disableLogging() def add_file_server(x, force=False): mets_server_url, directory, i = x From f3e423ac52f5293596cf88ac2031384857be4145 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:36:17 +0000 Subject: [PATCH 232/249] initLogging: do not remove any previous handlers/levels --- src/ocrd_utils/logging.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index dfac74988..404ac7ddb 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -161,18 +161,6 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L global _initialized_flag if _initialized_flag and not force_reinit: return - # disableLogging() - - # https://docs.python.org/3/library/logging.html#logging.disable - # If logging.disable(logging.NOTSET) is called, it effectively removes this - # overriding level, so that logging output again depends on the effective - # levels of individual loggers. 
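(Editor's note: a simplified, hypothetical model of the init/disable discipline these logging patches converge on. A module flag guards repeated initialization, a single handler on the root logger avoids duplicate records, and teardown removes exactly what was added; names are made up for the sketch.)

    import logging

    _initialized = False

    def init_logging():
        global _initialized
        if _initialized:
            return
        handler = logging.StreamHandler()
        logging.getLogger().addHandler(handler)  # one root handler, propagation does the rest
        logging.getLogger('ocrd').setLevel(logging.INFO)
        _initialized = True

    def disable_logging():
        global _initialized
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
        logging.getLogger('ocrd').setLevel(logging.NOTSET)
        _initialized = False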
- logging.disable(logging.NOTSET) - - # remove all handlers for the ocrd root loggers - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) config_file = None if not builtin_only: From 31435187dffb43c692f24f3108f24d0ed1093cfd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:38:44 +0000 Subject: [PATCH 233/249] initLogging: only add root handler instead of multiple redundant handlers with propagate=false --- src/ocrd_utils/logging.py | 7 ++----- src/ocrd_utils/ocrd_logging.conf | 28 +++++++++++++--------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 404ac7ddb..7f59221c8 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -179,11 +179,8 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler = logging.StreamHandler(stream=sys.stderr) ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) - for logger_name in ROOT_OCRD_LOGGERS: - logger = logging.getLogger(logger_name) - logger.addHandler(ocrd_handler) - if logger_name: - logger.propagate = False # avoid duplication (from root handler) + root_logger = logging.getLogger('') + root_logger.addHandler(ocrd_handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 5cf161398..0af039b2a 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -56,22 +56,22 @@ handlers=consoleHandler,fileHandler # ocrd loggers [logger_ocrd] level=INFO -handlers=consoleHandler,fileHandler +handlers= qualname=ocrd -propagate=0 [logger_ocrd_network] level=INFO -handlers=consoleHandler,processingServerHandler +#handlers=consoleHandler,processingServerHandler +handlers=processingServerHandler qualname=ocrd_network -propagate=0 +#propagate=0 # # logger tensorflow # [logger_ocrd_tensorflow] level=ERROR -handlers=consoleHandler +handlers= qualname=tensorflow # @@ -79,7 +79,7 @@ qualname=tensorflow # [logger_ocrd_shapely_geos] level=ERROR -handlers=consoleHandler +handlers= qualname=shapely.geos @@ -88,7 +88,7 @@ qualname=shapely.geos # [logger_ocrd_PIL] level=INFO -handlers=consoleHandler +handlers= qualname=PIL # @@ -96,34 +96,32 @@ qualname=PIL # [logger_paramiko] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko -propagate=0 [logger_paramiko_transport] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko.transport -propagate=0 # # uvicorn loggers # [logger_uvicorn] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn [logger_uvicorn_access] level=WARN -handlers=consoleHandler +handlers= qualname=uvicorn.access [logger_uvicorn_error] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn.error [logger_multipart] level=INFO -handlers=consoleHandler +handlers= qualname=multipart From 27323c665edc608958a484ce7ae4aebaa65f45f6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:41:20 +0000 Subject: [PATCH 234/249] disableLogging: remove all handlers, reset all levels --- src/ocrd_utils/logging.py | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 
7f59221c8..db7921c84 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -46,13 +46,6 @@ 'setOverrideLogLevel', ] -# These are the loggers we add handlers to -ROOT_OCRD_LOGGERS = [ - '', - 'ocrd', - 'ocrd_network' -] - LOGGING_DEFAULTS = { 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, @@ -196,24 +189,16 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): if _initialized_flag and not silent: print("[LOGGING] Disabling logging", file=sys.stderr) _initialized_flag = False - # logging.basicConfig(level=logging.CRITICAL) - # logging.disable(logging.ERROR) - # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) - for logger_name in LOGGING_DEFAULTS: - logging.getLogger(logger_name).setLevel(logging.NOTSET) + # remove all handlers we might have added (via initLogging on builtin or file config) + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Resetting {logger_name} log level and handlers') + logger = logging.getLogger(logger_name) + logger.setLevel(logging.NOTSET) + for handler in logger.handlers[:]: + logger.removeHandler(handler) + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) # Python default log level is WARNING logging.root.setLevel(logging.WARNING) -# Initializing stream handlers at module level -# would cause message output in all runtime contexts, -# including those which are already run for std output -# (--dump-json, --version, ocrd-tool, bashlib etc). -# So this needs to be an opt-in from the CLIs/decorators: -#initLogging() -# Also, we even have to block log output for libraries -# (like matplotlib/tensorflow) which set up logging -# themselves already: -disableLogging() From eb3120d77fab33ce2da91515dc452ffe438833e9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:42:52 +0000 Subject: [PATCH 235/249] setOverrideLogLevel: override all currently active loggers' level --- src/ocrd_utils/logging.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index db7921c84..98c2f58b2 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -107,18 +107,15 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): lvl (string): Log level name. 
silent (boolean): Whether to log the override call """ - if not _initialized_flag: - initLogging(silent=silent) - ocrd_logger = logging.getLogger('ocrd') - - if lvl is None: - if not silent: - print('[LOGGING] Reset log level override', file=sys.stderr) - ocrd_logger.setLevel(logging.NOTSET) - else: - if not silent: - print(f'[LOGGING] Overriding ocrd log level to {lvl}', file=sys.stderr) - ocrd_logger.setLevel(lvl) + if lvl is not None: + lvl = getLevelName(lvl) + if not _initialized_flag: + initLogging(silent=silent) + # affect all configured loggers + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Overriding {logger_name} log level to {lvl}', file=sys.stderr) + logging.getLogger(logger_name).setLevel(lvl) def get_logging_config_files(): """ From 0186c53795c0f32167a148172ea123906db79c41 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:43:40 +0000 Subject: [PATCH 236/249] logging: increase default root (not ocrd) level from INFO to WARNING --- src/ocrd_utils/logging.py | 1 + src/ocrd_utils/ocrd_logging.conf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 98c2f58b2..ddb8b88b2 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -47,6 +47,7 @@ ] LOGGING_DEFAULTS = { + '': logging.WARNING, 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, # 'ocrd.resolver': logging.INFO, diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 0af039b2a..41e6d5af7 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -34,7 +34,7 @@ keys=defaultFormatter,detailedFormatter # default logger "root" using consoleHandler # [logger_root] -level=INFO +level=WARNING handlers=consoleHandler,fileHandler From 5ba27209d396c44eb4d5e53f784a9fd42167a9ee Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:44:06 +0000 Subject: [PATCH 237/249] Processor: update max_workers docstring --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 87e6731df..f0d453f4a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -158,12 +158,12 @@ class Processor(): max_workers : int = -1 """ - maximum number of processor threads for page-parallel processing (ignored if negative), + maximum number of processor forks for page-parallel processing (ignored if negative), to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e. whatever is smaller). (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores - - at once, or if your class is not thread-safe.) + - at once, or if your class already creates threads prior to forking, e.g. during ``setup``.) 
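For illustration only, a minimal sketch of how a concrete processor might override these attributes (the subclass name, the values, and the GPU rationale are assumptions, not part of this patch):

    from ocrd import Processor

    class MyGpuProcessor(Processor):
        # Assumption: the model fills most of one GPU, so only two pages
        # fit into memory at once; the effective fork count then becomes
        # min(2, OCRD_MAX_PARALLEL_PAGES).
        max_workers = 2
        # Analogous per-page time budget (see the attribute documented next).
        max_page_seconds = 30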
""" max_page_seconds : int = -1 From f8f71d809207f3bf1fc94dbdb9525272c13cd286 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 13:34:10 +0000 Subject: [PATCH 238/249] initLogging: call disableLogging if already initialized and force_reinit --- src/ocrd_utils/logging.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index ddb8b88b2..52b01883f 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -150,8 +150,11 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L - silent (bool): Whether to log logging behavior by printing to stderr """ global _initialized_flag - if _initialized_flag and not force_reinit: - return + if _initialized_flag: + if force_reinit: + disableLogging(silent=silent) + else: + return config_file = None if not builtin_only: From 5f2f602f5917d2f0970ff0fc15d64b148083b98b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 16:02:44 +0000 Subject: [PATCH 239/249] Processor: replace weakref with __del__ to trigger shutdown --- src/ocrd/processor/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index f0d453f4a..7ec77162e 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -21,7 +21,6 @@ import inspect import tarfile import io -import weakref from collections import defaultdict from frozendict import frozendict # concurrent.futures is buggy in py38, @@ -366,12 +365,14 @@ def __init__( self._base_logger = getLogger('ocrd.processor.base') if parameter is not None: self.parameter = parameter - # ensure that shutdown gets called at destruction - self._finalizer = weakref.finalize(self, self.shutdown) # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process'))) + def __del__(self): + self._base_logger.debug("shutting down") + self.shutdown() + def show_help(self, subcommand=None): """ Print a usage description including the standard CLI and all of this processor's ocrd-tool From 0446b82be55093536c5c0818de3b49d0aecc727a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 23:23:15 +0000 Subject: [PATCH 240/249] Processor parallel pages: log via QueueHandler in subprocess, QueueListener in main --- repo/spec | 2 +- src/ocrd/processor/base.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/repo/spec b/repo/spec index df2a07e3f..506b33936 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 +Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 7ec77162e..d6348b40e 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -18,6 +18,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, get_args import sys +import logging +import logging.handlers import inspect import tarfile import io @@ -515,22 +517,31 @@ def process_workspace(self, workspace: Workspace) -> None: if max_workers > 1: executor_cls = ProcessPoolExecutor + log_queue = mp.Queue() + # forward messages from log queue (in subprocesses) to all root handlers + log_listener = 
logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True) else: executor_cls = DummyExecutor + log_queue = None + log_listener = None executor = executor_cls( max_workers=max_workers or 1, # only forking method avoids pickling context=mp.get_context('fork'), # share processor instance as global to avoid pickling initializer=_page_worker_set_ctxt, - initargs=(self,), + initargs=(self, log_queue), ) + if max_workers > 1: + log_listener.start() try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) tasks = self.process_workspace_submit_tasks(executor, max_seconds) stats = self.process_workspace_handle_tasks(tasks) finally: executor.shutdown(kill_workers=True, wait=False) + if max_workers > 1: + log_listener.stop() except NotImplementedError: # fall back to deprecated method @@ -1110,13 +1121,16 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): objects, and with the METS Server we do not mutate the local processor instance anyway. """ -def _page_worker_set_ctxt(processor): +def _page_worker_set_ctxt(processor, log_queue): """ Overwrites `ocrd.processor.base._page_worker_processor` instance for sharing with subprocesses in ProcessPoolExecutor initializer. """ global _page_worker_processor _page_worker_processor = processor + if log_queue: + # replace all log handlers with just one queue handler + logging.root.handlers = [logging.handlers.QueueHandler(log_queue)] def _page_worker(timeout, *input_files): """ From 53c4c18240684936d2cd4e87051b5bbcc57f9cb2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 Nov 2024 00:46:38 +0000 Subject: [PATCH 241/249] :package: v3.0.0b7 --- CHANGELOG.md | 13 +++++++++++++ VERSION | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da422654b..04ea2d42a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,19 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
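The queue-based logging introduced in the previous patch is plain standard library machinery; a self-contained sketch of the same pattern outside OCR-D (all names here are illustrative, not taken from the codebase):

    import logging
    import logging.handlers
    import multiprocessing as mp

    def worker(log_queue):
        # in the child process: funnel every record into the shared queue
        logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]
        logging.getLogger("demo").warning("hello from %s", mp.current_process().name)

    if __name__ == "__main__":
        logging.basicConfig(level=logging.INFO)
        log_queue = mp.Queue()
        # in the parent: replay queued records through the real handlers
        listener = logging.handlers.QueueListener(
            log_queue, *logging.root.handlers, respect_handler_level=True)
        listener.start()
        procs = [mp.Process(target=worker, args=(log_queue,)) for _ in range(2)]
        for proc in procs:
            proc.start()
        for proc in procs:
            proc.join()
        listener.stop()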
## Unreleased +## [3.0.0b7] - 2024-11-12 + +Fixed: + - `initLogging`: only add root handler instead of multiple redundant handlers with `propagate=false` + - `setOverrideLogLevel`: override all currently active loggers' level + +Changed: + - :fire: logging: increase default root (not `ocrd`) level from `INFO` to `WARNING` + - :fire: `initLogging`: do not remove any previous handlers/levels, unless `force_reinit` + - :fire: `disableLogging`: remove all handlers, reset all levels - instead of being selective + - :fire: Processor: replace `weakref` with `__del__` to trigger `shutdown` + - :fire: `OCRD_MAX_PARALLEL_PAGES>1`: log via `QueueHandler` in subprocess, `QueueListener` in main + ## [3.0.0b6] - 2024-10-30 Fixed: diff --git a/VERSION b/VERSION index 43662e8c2..1129dfd44 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b6 +3.0.0b7 From db21d754e2561664deeb68da26f98307b8e67382 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 6 Jan 2025 16:12:59 +0100 Subject: [PATCH 242/249] ocrd_cli_wrap_processor: always do initLogging --- src/ocrd/decorators/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index f659bf58a..6e0ceb1f1 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -48,6 +48,9 @@ def ocrd_cli_wrap_processor( # ocrd_network params end # **kwargs ): + # init logging handlers so no imported libs can preempt ours + initLogging() + # FIXME: remove workspace arg entirely processor = processorClass(None) if not sys.argv[1:]: @@ -89,8 +92,6 @@ def ocrd_cli_wrap_processor( # Used for checking/starting network agents for the WebAPI architecture check_and_run_network_agent(processorClass, subcommand, address, database, queue) - # from here: single-run processing context - initLogging() if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file def resolve(name): From 6e048e113aeffd6e5ef990333d8a373b10a153a9 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 8 Jan 2025 14:52:28 +0100 Subject: [PATCH 243/249] fix help output for multi-line config option descriptions --- src/ocrd/cli/__init__.py | 4 ++-- src/ocrd_utils/config.py | 38 +++++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 9e8a37b8b..2af14ce63 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -12,11 +12,11 @@ # pylint: disable=wrong-import-position -def command_with_replaced_help(*replacements): +def command_with_replaced_help(*replacements: tuple[str, str]): class CommandWithReplacedHelp(click.Command): def get_help(self, ctx): - newhelp = super().get_help(ctx) + newhelp : str = super().get_help(ctx) for replacement in replacements: newhelp = re.sub(*replacement, newhelp) # print(newhelp) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 36399870e..16c9eb02e 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -21,7 +21,7 @@ def _parser_boolean(val): class OcrdEnvVariable(): - def __init__(self, name, description, parser=str, validator=lambda val: True, default=[False, None]): + def __init__(self, name, description, parser=str, validator=lambda _: True, default=[False, None]): """ An environment variable for use in OCR-D. 
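The `\b` markers introduced in the following hunks rely on click's escape convention: a help-text paragraph preceded by a line containing only `\b` is printed verbatim instead of being re-wrapped. A stand-alone sketch (the command and its bullet list are made up for illustration):

    import click

    @click.command()
    def demo():
        """Show a bullet list that click must not re-wrap.

        \b
        - SKIP: ignore and proceed with the next page
        - ABORT: raise an error
        """
        click.echo("ran demo")

    if __name__ == "__main__":
        demo()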
@@ -47,10 +47,19 @@ def __str__(self): return f'{self.name}: {self.description}' def describe(self, wrap_text=True, indent_text=True): + """ + Output help information on a config option. + + If ``option.description`` is a multiline string with complex formatting + (e.g. markdown lists), replace empty lines with ``\b`` and set + ``wrap_text`` to ``False``. + """ desc = self.description if self.has_default: default = self.default() if callable(self.default) else self.default - desc += f' (Default: "{default}")' + if not desc.endswith('\n'): + desc += ' ' + desc += f'(Default: "{default}")' ret = '' ret = f'{self.name}\n' if wrap_text: @@ -146,11 +155,11 @@ def raw_value(self, name): description="""\ Whether to enable gathering runtime statistics on the `ocrd.profile` logger (comma-separated): - +\b - `CPU`: yields CPU and wall-time, - `RSS`: also yields peak memory (resident set size) - `PSS`: also yields peak memory (proportional set size) - +\b """, validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')), default=(True, '')) @@ -183,11 +192,12 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_MISSING_INPUT", description="""\ -How to deal with missing input files (for some fileGrp/pageId) during processing: - +How to deal with missing input files +(for some fileGrp/pageId) during processing: +\b - `SKIP`: ignore and proceed with next page's input - `ABORT`: throw :py:class:`.MissingInputFile` - +\b """, default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'ABORT'], @@ -195,12 +205,13 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_MISSING_OUTPUT", description="""\ -How to deal with missing output files (for some fileGrp/pageId) during processing: - +How to deal with missing output files +(for some fileGrp/pageId) during processing: +\b - `SKIP`: ignore and proceed processing next page - `COPY`: fall back to copying input PAGE to output fileGrp for page - `ABORT`: re-throw whatever caused processing to fail - +\b """, default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], @@ -213,12 +224,13 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_EXISTING_OUTPUT", description="""\ -How to deal with already existing output files (for some fileGrp/pageId) during processing: - +How to deal with already existing output files +(for some fileGrp/pageId) during processing: +\b - `SKIP`: ignore and proceed processing next page - `OVERWRITE`: force writing result to output fileGrp for page - `ABORT`: re-throw :py:class:`FileExistsError` - +\b """, default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], From 3eea1739964b30ae2a6624372e79ef99be13f8a2 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 8 Jan 2025 19:07:14 +0100 Subject: [PATCH 244/249] merge master --- .github/workflows/publish-pypi.yml | 31 ++++++ CHANGELOG.md | 73 ++++++++++++ Dockerfile.cuda-torch | 2 - Makefile | 8 +- .../ocrd_network.client_utils.rst | 7 ++ docs/api/ocrd_network/ocrd_network.rst | 1 + docs/conf.py | 3 +- repo/assets | 2 +- src/ocrd/cli/__init__.py | 2 + src/ocrd/mets_server.py | 104 +++++++++++------- src/ocrd/resource_manager.py | 2 + src/ocrd_models/ocrd_exif.py | 4 +- src/ocrd_network/cli/client.py | 35 ++++-- src/ocrd_network/client.py | 15 ++- src/ocrd_network/client_utils.py | 39 ++++--- src/ocrd_network/processing_server.py | 42 ++++--- src/ocrd_network/processing_worker.py | 11 +- src/ocrd_network/processor_server.py | 3 +- src/ocrd_network/rabbitmq_utils/connector.py | 4 +- 
 src/ocrd_network/runtime_data/deployer.py    |  46 +++++---
 src/ocrd_network/server_cache.py             |  49 +++++----
 src/ocrd_network/server_utils.py             |  44 +++++++-
 src/ocrd_network/tcp_to_uds_mets_proxy.py    |  13 ++-
 src/ocrd_network/utils.py                    |  34 +++---
 src/ocrd_utils/config.py                     |  16 ++-
 tests/model/test_exif.py                     |   8 +-
 tests/network/config.py                      |  12 +-
 .../network/test_modules_mets_server_proxy.py |   2 +-
 tests/test_resolver.py                       |   2 +-
 tests/test_resource_manager.py               |   2 +-
 30 files changed, 440 insertions(+), 176 deletions(-)
 create mode 100644 .github/workflows/publish-pypi.yml
 create mode 100644 docs/api/ocrd_network/ocrd_network.client_utils.rst

diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml
new file mode 100644
index 000000000..e811c958a
--- /dev/null
+++ b/.github/workflows/publish-pypi.yml
@@ -0,0 +1,31 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel build twine
+        pip install -r requirements.txt
+    - name: Build and publish
+      env:
+        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+      run: make pypi
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 04ea2d42a..7f7a0eb2e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -163,6 +163,73 @@ Added:
  - `Processor.verify`: handle fileGrp cardinality verification, with default implementation
  - `Processor.setup`: to set up processor before processing, optional
 
+## [2.71.0] - 2024-11-20
+
+Changed:
+
+ * Rewrite `ocrd_utils.logging`, #1288
+   * Handle only `''` as the root logger
+   * `disableLogging`: Remove handlers from root and all configured loggers
+   * Do not do any module-level modification of the log config
+
+Fixed:
+
+ * Typo in processing_worker log message, #1293
+ * Call `initLogging` at the right time in `ocrd_network`, #1292
+ * `make docs` fixed with absolute path to location, #1273
+
+## [2.70.0] - 2024-10-10
+
+Added:
+
+ - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277
+ - Processing Server: `DELETE /mets_server_zombies` to kill any renegade METS servers, #1277
+ - No more zombie METS Server by properly shutting them down, #1284
+ - `OCRD_NETWORK_RABBITMQ_HEARTBEAT` to allow overriding the [heartbeat](https://pika.readthedocs.io/en/stable/examples/heartbeat_and_blocked_timeouts.html) behavior of RabbitMQ, #1285
+
+Changed:
+
+ - significantly more detailed logging for the METS Server and Processing Server, #1284
+ - Only import `ocrd_network` in src/ocrd/decorators/__init__.py once needed, #1289
+ - Automate release via GitHub Actions, #1290
+
+Fixed:
+
+ - `ocrd/core-cuda-torch`: Install torchvision as well, #1286
+ - Processing Server: remove shut down METS servers from deployer's cache, #1287
+ - typos, #1274
+
+## [2.69.0] - 2024-09-30
+
+Fixed:
+ - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally
+ - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup`
+ - `ocrd.cli.workspace`: make `list-page` work
w/ METS Server + - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` + - `lib.bash`: fix `errexit` handling + - actually apply CLI `--log-filename`, and show in `--help` + - adapt to Pillow changes + - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `Workspace.reload_mets`: handle ClientSideOcrdMets as well + - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` + - `disableLogging`: also re-instate root logger to Python defaults + - `OcrdExif`: handle multi-frame TIFFs gracefully in `identify` callout, #1276 + +Changed: + - `run_processor`: be robust if `ocrd_tool` is missing `steps` + - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + - `ClientSideOcrdMets`: use same logger name prefix as METS Server + - `Processor.zip_input_files`: when `--page-id` yields empty list, just log instead of raise + +Added: + - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict + - METS Server: export and delegate `physical_pages` + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` + - processor CLI: delegate `--resolve-resource`, too + - `OcrdConfig.reset_defaults` to reset config variables to their defaults + - `ocrd_utils.scale_coordinates` for resizing images + ## [2.68.0] - 2024-08-23 Changed: @@ -2322,6 +2389,7 @@ Fixed Initial Release +<<<<<<< HEAD [3.0.0b6]: ../../compare/v3.0.0b6..v3.0.0b5 [3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4 [3.0.0b4]: ../../compare/v3.0.0b4..v3.0.0b3 @@ -2330,6 +2398,11 @@ Initial Release [3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 [3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 [3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2 +======= +[2.71.0]: ../../compare/v2.71.0..v2.70.0 +[2.70.0]: ../../compare/v2.70.0..v2.69.0 +[2.69.0]: ../../compare/v2.69.0..v2.68.0 +>>>>>>> master [2.68.0]: ../../compare/v2.68.0..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 diff --git a/Dockerfile.cuda-torch b/Dockerfile.cuda-torch index 8d6c3aa62..59ce1144b 100644 --- a/Dockerfile.cuda-torch +++ b/Dockerfile.cuda-torch @@ -9,7 +9,5 @@ RUN make deps-torch WORKDIR /data -RUN rm -fr /build - CMD ["/usr/local/bin/ocrd", "--help"] diff --git a/Makefile b/Makefile index 1a4a6bbdb..bb5126955 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ deps-cuda: CONDA_EXE ?= /usr/local/bin/conda deps-cuda: export CONDA_PREFIX ?= /conda deps-cuda: PYTHON_PREFIX != $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])' deps-cuda: - curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba + curl --retry 6 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba mv bin/micromamba $(CONDA_EXE) # Install Conda system-wide (for interactive / login shells) echo 'export MAMBA_EXE=$(CONDA_EXE) MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) CONDA_PREFIX=$(CONDA_PREFIX) PATH=$(CONDA_PREFIX)/bin:$$PATH' >> /etc/profile.d/98-conda.sh @@ -97,7 +97,7 @@ deps-cuda: # works, too: shopt -s nullglob; \ $(PIP) install nvidia-pyindex \ - && $(PIP) install nvidia-cudnn-cu11==8.7.* \ + && $(PIP) install nvidia-cudnn-cu11~=8.7 \ nvidia-cublas-cu11~=11.11 \ nvidia-cusparse-cu11~=11.7 \ nvidia-cusolver-cu11~=11.4 \ @@ -158,7 +158,7 @@ deps-tf2: fi deps-torch: - $(PIP) install -i https://download.pytorch.org/whl/cu118 torch + $(PIP) install -i https://download.pytorch.org/whl/cu118 torchvision==0.16.2+cu118 torch==2.1.2+cu118 # Dependencies for 
deployment in an ubuntu/debian linux deps-ubuntu: @@ -178,7 +178,7 @@ build: # (Re)install the tool install: #build - # not stricttly necessary but a precaution against outdated python build tools, https://github.com/OCR-D/core/pull/1166 + # not strictly necessary but a precaution against outdated python build tools, https://github.com/OCR-D/core/pull/1166 $(PIP) install -U pip wheel $(PIP_INSTALL) . $(PIP_INSTALL_CONFIG_OPTION) @# workaround for shapely#1598 diff --git a/docs/api/ocrd_network/ocrd_network.client_utils.rst b/docs/api/ocrd_network/ocrd_network.client_utils.rst new file mode 100644 index 000000000..973e27cdb --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.client_utils.rst @@ -0,0 +1,7 @@ +ocrd\_network.client\_utils module +================================== + +.. automodule:: ocrd_network.client_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.rst b/docs/api/ocrd_network/ocrd_network.rst index 449770275..d61da3931 100644 --- a/docs/api/ocrd_network/ocrd_network.rst +++ b/docs/api/ocrd_network/ocrd_network.rst @@ -24,6 +24,7 @@ Submodules :maxdepth: 4 ocrd_network.client + ocrd_network.client_utils ocrd_network.constants ocrd_network.database ocrd_network.logging_utils diff --git a/docs/conf.py b/docs/conf.py index 917c5c62c..939277ad5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,8 @@ # import os # import sys # # sys.path.insert(0, os.path.abspath('..')) -with open('../VERSION', encoding='utf-8') as f: +from pathlib import Path +with open(Path(__file__).parent.parent / 'VERSION', encoding='utf-8') as f: VERSION = f.read() diff --git a/repo/assets b/repo/assets index 05568aaa2..ca108faf0 160000 --- a/repo/assets +++ b/repo/assets @@ -1 +1 @@ -Subproject commit 05568aaa2dc20678bf87ffec77f3baf2924d7c24 +Subproject commit ca108faf0e95cc823a9e84cd0a1602282ae006b1 diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 9e8a37b8b..667bddc7c 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -83,6 +83,8 @@ def get_help(self, ctx): \b {config.describe('OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS')} \b +{config.describe('OCRD_NETWORK_RABBITMQ_HEARTBEAT')} +\b {config.describe('OCRD_PROFILE_FILE')} \b {config.describe('OCRD_PROFILE', wrap_text=False)} diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 101727e06..e0f002957 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -1,8 +1,10 @@ """ # METS server functionality """ +import os import re from os import _exit, chmod +import signal from typing import Dict, Optional, Union, List, Tuple from time import sleep from pathlib import Path @@ -155,13 +157,13 @@ def save(self): Request writing the changes to the file system """ if not self.multiplexing_mode: - self.session.request("PUT", url=self.url) + return self.session.request("PUT", url=self.url).text else: - self.session.request( + return self.session.request( "POST", self.url, json=MpxReq.save(self.ws_dir_path) - ) + ).json()["text"] def stop(self): """ @@ -169,14 +171,13 @@ def stop(self): """ try: if not self.multiplexing_mode: - self.session.request("DELETE", self.url) - return + return self.session.request("DELETE", self.url).text else: - self.session.request( + return self.session.request( "POST", self.url, json=MpxReq.stop(self.ws_dir_path) - ) + ).json()["text"] except ConnectionError: # Expected because we exit the process without returning pass @@ -323,7 +324,7 @@ def add_file( class MpxReq: - """This class wrapps the request 
bodies needed for the tcp forwarding + """This class wraps the request bodies needed for the tcp forwarding For every mets-server-call like find_files or workspace_path a special request_body is needed to call `MetsServerProxy.forward_tcp_request`. These are created by this functions. @@ -346,12 +347,12 @@ def __args_wrapper( @staticmethod def save(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( - ws_dir_path, method_type="PUT", response_type="empty", request_url="", request_data={}) + ws_dir_path, method_type="PUT", response_type="text", request_url="", request_data={}) @staticmethod def stop(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( - ws_dir_path, method_type="DELETE", response_type="empty", request_url="", request_data={}) + ws_dir_path, method_type="DELETE", response_type="text", request_url="", request_data={}) @staticmethod def reload(ws_dir_path: str) -> Dict: @@ -428,18 +429,24 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): - subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True) + os.kill(mets_server_pid, signal.SIGINT) + sleep(3) + try: + os.kill(mets_server_pid, signal.SIGKILL) + except ProcessLookupError as e: + pass def shutdown(self): + pid = os.getpid() + self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") + os.kill(pid, signal.SIGTERM) if self.is_uds: if Path(self.url).exists(): - self.log.debug(f'UDS socket {self.url} still exists, removing it') + self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}") Path(self.url).unlink() - # os._exit because uvicorn catches SystemExit raised by sys.exit - _exit(0) def startup(self): - self.log.info("Starting up METS server") + self.log.info(f"Configuring the Mets Server") workspace = self.workspace @@ -465,32 +472,49 @@ def save(): """ Write current changes to the file system """ - return workspace.save_mets() + workspace.save_mets() + response = Response(content="The Mets Server is writing changes to disk.", media_type='text/plain') + self.log.info(f"PUT / -> {response.__dict__}") + return response @app.delete(path='/') - async def stop(): + def stop(): """ Stop the mets server """ - getLogger('ocrd.models.ocrd_mets').info(f'Shutting down METS Server {self.url}') workspace.save_mets() + response = Response(content="The Mets Server will shut down soon...", media_type='text/plain') self.shutdown() + self.log.info(f"DELETE / -> {response.__dict__}") + return response @app.post(path='/reload') - async def workspace_reload_mets(): + def workspace_reload_mets(): """ Reload mets file from the file system """ workspace.reload_mets() - return Response(content=f'Reloaded from {workspace.directory}', media_type="text/plain") + response = Response(content=f"Reloaded from {workspace.directory}", media_type='text/plain') + self.log.info(f"POST /reload -> {response.__dict__}") + return response @app.get(path='/unique_identifier', response_model=str) async def unique_identifier(): - return Response(content=workspace.mets.unique_identifier, media_type='text/plain') + response = Response(content=workspace.mets.unique_identifier, media_type='text/plain') + self.log.info(f"GET /unique_identifier -> {response.__dict__}") + return response @app.get(path='/workspace_path', response_model=str) async def workspace_path(): - return Response(content=workspace.directory, media_type="text/plain") + response = 
Response(content=workspace.directory, media_type="text/plain") + self.log.info(f"GET /workspace_path -> {response.__dict__}") + return response + + @app.get(path='/physical_pages', response_model=OcrdPageListModel) + async def physical_pages(): + response = {'physical_pages': workspace.mets.physical_pages} + self.log.info(f"GET /physical_pages -> {response}") + return response @app.get(path='/physical_pages', response_model=OcrdPageListModel) async def physical_pages(): @@ -498,18 +522,24 @@ async def physical_pages(): @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): - return {'file_groups': workspace.mets.file_groups} + response = {'file_groups': workspace.mets.file_groups} + self.log.info(f"GET /file_groups -> {response}") + return response @app.get(path='/agent', response_model=OcrdAgentListModel) async def agents(): - return OcrdAgentListModel.create(workspace.mets.agents) + response = OcrdAgentListModel.create(workspace.mets.agents) + self.log.info(f"GET /agent -> {response.__dict__}") + return response @app.post(path='/agent', response_model=OcrdAgentModel) async def add_agent(agent: OcrdAgentModel): kwargs = agent.dict() kwargs['_type'] = kwargs.pop('type') workspace.mets.add_agent(**kwargs) - return agent + response = agent + self.log.info(f"POST /agent -> {response.__dict__}") + return response @app.get(path="/file", response_model=OcrdFileListModel) async def find_files( @@ -526,7 +556,9 @@ async def find_files( found = workspace.mets.find_all_files( fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, local_filename=local_filename, url=url ) - return OcrdFileListModel.create(found) + response = OcrdFileListModel.create(found) + self.log.info(f"GET /file -> {response.__dict__}") + return response @app.post(path='/file', response_model=OcrdFileModel) async def add_file( @@ -549,7 +581,9 @@ async def add_file( # Add to workspace kwargs = file_resource.dict() workspace.add_file(**kwargs, force=force) - return file_resource + response = file_resource + self.log.info(f"POST /file -> {response.__dict__}") + return response # ------------- # @@ -557,9 +591,6 @@ async def add_file( # Create socket and change to world-readable and -writable to avoid permission errors self.log.debug(f"chmod 0o677 {self.url}") server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - if Path(self.url).exists() and not is_socket_in_use(self.url): - # remove leftover unused socket which blocks startup - Path(self.url).unlink() server.bind(self.url) # creates the socket file atexit.register(self.shutdown) server.close() @@ -571,16 +602,5 @@ async def add_file( uvicorn_kwargs['log_config'] = None uvicorn_kwargs['access_log'] = False - self.log.debug("Starting uvicorn") + self.log.info("Starting the uvicorn Mets Server") uvicorn.run(app, **uvicorn_kwargs) - - -def is_socket_in_use(socket_path): - if Path(socket_path).exists(): - client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - try: - client.connect(socket_path) - except OSError: - return False - client.close() - return True diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 3c4c60306..95d0fec4e 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -23,6 +23,8 @@ # pylint: enable=wrong-import-position +# pylint: enable=wrong-import-position + from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, 
list_all_resources, pushd_popd, get_ocrd_tool_json diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index ab050bae5..937416f5e 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -49,11 +49,11 @@ def run_identify(self, img): for prop in ['compression', 'photometric_interpretation']: setattr(self, prop, img.info[prop] if prop in img.info else None) if img.filename: - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', img.filename], check=False, stderr=PIPE, stdout=PIPE) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', img.filename], check=False, stderr=PIPE, stdout=PIPE) else: with BytesIO() as bio: img.save(bio, format=img.format) - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) if ret.returncode: stderr = ret.stderr.decode('utf-8') if 'no decode delegate for this image format' in stderr: diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 9c7f15c88..350cf64b9 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -2,6 +2,7 @@ from json import dumps from typing import List, Optional, Tuple from ocrd.decorators.parameter_option import parameter_option, parameter_override_option +from ocrd_network.constants import JobState from ocrd_utils import DEFAULT_METS_BASENAME from ocrd_utils.introspect import set_json_key_value_overrides from ocrd_utils.str import parse_json_string_or_file @@ -104,8 +105,10 @@ def check_processing_job_status(address: Optional[str], processing_job_id: str): @click.option('--result-queue-name') @click.option('--callback-url') @click.option('--agent-type', default='worker') -@click.option('-b', '--block', default=False, +@click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') +@click.option('-p', '--print-state', default=False, is_flag=True, + help='If set, the client will print job states by each iteration.') def send_processing_job_request( address: Optional[str], processor_name: str, @@ -120,7 +123,8 @@ def send_processing_job_request( # TODO: This is temporally available to toggle # between the ProcessingWorker/ProcessorServer agent_type: Optional[str], - block: Optional[bool] + block: Optional[bool], + print_state: Optional[bool] ): """ Submit a processing job to the processing server. 
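The same submission can also be scripted against the `Client` wrapper whose changes follow; a hedged sketch (the server address, processor name, and request parameters are placeholder assumptions, not values mandated by this patch):

    from ocrd_network.client import Client

    client = Client(server_addr_processing="http://localhost:8000")
    # keys follow the Processing Server's job input schema; values are made up
    job_id = client.send_processing_job_request(
        processor_name="ocrd-dummy",
        req_params={
            "path_to_mets": "/data/ws/mets.xml",
            "input_file_grp": "OCR-D-IMG",
            "output_file_grp": "OCR-D-DUMMY",
        })
    state = client.poll_job_status(job_id=job_id, print_state=True)
    print(f"final state: {state}")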
@@ -146,7 +150,7 @@ def send_processing_job_request( assert processing_job_id print(f"Processing job id: {processing_job_id}") if block: - client.poll_job_status(job_id=processing_job_id) + client.poll_job_status(job_id=processing_job_id, print_state=print_state) @client_cli.group('workflow') @@ -176,24 +180,39 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): 'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default') @click.option('-m', '--path-to-mets', required=True) @click.option('-w', '--path-to-workflow', required=True) -@click.option('-b', '--block', default=False, +@click.option('--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs") +@click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') +@click.option('-p', '--print-state', default=False, is_flag=True, + help='If set, the client will print job states by each iteration.') def send_workflow_job_request( address: Optional[str], path_to_mets: str, path_to_workflow: str, - block: Optional[bool] + page_wise: bool, + block: bool, + print_state: bool ): """ Submit a workflow job to the processing server. """ client = Client(server_addr_processing=address) - workflow_job_id = client.send_workflow_job_request(path_to_wf=path_to_workflow, path_to_mets=path_to_mets) + workflow_job_id = client.send_workflow_job_request( + path_to_wf=path_to_workflow, + path_to_mets=path_to_mets, + page_wise=page_wise, + ) assert workflow_job_id print(f"Workflow job id: {workflow_job_id}") if block: - client.poll_workflow_status(job_id=workflow_job_id) - + print(f"Polling state of workflow job {workflow_job_id}") + state = client.poll_workflow_status(job_id=workflow_job_id, print_state=print_state) + if state != JobState.success: + print(f"Workflow failed with {state}") + exit(1) + else: + print(f"Workflow succeeded") + exit(0) @client_cli.group('workspace') def workspace_cli(): diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index 8ec8e541e..bb7cf4dbf 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -46,18 +46,21 @@ def check_job_status(self, job_id: str): def check_workflow_status(self, workflow_job_id: str): return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id) - def poll_job_status(self, job_id: str) -> str: + def poll_job_status(self, job_id: str, print_state: bool = False) -> str: return poll_job_status_till_timeout_fail_or_success( - ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait) + ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, + print_state=print_state) - def poll_workflow_status(self, job_id: str) -> str: + def poll_workflow_status(self, job_id: str, print_state: bool = False) -> str: return poll_wf_status_till_timeout_fail_or_success( - ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait) + ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, + print_state=print_state) def send_processing_job_request(self, processor_name: str, req_params: dict) -> str: return post_ps_processing_request( ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params) - def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str): + def 
send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool = False): return post_ps_workflow_request( - ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets) + ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets, + page_wise=page_wise) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 9b924c16a..4eaf4ea95 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -1,9 +1,10 @@ +import json from requests import get as request_get, post as request_post from time import sleep from .constants import JobState, NETWORK_PROTOCOLS -def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int): +def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_state: bool = False) -> JobState: if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -13,18 +14,22 @@ def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries job_state = get_ps_processing_job_status(ps_server_host, job_id) if job_type == "workflow": job_state = get_ps_workflow_job_status(ps_server_host, job_id) + if print_state: + print(f"State of the {job_type} job {job_id}: {job_state}") if job_state == JobState.success or job_state == JobState.failed: break tries -= 1 return job_state -def poll_job_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait) +def poll_job_status_till_timeout_fail_or_success( + ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_state) -def poll_wf_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait) +def poll_wf_status_till_timeout_fail_or_success( + ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_state) def get_ps_deployed_processors(ps_server_host: str): @@ -47,22 +52,21 @@ def get_ps_processing_job_log(ps_server_host: str, processing_job_id: str): return response -def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> str: +def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> JobState: request_url = f"{ps_server_host}/processor/job/{processing_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" job_state = response.json()["state"] assert job_state - return job_state - + return getattr(JobState, job_state.lower()) -def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> str: +def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> JobState: request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) assert response.status_code == 200, f"Processing server: 
{request_url}, {response.status_code}" job_state = response.json()["state"] assert job_state - return job_state + return getattr(JobState, job_state.lower()) def post_ps_processing_request(ps_server_host: str, processor: str, job_input: dict) -> str: @@ -78,9 +82,13 @@ def post_ps_processing_request(ps_server_host: str, processor: str, job_input: d return processing_job_id -# TODO: Can be extended to include other parameters such as page_wise -def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: str) -> str: - request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise=True" +def post_ps_workflow_request( + ps_server_host: str, + path_to_wf: str, + path_to_mets: str, + page_wise: bool = False, +) -> str: + request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}" response = request_post( url=request_url, headers={"accept": "application/json; charset=utf-8"}, @@ -88,8 +96,11 @@ def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: ) # print(response.json()) # print(response.__dict__) + json_resp_raw = response.text + # print(f'post_ps_workflow_request >> {response.status_code}') + # print(f'post_ps_workflow_request >> {json_resp_raw}') assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" - wf_job_id = response.json()["job_id"] + wf_job_id = json.loads(json_resp_raw)["job_id"] assert wf_job_id return wf_job_id diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 34c22e5cf..31eeca529 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -1,7 +1,7 @@ from datetime import datetime from os import getpid from pathlib import Path -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from uvicorn import run as uvicorn_run from fastapi import APIRouter, FastAPI, File, HTTPException, Request, status, UploadFile @@ -48,6 +48,7 @@ get_workflow_content, get_from_database_workspace, get_from_database_workflow_job, + kill_mets_server_zombies, parse_workflow_tasks, raise_http_exception, request_processor_server_tool_json, @@ -78,7 +79,6 @@ class ProcessingServer(FastAPI): """ def __init__(self, config_path: str, host: str, port: int) -> None: - initLogging() self.title = "OCR-D Processing Server" super().__init__( title=self.title, @@ -86,6 +86,7 @@ def __init__(self, config_path: str, host: str, port: int) -> None: on_shutdown=[self.on_shutdown], description="OCR-D Processing Server" ) + initLogging() self.log = getLogger("ocrd_network.processing_server") log_file = get_processing_server_logging_file_path(pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") @@ -155,7 +156,7 @@ def start(self) -> None: queue_names = self.deployer.find_matching_network_agents( worker_only=True, str_names_only=True, unique_only=True ) - self.log.debug(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}") + self.log.info(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}") create_message_queues(logger=self.log, rmq_publisher=self.rmq_publisher, queue_names=queue_names) self.deployer.deploy_network_agents(mongodb_url=self.mongodb_url, rabbitmq_url=self.rabbitmq_url) @@ -167,6 +168,7 @@ def start(self) -> None: uvicorn_run(self, host=self.hostname, port=int(self.port)) async def on_startup(self): + self.log.info(f"Initializing the Database on: 
{self.mongodb_url}") await initiate_database(db_url=self.mongodb_url) async def on_shutdown(self) -> None: @@ -200,6 +202,14 @@ def add_api_routes_others(self): tags=[ServerApiTags.WORKSPACE], summary="Forward a TCP request to UDS mets server" ) + others_router.add_api_route( + path="/kill_mets_server_zombies", + endpoint=self.kill_mets_server_zombies, + methods=["DELETE"], + tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], + status_code=status.HTTP_200_OK, + summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." + ) self.include_router(others_router) def add_api_routes_processing(self): @@ -320,7 +330,7 @@ async def forward_tcp_request_to_uds_mets_server(self, request: Request) -> Dict """Forward mets-server-request A processor calls a mets related method like add_file with ClientSideOcrdMets. This sends - a request to this endpoint. This request contains all infomation neccessary to make a call + a request to this endpoint. This request contains all information necessary to make a call to the uds-mets-server. This information is used by `MetsServerProxy` to make a the call to the local (local for the processing-server) reachable the uds-mets-server. """ @@ -574,26 +584,20 @@ async def _cancel_cached_dependent_jobs(self, workspace_key: str, job_id: str) - ) async def _consume_cached_jobs_of_workspace( - self, workspace_key: str, mets_server_url: str + self, workspace_key: str, mets_server_url: str, path_to_mets: str ) -> List[PYJobInput]: - - # Check whether the internal queue for the workspace key still exists - if workspace_key not in self.cache_processing_requests.processing_requests: - self.log.debug(f"No internal queue available for workspace with key: {workspace_key}") - return [] - # decrease the internal cache counter by 1 request_counter = self.cache_processing_requests.update_request_counter( workspace_key=workspace_key, by_value=-1 ) self.log.debug(f"Internal processing job cache counter value: {request_counter}") - if not len(self.cache_processing_requests.processing_requests[workspace_key]): + if (workspace_key not in self.cache_processing_requests.processing_requests or + not len(self.cache_processing_requests.processing_requests[workspace_key])): if request_counter <= 0: # Shut down the Mets Server for the workspace_key since no # more internal callbacks are expected for that workspace self.log.debug(f"Stopping the mets server: {mets_server_url}") - - self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url) + self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url, path_to_mets=path_to_mets) try: # The queue is empty - delete it @@ -609,6 +613,10 @@ async def _consume_cached_jobs_of_workspace( else: self.log.debug(f"Internal request cache is empty but waiting for {request_counter} result callbacks.") return [] + # Check whether the internal queue for the workspace key still exists + if workspace_key not in self.cache_processing_requests.processing_requests: + self.log.debug(f"No internal queue available for workspace with key: {workspace_key}") + return [] consumed_requests = await self.cache_processing_requests.consume_cached_requests(workspace_key=workspace_key) return consumed_requests @@ -643,7 +651,7 @@ async def remove_job_from_request_cache(self, result_message: PYResultMessage): raise_http_exception(self.log, status.HTTP_404_NOT_FOUND, message, error) consumed_cached_jobs = await self._consume_cached_jobs_of_workspace( - 
workspace_key=workspace_key, mets_server_url=mets_server_url + workspace_key=workspace_key, mets_server_url=mets_server_url, path_to_mets=path_to_mets ) await self.push_cached_jobs_to_agents(processing_jobs=consumed_cached_jobs) @@ -817,6 +825,10 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response + async def kill_mets_server_zombies(self, minutes_ago : Optional[int] = None, dry_run : Optional[bool] = None) -> List[int]: + pids_killed = kill_mets_server_zombies(minutes_ago=minutes_ago, dry_run=dry_run) + return pids_killed + async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: """ Simplified version of the `get_workflow_info` that returns a single state for the entire workflow. diff --git a/src/ocrd_network/processing_worker.py b/src/ocrd_network/processing_worker.py index a352ea5fd..302100743 100644 --- a/src/ocrd_network/processing_worker.py +++ b/src/ocrd_network/processing_worker.py @@ -9,12 +9,12 @@ """ from datetime import datetime -from os import getpid +from os import getpid, getppid from pika import BasicProperties from pika.adapters.blocking_connection import BlockingChannel from pika.spec import Basic -from ocrd_utils import getLogger +from ocrd_utils import getLogger, initLogging from .constants import JobState from .database import sync_initiate_database, sync_db_get_workspace, sync_db_update_processing_job, verify_database_uri from .logging_utils import ( @@ -35,14 +35,16 @@ class ProcessingWorker: def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None: + initLogging() self.log = getLogger(f'ocrd_network.processing_worker') log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") try: verify_database_uri(mongodb_addr) - self.log.debug(f'Verified MongoDB URL: {mongodb_addr}') + self.log.info(f'Verified MongoDB URL: {mongodb_addr}') self.rmq_data = verify_and_parse_mq_uri(rabbitmq_addr) + self.log.info(f'Verified RabbitMQ URL: {rabbitmq_addr}') except ValueError as error: msg = f"Failed to parse data, error: {error}" self.log.exception(msg) @@ -61,6 +63,7 @@ def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, # Gets assigned when the `connect_publisher` is called on the worker object # Used to publish OcrdResultMessage type message to the queue with name {processor_name}-result self.rmq_publisher = None + self.log.info(f"Initialized processing worker: {processor_name}") def connect_consumer(self): self.rmq_consumer = connect_rabbitmq_consumer(self.log, self.rmq_data) @@ -240,7 +243,7 @@ def publish_result_to_all(self, processing_message: OcrdProcessingMessage, resul # post the result message (callback to a user defined endpoint) post_to_callback_url(self.log, callback_url, result_message) if internal_callback_url: - self.log.info(f"Publishing result to internal callback url (Processing Server): {callback_url}") + self.log.info(f"Publishing result to internal callback url (Processing Server): {internal_callback_url}") # If the internal callback_url field is set, # post the result message (callback to Processing Server endpoint) post_to_callback_url(self.log, internal_callback_url, result_message) diff --git a/src/ocrd_network/processor_server.py b/src/ocrd_network/processor_server.py index 5aed89d72..60674afbf 100644 --- 
a/src/ocrd_network/processor_server.py +++ b/src/ocrd_network/processor_server.py @@ -42,13 +42,13 @@ class ProcessorServer(FastAPI): def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class=None): if not (processor_name or processor_class): raise ValueError("Either 'processor_name' or 'processor_class' must be provided") - initLogging() super().__init__( on_startup=[self.on_startup], on_shutdown=[self.on_shutdown], title=f"Network agent - Processor Server", description="Network agent - Processor Server" ) + initLogging() self.log = getLogger("ocrd_network.processor_server") log_file = get_processor_server_logging_file_path(processor_name=processor_name, pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") @@ -69,6 +69,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= self.processor_name = self.ocrd_tool["executable"] self.add_api_routes_processing() + self.log.info(f"Initialized processor server: {processor_name}") async def on_startup(self): await initiate_database(db_url=self.db_url) diff --git a/src/ocrd_network/rabbitmq_utils/connector.py b/src/ocrd_network/rabbitmq_utils/connector.py index 893d55a21..8fbbc84ab 100644 --- a/src/ocrd_network/rabbitmq_utils/connector.py +++ b/src/ocrd_network/rabbitmq_utils/connector.py @@ -6,6 +6,7 @@ from typing import Any, Optional, Union from pika import BasicProperties, BlockingConnection, ConnectionParameters, PlainCredentials from pika.adapters.blocking_connection import BlockingChannel +from ocrd_utils import config from .constants import ( DEFAULT_EXCHANGER_NAME, DEFAULT_EXCHANGER_TYPE, @@ -69,8 +70,7 @@ def open_blocking_connection( port=port, virtual_host=vhost, credentials=credentials, - # TODO: The heartbeat should not be disabled (0)! - heartbeat=0 + heartbeat=config.OCRD_NETWORK_RABBITMQ_HEARTBEAT ), ) return blocking_connection diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index b956904d0..919d5b97c 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -8,7 +8,7 @@ """ from __future__ import annotations from pathlib import Path -from subprocess import Popen, run as subprocess_run +import psutil from time import sleep from typing import Dict, List, Union @@ -30,6 +30,8 @@ def __init__(self, config_path: str) -> None: self.data_hosts: List[DataHost] = parse_hosts_data(ps_config["hosts"]) self.internal_callback_url = ps_config.get("internal_callback_url", None) self.mets_servers: Dict = {} # {"mets_server_url": "mets_server_pid"} + # This is required to store UDS urls that are multiplexed through the TCP proxy and are not preserved anywhere + self.mets_servers_paths: Dict = {} # {"ws_dir_path": "mets_server_url"} self.use_tcp_mets = ps_config.get("use_tcp_mets", False) # TODO: Reconsider this. @@ -146,25 +148,33 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: if is_mets_server_running(mets_server_url=str(mets_server_url)): self.log.debug(f"The UDS mets server for {ws_dir_path} is already started: {mets_server_url}") return mets_server_url + elif Path(mets_server_url).is_socket(): + self.log.warning( + f"The UDS mets server for {ws_dir_path} is not running but the socket file exists: {mets_server_url}." 
+ "Removing to avoid any weird behavior before starting the server.") + Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") - pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file) - self.mets_servers[mets_server_url] = pid + pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url), ws_dir_path=str(ws_dir_path), log_file=str(log_file)) + self.mets_servers[str(mets_server_url)] = pid + self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url - def stop_uds_mets_server(self, mets_server_url: str, stop_with_pid: bool = False) -> None: + def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str) -> None: self.log.info(f"Stopping UDS mets server: {mets_server_url}") - if stop_with_pid: - if Path(mets_server_url) not in self.mets_servers: - message = f"UDS Mets server not found at URL: {mets_server_url}" - self.log.exception(message) - raise Exception(message) - mets_server_pid = self.mets_servers[Path(mets_server_url)] - OcrdMetsServer.kill_process(mets_server_pid=mets_server_pid) - return - # TODO: Reconsider this again - # Not having this sleep here causes connection errors - # on the last request processed by the processing worker. - # Sometimes 3 seconds is enough, sometimes not. - sleep(5) - stop_mets_server(mets_server_url=mets_server_url) + self.log.info(f"Path to the mets file: {path_to_mets}") + self.log.debug(f"mets_server: {self.mets_servers}") + self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") + workspace_path = str(Path(path_to_mets).parent) + mets_server_url_uds = self.mets_servers_paths[workspace_path] + mets_server_pid = self.mets_servers[mets_server_url_uds] + self.log.info(f"Terminating mets server with pid: {mets_server_pid}") + p = psutil.Process(mets_server_pid) + stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=workspace_path) + if p.is_running(): + p.wait() + self.log.info(f"Terminated mets server with pid: {mets_server_pid}") + else: + self.log.info(f"Mets server with pid: {mets_server_pid} has already terminated.") + del self.mets_servers_paths[workspace_path] + del self.mets_servers[mets_server_url_uds] return diff --git a/src/ocrd_network/server_cache.py b/src/ocrd_network/server_cache.py index b57f3fd23..179a76139 100644 --- a/src/ocrd_network/server_cache.py +++ b/src/ocrd_network/server_cache.py @@ -31,7 +31,7 @@ def check_if_locked_pages_for_output_file_grps( self, workspace_key: str, output_file_grps: List[str], page_ids: List[str] ) -> bool: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") return False debug_message = f"Caching the received request due to locked output file grp pages." 
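            # Illustrative sketch (hypothetical helper, not part of this diff) of
            # the decision made here: a request must be cached when any of its
            # pages is already locked for one of its output fileGrps, where the
            # "all pages" placeholder locks the entire group.
            #
            #   def _is_blocked(locked, file_group, page_ids, placeholder):
            #       locked_pages = locked.get(file_group, [])
            #       if placeholder in locked_pages:
            #           return True
            #       return any(page_id in locked_pages for page_id in page_ids)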
diff --git a/src/ocrd_network/server_cache.py b/src/ocrd_network/server_cache.py
index b57f3fd23..179a76139 100644
--- a/src/ocrd_network/server_cache.py
+++ b/src/ocrd_network/server_cache.py
@@ -31,7 +31,7 @@ def check_if_locked_pages_for_output_file_grps(
         self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]
     ) -> bool:
         if not self.locked_pages.get(workspace_key, None):
-            self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
+            self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
             return False
         debug_message = f"Caching the received request due to locked output file grp pages."
         for file_group in output_file_grps:
@@ -46,46 +46,45 @@ def check_if_locked_pages_for_output_file_grps(
 
     def get_locked_pages(self, workspace_key: str) -> Dict[str, List[str]]:
         if not self.locked_pages.get(workspace_key, None):
-            self.log.debug(f"No locked pages available for workspace key: {workspace_key}")
+            self.log.info(f"No locked pages available for workspace key: {workspace_key}")
             return {}
         return self.locked_pages[workspace_key]
 
     def lock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None:
         if not self.locked_pages.get(workspace_key, None):
-            self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
-            self.log.debug(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}")
+            self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
+            self.log.info(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}")
             self.locked_pages[workspace_key] = {}
         for file_group in output_file_grps:
             if file_group not in self.locked_pages[workspace_key]:
-                self.log.debug(f"Creating an empty list for output file grp: {file_group}")
+                self.log.info(f"Creating an empty list for output file grp: {file_group}")
                 self.locked_pages[workspace_key][file_group] = []
             # The page id list is not empty - only some pages are in the request
             if page_ids:
-                self.log.debug(f"Locking pages for '{file_group}': {page_ids}")
+                self.log.info(f"Locking pages for '{file_group}': {page_ids}")
                 self.locked_pages[workspace_key][file_group].extend(page_ids)
-                self.log.debug(f"Locked pages of '{file_group}': "
-                               f"{self.locked_pages[workspace_key][file_group]}")
+                self.log.info(f"Locked pages of '{file_group}': {self.locked_pages[workspace_key][file_group]}")
             else:
                 # Lock all pages with a single value
-                self.log.debug(f"Locking pages for '{file_group}': {self.placeholder_all_pages}")
+                self.log.info(f"Locking pages for '{file_group}': {self.placeholder_all_pages}")
                 self.locked_pages[workspace_key][file_group].append(self.placeholder_all_pages)
 
     def unlock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None:
         if not self.locked_pages.get(workspace_key, None):
-            self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
+            self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
             return
         for file_group in output_file_grps:
             if file_group in self.locked_pages[workspace_key]:
                 if page_ids:
                     # Unlock the previously locked pages
-                    self.log.debug(f"Unlocking pages of '{file_group}': {page_ids}")
+                    self.log.info(f"Unlocking pages of '{file_group}': {page_ids}")
                     self.locked_pages[workspace_key][file_group] = \
                         [x for x in self.locked_pages[workspace_key][file_group] if x not in page_ids]
-                    self.log.debug(f"Remaining locked pages of '{file_group}': "
-                                   f"{self.locked_pages[workspace_key][file_group]}")
+                    self.log.info(f"Remaining locked pages of '{file_group}': "
+                                  f"{self.locked_pages[workspace_key][file_group]}")
                 else:
                     # Remove the single variable used to indicate all pages are locked
-                    self.log.debug(f"Unlocking all pages for: {file_group}")
+                    self.log.info(f"Unlocking all pages for: {file_group}")
                     self.locked_pages[workspace_key][file_group].remove(self.placeholder_all_pages)
 
@@ -127,11 +126,11 @@ def __print_job_input_debug_message(self, job_input: PYJobInput):
         debug_message += f", page ids: {job_input.page_id}"
         debug_message += f", job id: {job_input.job_id}"
         debug_message += f", job depends on: {job_input.depends_on}"
-        self.log.debug(debug_message)
+        self.log.info(debug_message)
 
     async def consume_cached_requests(self, workspace_key: str) -> List[PYJobInput]:
         if not self.has_workspace_cached_requests(workspace_key=workspace_key):
-            self.log.debug(f"No jobs to be consumed for workspace key: {workspace_key}")
+            self.log.info(f"No jobs to be consumed for workspace key: {workspace_key}")
             return []
         found_consume_requests = []
         for current_element in self.processing_requests[workspace_key]:
@@ -165,25 +164,27 @@ def update_request_counter(self, workspace_key: str, by_value: int) -> int:
         # If a record counter of this workspace key does not exist
         # in the requests counter cache yet, create one and assign 0
         if not self.processing_counter.get(workspace_key, None):
-            self.log.debug(f"Creating an internal request counter for workspace key: {workspace_key}")
+            self.log.info(f"Creating an internal request counter for workspace key: {workspace_key}")
             self.processing_counter[workspace_key] = 0
         self.processing_counter[workspace_key] = self.processing_counter[workspace_key] + by_value
+        self.log.info(f"The new request counter of {workspace_key}: {self.processing_counter[workspace_key]}")
         return self.processing_counter[workspace_key]
 
     def cache_request(self, workspace_key: str, data: PYJobInput):
         # If a record queue of this workspace key does not exist in the requests cache
         if not self.processing_requests.get(workspace_key, None):
-            self.log.debug(f"Creating an internal request queue for workspace_key: {workspace_key}")
+            self.log.info(f"Creating an internal request queue for workspace_key: {workspace_key}")
             self.processing_requests[workspace_key] = []
         self.__print_job_input_debug_message(job_input=data)
         # Add the processing request to the end of the internal queue
+        self.log.info(f"Caching a processing request of {workspace_key}: {data.job_id}")
         self.processing_requests[workspace_key].append(data)
 
     async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str) -> List[PYJobInput]:
         if not self.has_workspace_cached_requests(workspace_key=workspace_key):
-            self.log.debug(f"No jobs to be cancelled for workspace key: {workspace_key}")
+            self.log.info(f"No jobs to be cancelled for workspace key: {workspace_key}")
             return []
-        self.log.debug(f"Cancelling jobs dependent on job id: {processing_job_id}")
+        self.log.info(f"Cancelling jobs dependent on job id: {processing_job_id}")
         found_cancel_requests = []
         for i, current_element in enumerate(self.processing_requests[workspace_key]):
             if processing_job_id in current_element.depends_on:
@@ -192,7 +193,7 @@ async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str)
         for cancel_element in found_cancel_requests:
             try:
                 self.processing_requests[workspace_key].remove(cancel_element)
-                self.log.debug(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'")
+                self.log.info(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'")
                 cancelled_jobs.append(cancel_element)
                 await db_update_processing_job(job_id=cancel_element.job_id, state=JobState.cancelled)
                 # Recursively cancel dependent jobs for the cancelled job
@@ -225,9 +226,11 @@ async def sync_is_caching_required(self, job_dependencies: List[str]) -> bool:
 
     def has_workspace_cached_requests(self, workspace_key: str) -> bool:
         if not self.processing_requests.get(workspace_key, None):
-            self.log.debug(f"In processing requests cache, no workspace key found: {workspace_key}")
+            self.log.info(f"In processing requests cache, no workspace key found: {workspace_key}")
             return False
         if not len(self.processing_requests[workspace_key]):
-            self.log.debug(f"The processing requests cache is empty for workspace key: {workspace_key}")
+            self.log.info(f"The processing requests cache is empty for workspace key: {workspace_key}")
             return False
+        self.log.info(f"The processing requests cache has {len(self.processing_requests[workspace_key])} "
+                      f"entries for workspace key: {workspace_key}")
         return True
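
The locked-pages cache managed by these methods boils down to a nested dict with a placeholder value meaning "all pages". A stripped-down sketch of that data structure (the placeholder value and the function names here are illustrative, not the actual class internals):

    from typing import Dict, List

    ALL_PAGES = "all_pages"
    # workspace key -> output file group -> locked page ids
    locked: Dict[str, Dict[str, List[str]]] = {}

    def lock(workspace: str, file_group: str, page_ids: List[str]) -> None:
        grp = locked.setdefault(workspace, {}).setdefault(file_group, [])
        # an empty page_ids list means: lock the whole file group
        grp.extend(page_ids or [ALL_PAGES])

    def is_locked(workspace: str, file_group: str, page_id: str) -> bool:
        grp = locked.get(workspace, {}).get(file_group, [])
        return ALL_PAGES in grp or page_id in grp
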
diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py
index 9d8628170..6e485f261 100644
--- a/src/ocrd_network/server_utils.py
+++ b/src/ocrd_network/server_utils.py
@@ -1,12 +1,18 @@
+import os
+import re
+import signal
+import sys
+from pathlib import Path
+from json import dumps, loads
+from urllib.parse import urljoin
+from typing import Dict, List, Optional, Union
+from time import time
+
 from fastapi import HTTPException, status, UploadFile
 from fastapi.responses import FileResponse
 from httpx import AsyncClient, Timeout
-from json import dumps, loads
 from logging import Logger
-from pathlib import Path
 from requests import get as requests_get
-from typing import Dict, List, Union
-from urllib.parse import urljoin
 from ocrd.resolver import Resolver
 from ocrd.task_sequence import ProcessorTask
@@ -241,3 +247,33 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: str,
         if group not in available_groups:
             message = f"Input file group '{group}' of the first processor not found: {input_file_grps}"
             raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message)
+
+
+def kill_mets_server_zombies(minutes_ago: Optional[int], dry_run: Optional[bool]) -> List[int]:
+    if minutes_ago is None:
+        minutes_ago = 90
+    if dry_run is None:
+        dry_run = False
+
+    now = time()
+    cmdline_pat = r'.*ocrd workspace -U.*server start $'
+    ret = []
+    for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime):
+        if not procdir.is_dir():
+            continue
+        cmdline_file = procdir.joinpath('cmdline')
+        if not cmdline_file.is_file():
+            continue
+        ctime_ago = int((now - procdir.stat().st_ctime) / 60)
+        if ctime_ago < minutes_ago:
+            continue
+        cmdline = cmdline_file.read_text().replace('\x00', ' ')
+        if re.match(cmdline_pat, cmdline):
+            pid = int(procdir.name)
+            ret.append(pid)
+            print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, '
+                  f'more than {minutes_ago}, so killing (cmdline="{cmdline}")', file=sys.stderr)
+            if dry_run:
+                print(f'[dry_run is active] kill {pid}')
+            else:
+                os.kill(pid, signal.SIGTERM)
+    return ret
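
Note the trailing space in cmdline_pat: /proc/<pid>/cmdline is NUL-separated, so replacing '\x00' with ' ' leaves one trailing space, which the pattern has to match before the final $. A quick self-contained check (the sample command line is made up):

    import re

    cmdline_pat = r'.*ocrd workspace -U.*server start $'
    raw = b'ocrd\x00workspace\x00-U\x00/tmp/ws.sock\x00server\x00start\x00'
    cmdline = raw.decode().replace('\x00', ' ')
    assert re.match(cmdline_pat, cmdline)  # matches only with the trailing space
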
diff --git a/src/ocrd_network/tcp_to_uds_mets_proxy.py b/src/ocrd_network/tcp_to_uds_mets_proxy.py
index 176f4f144..3f335435a 100644
--- a/src/ocrd_network/tcp_to_uds_mets_proxy.py
+++ b/src/ocrd_network/tcp_to_uds_mets_proxy.py
@@ -1,5 +1,5 @@
 from requests_unixsocket import Session as requests_unixsocket_session
-from .utils import get_uds_path
+from .utils import get_uds_path, convert_url_to_uds_format
 from typing import Dict
 from ocrd_utils import getLogger
@@ -31,9 +31,13 @@ def forward_tcp_request(self, request_body) -> Dict:
         if method_type not in SUPPORTED_METHOD_TYPES:
             raise NotImplementedError(f"Method type: {method_type} not recognized")
         ws_socket_file = str(get_uds_path(ws_dir_path=ws_dir_path))
-        ws_unix_socket_url = f'http+unix://{ws_socket_file.replace("/", "%2F")}'
+        ws_unix_socket_url = convert_url_to_uds_format(ws_socket_file)
         uds_request_url = f"{ws_unix_socket_url}/{request_url}"
+        self.log.info(f"Forwarding TCP mets server request to UDS url: {uds_request_url}")
+        self.log.info(f"Forwarding method type {method_type}, request data: {request_data}, "
+                      f"expected response type: {response_type}")
+
         if not request_data:
             response = self.session.request(method_type, uds_request_url)
         elif "params" in request_data:
@@ -45,12 +49,11 @@ def forward_tcp_request(self, request_body) -> Dict:
         else:
             raise ValueError("Expecting request_data to be empty or containing single key: params, "
                              f"form, or class but not {request_data.keys()}")
-
+        if response_type == "empty":
+            return {}
         if not response:
             self.log.error(f"Uds-Mets-Server gives unexpected error. Response: {response.__dict__}")
             return {"error": response.text}
-        elif response_type == "empty":
-            return {}
         elif response_type == "text":
             return {"text": response.text}
         elif response_type == "class" or response_type == "dict":
diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py
index a2f563de4..5abe2104f 100644
--- a/src/ocrd_network/utils.py
+++ b/src/ocrd_network/utils.py
@@ -4,6 +4,7 @@
 from functools import wraps
 from hashlib import md5
 from json import loads
+from logging import Logger
 from pathlib import Path
 from re import compile as re_compile, split as re_split
 from requests import get as requests_get, Session as Session_TCP
@@ -151,22 +152,25 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> bool:
     return False
 
-def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool:
+def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) -> bool:
     protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds"
-    session = Session_TCP() if protocol == "tcp" else Session_UDS()
-    if protocol == "uds":
-        mets_server_url = convert_url_to_uds_format(mets_server_url)
-    try:
-        if 'tcp_mets' in mets_server_url:
-            if not ws_dir_path:
-                return False
-            response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(ws_dir_path))
-        else:
-            response = session.delete(url=f"{mets_server_url}/")
-    except Exception:
-        return False
-    return response.status_code == 200
-
+    # If the mets server URL is the proxy endpoint
+    if protocol == "tcp" and "tcp_mets" in mets_server_url:
+        # Convert the mets server url to UDS format
+        ws_socket_file = str(get_uds_path(ws_dir_path))
+        mets_server_url = convert_url_to_uds_format(ws_socket_file)
+        protocol = "uds"
+    if protocol == "tcp":
+        request_json = MpxReq.stop(ws_dir_path)
+        logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}")
+        response = Session_TCP().post(url=f"{mets_server_url}", json=request_json)
+        return response.status_code == 200
+    elif protocol == "uds":
+        logger.info(f"Sending DELETE request to: {mets_server_url}/")
+        response = Session_UDS().delete(url=f"{mets_server_url}/")
+        return response.status_code == 200
+    else:
+        raise ValueError(f"Unexpected protocol type: {protocol}")
 
 def get_uds_path(ws_dir_path: str) -> Path:
     return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock")
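
convert_url_to_uds_format() centralizes the http+unix:// URL scheme that requests_unixsocket expects, in which the socket path is percent-encoded into the authority part. A sketch equivalent to the inline expression the proxy used before this change (the helper's actual body may differ):

    def to_uds_url(socket_path: str) -> str:
        # /run/ocrd/ws.sock -> http+unix://%2Frun%2Focrd%2Fws.sock
        return 'http+unix://' + socket_path.replace('/', '%2F')

    print(to_uds_url('/run/ocrd/ws.sock'))  # the path is illustrative
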
diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py
index 36399870e..c5f1e1667 100644
--- a/src/ocrd_utils/config.py
+++ b/src/ocrd_utils/config.py
@@ -231,7 +231,7 @@ def _ocrd_download_timeout_parser(val):
 config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP",
            description="How many seconds to sleep before trying again.",
            parser=int,
-           default=(True, 30))
+           default=(True, 10))
 
 config.add("OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
            description="Timeout for a blocking ocrd network client (in seconds).",
@@ -247,9 +247,19 @@ def _ocrd_download_timeout_parser(val):
            default=(True, ''))
 
 config.add("OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS",
-           description="Number of attempts for a RabbitMQ client to connect before failing.",
+           description="Number of attempts for a RabbitMQ client to connect before failing.",
+           parser=int,
+           default=(True, 3))
+
+config.add(
+    name="OCRD_NETWORK_RABBITMQ_HEARTBEAT",
+    description="""
+    Controls the AMQP heartbeat timeout (in seconds) negotiated during connection tuning. An integer value always
+    overrides the value proposed by the broker. Use 0 to deactivate heartbeats.
+    """,
     parser=int,
-    default=(True, 3))
+    default=(True, 0)
+)
 
 config.add(name="OCRD_NETWORK_SOCKETS_ROOT_DIR",
            description="The root directory where all mets server related socket files are created",
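
Downstream, the new heartbeat setting ends up in pika's connection tuning (cf. the connector change earlier in this series). A minimal sketch, assuming a RabbitMQ broker on localhost with default credentials:

    from pika import BlockingConnection, ConnectionParameters
    from ocrd_utils import config

    params = ConnectionParameters(
        host='localhost',
        # 0 keeps the previous behavior (heartbeats disabled); a positive value
        # proposes that timeout, in seconds, during connection tuning
        heartbeat=config.OCRD_NETWORK_RABBITMQ_HEARTBEAT,
    )
    connection = BlockingConnection(params)
    connection.close()
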
""", parser=int, - default=(True, 3) + default=(True, 0) ) test_config.add( diff --git a/tests/network/test_modules_mets_server_proxy.py b/tests/network/test_modules_mets_server_proxy.py index 8b8c0d35f..f19d7e415 100644 --- a/tests/network/test_modules_mets_server_proxy.py +++ b/tests/network/test_modules_mets_server_proxy.py @@ -119,7 +119,7 @@ def test_find_files(start_uds_mets_server): {"file_grp": test_file_group} ) response_dict = MetsServerProxy().forward_tcp_request(request_body=request_body) - assert len(response_dict["files"]) == 3, "Expected to find exatly 3 matching files" + assert len(response_dict["files"]) == 3, "Expected to find exactly 3 matching files" request_body = MpxReq.find_files( TEST_WORKSPACE_DIR, {"file_grp": test_non_existing_file_group} diff --git a/tests/test_resolver.py b/tests/test_resolver.py index c2575b608..97d2ee665 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -118,7 +118,7 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path): @patch.object(Session, "get") def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path): """ - Fail with clobber_mets=False, succeeed with clobber_mets=True + Fail with clobber_mets=False, succeed with clobber_mets=True """ # arrange diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index 653167e10..286f6ea6b 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -80,7 +80,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): assert mgr.userdir == tmp_path -def test_resources_manager_config_explicite(tmp_path): +def test_resources_manager_config_explicit(tmp_path): # act from ocrd.resource_manager import OcrdResourceManager From 358d40630a948d2ad1a35af94ba95fb5b6ab74ef Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 8 Jan 2025 19:56:15 +0100 Subject: [PATCH 245/249] :memo: changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bacd5aeb..b3ab85de3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
From 358d40630a948d2ad1a35af94ba95fb5b6ab74ef Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 19:56:15 +0100
Subject: [PATCH 245/249] :memo: changelog

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6bacd5aeb..b3ab85de3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+Changed:
+
+  - Merge v2 master into new-processor-api
+
+
 ## [3.0.0b7] - 2024-11-12
 
 Fixed:

From 75ce41507fa3d1259ce7d3c17ef2e41fce00dae1 Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 19:56:41 +0100
Subject: [PATCH 246/249] :memo: changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b3ab85de3..f49b95433 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ Changed:
 
   - Merge v2 master into new-processor-api
 
+Fixed:
+
+  - `ocrd --help` output was broken for multiline config options, bertsky/core#25
 
 ## [3.0.0b7] - 2024-11-12

From ff2a73bd04dd8164c95c2931d6ee8c5eb528dadc Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 19:57:54 +0100
Subject: [PATCH 247/249] :memo: changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f49b95433..8fcb0c54d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ Changed:
 Fixed:
 
   - `ocrd --help` output was broken for multiline config options, bertsky/core#25
+  - Call `initLogging` before instantiating processors in `ocrd_cli_wrap_processor`, bertsky/core#24, #1296
 
 ## [3.0.0b7] - 2024-11-12

From e59222abe5735b99eca65d46c1a2b4a1ba3d5ec4 Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 20:06:21 +0100
Subject: [PATCH 248/249] :memo: changelog

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fcb0c54d..fc9170522 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,11 +8,19 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 Changed:
 
   - Merge v2 master into new-processor-api
+  - PAGE API: Update to latest generateDS 2.44.1, bertsky/core#21
 
 Fixed:
 
   - `ocrd --help` output was broken for multiline config options, bertsky/core#25
   - Call `initLogging` before instantiating processors in `ocrd_cli_wrap_processor`, bertsky/core#24, #1296
+  - PAGE API: Fully reversible mapping from/to XML element/generateDS instances, bertsky/core#21
+
+Added:
+
+  - `ocrd-filter` processor to remove segments based on XPath expressions, bertsky/core#21
+  - XPath function `pc:pixelarea` for the number of pixels of the bounding box (or the summed area on node sets), bertsky/core#21
+  - XPath function `pc:textequiv` for the first TextEquiv unicode string (or the concatenated string on node sets), bertsky/core#21
 
 ## [3.0.0b7] - 2024-11-12

From 68786a671a5615ae5559ca00a0a307df30ae25ef Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 20:43:46 +0100
Subject: [PATCH 249/249] remove 3.8 breaking typing hints

---
 src/ocrd/cli/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py
index 6f37858ec..794538752 100644
--- a/src/ocrd/cli/__init__.py
+++ b/src/ocrd/cli/__init__.py
@@ -12,7 +12,7 @@
 
 # pylint: disable=wrong-import-position
 
-def command_with_replaced_help(*replacements: tuple[str, str]):
+def command_with_replaced_help(*replacements):
 
     class CommandWithReplacedHelp(click.Command):
         def get_help(self, ctx):