From a858f1b9d8b72ea5758ef2b49fe0b8b27d34765b Mon Sep 17 00:00:00 2001 From: wrznr Date: Mon, 11 May 2020 20:14:37 +0200 Subject: [PATCH 1/4] Allow local (file) paths as source for linked full texts OCR-D produces METS which does not use URLs for the linkage of external files but rather file paths. This commit introduces file paths as an additional option for retrieving full texts from external ALTO files. In addition, the full text file group may be specified (rather than using a fixed `FULLTEXT` file group). To deal with different versions of ALTO, a proposal by @kba has been implemented which involves brute-force namespace renaming. The concept of using the same namespace for different, non-compatible XML versions is a PITA. Fixes #41 --- Changelog | 15 +++++++++++++-- mets_mods2tei/api/alto.py | 18 +++++++----------- mets_mods2tei/api/mets.py | 25 +++++++++++++++++++------ mets_mods2tei/api/tei.py | 12 +++++++++++- mets_mods2tei/scripts/mets_mods2tei.py | 7 +++++-- setup.py | 2 +- tests/test_mets.py | 25 ++++++++++++++++++++++++- 7 files changed, 80 insertions(+), 24 deletions(-) diff --git a/Changelog b/Changelog index df81b1c..9d88fb3 100644 --- a/Changelog +++ b/Changelog @@ -4,17 +4,28 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.1.1] - 2020-05-11 ### Added - Treat nested AMD-type (non-logical) divs in logical struct map (i.e. 
newspaper case) +- Make full text file group selectable by user +- Allow for file entries (in addition to URLs) in METS +- Add special treatment for URNs and VD IDs +- Add poor man's namespace versioning handling ### Changed - Make extraction of subtitles conditional on their presence +- Use "licence" for all types of licences (even unknown ones) + +### Fixed +- https://github.com/slub/mets-mods2tei/issues/28 +- https://github.com/slub/mets-mods2tei/issues/37 +- https://github.com/slub/mets-mods2tei/issues/39 +- https://github.com/slub/mets-mods2tei/issues/41 ## [0.1.0] - 2019-12-04 ### Added -- Correctly Place structures which are not on top of a page +- Correctly place structures which are not on top of a page - Set `corresp` and `facs` attributes of `pb` elements - Store links to `DEFAULT` images in METS - Tests for new functionality diff --git a/mets_mods2tei/api/alto.py b/mets_mods2tei/api/alto.py index 6cf3d4a..bf27026 100644 --- a/mets_mods2tei/api/alto.py +++ b/mets_mods2tei/api/alto.py @@ -4,15 +4,18 @@ import os import logging +import re import Levenshtein ns = { 'xlink' : "http://www.w3.org/1999/xlink", - 'alto': "http://www.loc.gov/standards/alto/ns-v2#", + 'alto': "http://www.loc.gov/standards/alto/ns-v4#", } XLINK = "{%s}" % ns['xlink'] ALTO = "{%s}" % ns['alto'] +norm_alto_ns_re = re.compile("alto/ns-v.#") + class Alto: def __init__(self): @@ -47,7 +50,7 @@ def read(cls, source): if hasattr(source, 'read'): return cls.fromfile(source) if os.path.exists(source): - return cls.fromfile(source) + return cls.fromfile(open(source)) @classmethod def fromfile(cls, path): @@ -65,7 +68,7 @@ def _fromfile(self, path): :param str path: Path to a ALTO document. 
""" parser = etree.XMLParser(remove_blank_text=True) - self.tree = etree.parse(path, parser) + self.tree = etree.XML(bytes(norm_alto_ns_re.sub("alto/ns-v4#", path.read()), "utf-8"), parser) self.path = path def get_text_blocks(self): @@ -88,14 +91,7 @@ def get_text_in_line(self, line): Returns the ALTO-encoded text . :param Element line: The line to extract the text from. """ - line_text = "" - for element in line.xpath("./alto:String|./alto:SP", namespaces=ns): - if element.tag == "%sString" % ALTO: - line_text += element.get("CONTENT") - elif element.tag == "%sSP" % ALTO: - line_text += " " - #line_text += "\n" - return line_text + return " ".join(element.get("CONTENT") for element in line.xpath("./alto:String", namespaces=ns)) def __compute_fuzzy_distance(self, text1, text2): """ diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 7b1de81..6cf8f88 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -55,6 +55,7 @@ def __init__(self): self.img_map = {} self.alto_map = {} self.struct_links = {} + self.fulltext_group_name = 'FULLTEXT' self.title = None self.sub_titles = None @@ -87,21 +88,21 @@ def read(cls, source): :param source: METS (file) source. """ if hasattr(source, 'read'): - return cls.fromfile(source) + return cls.from_file(source) if os.path.exists(source): - return cls.fromfile(source) + return cls.from_file(source) @classmethod - def fromfile(cls, path): + def from_file(cls, path): """ Reads in METS from a given file source. :param str path: Path to a METS document. """ i = cls() - i.__fromfile(path) + i.fromfile(path) return i - def __fromfile(self, path): + def fromfile(self, path): """ Reads in METS from a given file source. :param str path: Path to a METS document. 
@@ -271,7 +272,7 @@ def __spur(self): # fulltext fulltext_map = {} - fulltext_group = self.tree.xpath("//mets:fileGrp[@USE='FULLTEXT']", namespaces=ns) + fulltext_group = self.tree.xpath("//mets:fileGrp[@USE='%s']" % self.fulltext_group_name, namespaces=ns) if fulltext_group: fulltext_map = {} for entry in fulltext_group[0].xpath("./mets:file", namespaces=ns): @@ -300,6 +301,18 @@ def __spur(self): self.struct_links[sm_link.get("%sfrom" % XLINK)] = [] self.struct_links[sm_link.get("%sfrom" % XLINK)].append(sm_link.get("%sto" % XLINK)) + @property + def fulltext_group_name(self): + """ + Return the currently configured full-text-related + file group use attribute. + """ + return self.__fulltext_group_name + + @fulltext_group_name.setter + def fulltext_group_name(self, fulltext_use): + self.__fulltext_group_name = fulltext_use + def get_main_title(self): """ Return the main title of the work. diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 4aa9944..b792057 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -7,6 +7,7 @@ import copy from urllib.request import urlopen +from urllib.parse import urlparse from pkg_resources import resource_filename, Requirement from .alto import Alto @@ -602,7 +603,15 @@ def __add_ocr_to_node(self, node, mets): alto_link = mets.get_alto(struct_link) # only collect ocr from a file once! 
if not alto_link in self.alto_map: - f = urlopen(alto_link) + try: + sections = urlparse(alto_link) + except: + continue + if sections.scheme and sections.netloc: + f = urlopen(alto_link) + elif sections.path: + f = open(alto_link) + alto = Alto.read(f) self.alto_map[alto_link] = alto @@ -611,6 +620,7 @@ def __add_ocr_to_node(self, node, mets): pb.set("corresp", mets.get_img(struct_link)) for text_block in alto.get_text_blocks(): + self.logger.debug("HERE") p = etree.SubElement(node, "%sp" % TEI) for line in alto.get_lines_in_text_block(text_block): lb = etree.SubElement(p, "%slb" % TEI) diff --git a/mets_mods2tei/scripts/mets_mods2tei.py b/mets_mods2tei/scripts/mets_mods2tei.py index 1a2e199..28a3bdc 100644 --- a/mets_mods2tei/scripts/mets_mods2tei.py +++ b/mets_mods2tei/scripts/mets_mods2tei.py @@ -12,8 +12,9 @@ @click.command() @click.argument('mets', required=True) @click.option('-o', '--ocr', is_flag=True, default=False, help="Serialize OCR into resulting TEI") +@click.option('-T', '--text-group', default="FULLTEXT", help="File group which contains the full text") @click.option('-l', '--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'OFF']), default='WARN') -def cli(mets, ocr, log_level): +def cli(mets, ocr, text_group, log_level): """ METS: File containing or URL pointing to the METS/MODS XML to be converted """ # @@ -29,7 +30,9 @@ def cli(mets, ocr, log_level): # # read in METS - mets = Mets.read(f) + mets = Mets() + mets.fulltext_group_name = text_group + mets.fromfile(f) # # create TEI (from skeleton) diff --git a/setup.py b/setup.py index b9bd5d9..26dd4a7 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='mets-mods2tei', - version='0.1.0', + version='0.1.1', description='Convert digital documents in METS/MODS format to TEI', long_description=open('README.md').read(), long_description_content_type="text/markdown", diff --git a/tests/test_mets.py b/tests/test_mets.py index 710bef1..bf71438 100644 --- a/tests/test_mets.py +++ 
b/tests/test_mets.py @@ -47,9 +47,32 @@ def test_loading_local_file(datadir): Test loading a local mets file ''' f = open(datadir.join('test_mets.xml')) - mets = Mets.fromfile(f) + mets = Mets.from_file(f) assert(mets.mets is not None) +def test_intermediate_file_loading(datadir): + ''' + Test loading a local mets file + ''' + f = open(datadir.join('test_mets.xml')) + mets = Mets() + mets.fromfile(f) + assert(mets.mets is not None) + +def test_fulltext_group_name(subtests, datadir): + ''' + Test getting and setting the full text group name + ''' + f = open(datadir.join('test_mets.xml')) + mets = Mets.read(f) + + with subtests.test("Check getter"): + assert(mets.fulltext_group_name == "FULLTEXT") + + with subtests.test("Check setter"): + mets.fulltext_group_name = "TEXT" + assert(mets.fulltext_group_name == "TEXT") + def test_mappings(subtests, datadir): ''' Test the correct interpretation of the structural linking From fb14b740f75f0b0b9653ebdfe289cebf3f3a9531 Mon Sep 17 00:00:00 2001 From: wrznr Date: Tue, 12 May 2020 16:40:39 +0200 Subject: [PATCH 2/4] Close file and URL handles As @kba correctly pointed out, once opened, file and URL handles should be closed at some point. This commit adds the necessary contexts. Since we do not know whether the METS contains URLs or file paths, we cannot know how to correctly close them. As a workaround, `urlopen` is now also used to read OCR-D like file paths. 
--- mets_mods2tei/api/alto.py | 7 ++++--- mets_mods2tei/api/mets.py | 9 ++++----- mets_mods2tei/api/tei.py | 18 ++++++++++++------ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/mets_mods2tei/api/alto.py b/mets_mods2tei/api/alto.py index bf27026..e1a2cec 100644 --- a/mets_mods2tei/api/alto.py +++ b/mets_mods2tei/api/alto.py @@ -14,7 +14,7 @@ XLINK = "{%s}" % ns['xlink'] ALTO = "{%s}" % ns['alto'] -norm_alto_ns_re = re.compile("alto/ns-v.#") +norm_alto_ns_re = re.compile(rb'alto/ns-v.#') class Alto: @@ -50,7 +50,8 @@ def read(cls, source): if hasattr(source, 'read'): return cls.fromfile(source) if os.path.exists(source): - return cls.fromfile(open(source)) + with open(source, 'rb') as f: + return cls.fromfile(f) @classmethod def fromfile(cls, path): @@ -68,7 +69,7 @@ def _fromfile(self, path): :param str path: Path to a ALTO document. """ parser = etree.XMLParser(remove_blank_text=True) - self.tree = etree.XML(bytes(norm_alto_ns_re.sub("alto/ns-v4#", path.read()), "utf-8"), parser) + self.tree = etree.XML(norm_alto_ns_re.sub(b"alto/ns-v4#", path.read()), parser) self.path = path def get_text_blocks(self): diff --git a/mets_mods2tei/api/mets.py b/mets_mods2tei/api/mets.py index 6cf8f88..4ac0531 100644 --- a/mets_mods2tei/api/mets.py +++ b/mets_mods2tei/api/mets.py @@ -26,11 +26,10 @@ def __init__(self): The constructor. 
""" self.map = {} - filep = open(os.path.realpath(resource_filename(Requirement.parse("mets_mods2tei"), 'mets_mods2tei/data/iso15924-utf8-20180827.txt'))) - reader = csv.DictReader(filter(lambda row: row[0]!='#', filep), delimiter=';', quoting=csv.QUOTE_NONE, fieldnames=['code','index','name_eng', 'name_fr', 'alias', 'Age', 'Date']) - for row in reader: - self.map[row['code']] = row['name_eng'] - filep.close() + with open(os.path.realpath(resource_filename(Requirement.parse("mets_mods2tei"), 'mets_mods2tei/data/iso15924-utf8-20180827.txt'))) as filep: + reader = csv.DictReader(filter(lambda row: row[0]!='#', filep), delimiter=';', quoting=csv.QUOTE_NONE, fieldnames=['code','index','name_eng', 'name_fr', 'alias', 'Age', 'Date']) + for row in reader: + self.map[row['code']] = row['name_eng'] def get(self, code): """ diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index b792057..4fb5b02 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -6,6 +6,7 @@ import logging import copy +from contextlib import closing from urllib.request import urlopen from urllib.parse import urlparse from pkg_resources import resource_filename, Requirement @@ -607,12 +608,18 @@ def __add_ocr_to_node(self, node, mets): sections = urlparse(alto_link) except: continue - if sections.scheme and sections.netloc: - f = urlopen(alto_link) - elif sections.path: - f = open(alto_link) - alto = Alto.read(f) + # use urlopen for both paths and URLs + if not sections.scheme: + mod_link = 'file:' + alto_link + else: + mod_link = alto_link + self.logger.debug(mod_link) + + with closing(urlopen(alto_link)) as f: + alto = Alto.read(f) + + # save original link! 
self.alto_map[alto_link] = alto pb = etree.SubElement(node, "%spb" % TEI) @@ -620,7 +627,6 @@ def __add_ocr_to_node(self, node, mets): pb.set("corresp", mets.get_img(struct_link)) for text_block in alto.get_text_blocks(): - self.logger.debug("HERE") p = etree.SubElement(node, "%sp" % TEI) for line in alto.get_lines_in_text_block(text_block): lb = etree.SubElement(p, "%slb" % TEI) From 5f9b46280044381c11ef61ebeab05e47a8a4320e Mon Sep 17 00:00:00 2001 From: wrznr Date: Tue, 12 May 2020 16:49:42 +0200 Subject: [PATCH 3/4] Open the modified link --- mets_mods2tei/api/tei.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mets_mods2tei/api/tei.py b/mets_mods2tei/api/tei.py index 4fb5b02..4dbe4cf 100644 --- a/mets_mods2tei/api/tei.py +++ b/mets_mods2tei/api/tei.py @@ -616,7 +616,7 @@ def __add_ocr_to_node(self, node, mets): mod_link = alto_link self.logger.debug(mod_link) - with closing(urlopen(alto_link)) as f: + with closing(urlopen(mod_link)) as f: alto = Alto.read(f) # save original link! 
From c646567e3fce078733969bcc5932105d1c469495 Mon Sep 17 00:00:00 2001 From: wrznr Date: Tue, 12 May 2020 16:54:33 +0200 Subject: [PATCH 4/4] Open ALTO files as binary --- tests/test_alto.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_alto.py b/tests/test_alto.py index b4dfdb2..a2b0597 100644 --- a/tests/test_alto.py +++ b/tests/test_alto.py @@ -38,32 +38,32 @@ def test_reading_local_file(datadir): ''' Test reading a local alto file ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.read(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) assert(alto.tree is not None) def test_loading_local_file(datadir): ''' Test loading a local alto file ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.fromfile(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) assert(alto.tree is not None) def test_text_block_extraction(datadir): ''' Test the extraction of text blocks ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.fromfile(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) assert(len(list(alto.get_text_blocks())) == 1) def test_text_line_extraction(datadir): ''' Test the extraction of text lines ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.fromfile(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) text_block = list(alto.get_text_blocks())[0] assert(len(list(alto.get_lines_in_text_block(text_block))) == 26) @@ -71,8 +71,8 @@ def test_text_line_text_extraction(datadir): ''' Test the extraction of text from text lines ''' - f = open(datadir.join('test_alto.xml')) - alto = Alto.fromfile(f) + with open(datadir.join('test_alto.xml'), 'rb') as f: + alto = Alto.read(f) text_block = list(alto.get_text_blocks())[0] text_line = list(alto.get_lines_in_text_block(text_block))[0] assert(alto.get_text_in_line(text_line) == "Vorbericht.")