diff --git a/tests/workspace/reference_page_xml/nowa_doba.xml b/tests/workspace/reference_page_xml/nowa_doba.xml index 74f0fb3..d5febc9 100644 --- a/tests/workspace/reference_page_xml/nowa_doba.xml +++ b/tests/workspace/reference_page_xml/nowa_doba.xml @@ -2,8 +2,8 @@ OCR-D/core 2.63.3 - 2024-04-12T13:57:15.530802 - 2024-04-12T13:57:15.530802 + 2024-04-12T14:47:48.359119 + 2024-04-12T14:47:48.359119 @@ -45,7 +45,7 @@ - + @@ -192,7 +192,7 @@ Indeks 30 381 - + @@ -207,7 +207,7 @@ KOMUNIKEJ - + @@ -288,7 +288,7 @@ weje delegacije NDR W Moskwje - + @@ -507,7 +507,7 @@ licy wotpowedny komunikej, kotry tu snadnje skrotseny wozjewjamy: - + @@ -966,7 +966,7 @@ méchu so zendzenja, na kotrych so ze - + @@ -1011,7 +1011,7 @@ W prihotach II. Festiwala serbskeje kultury - + @@ -1050,7 +1050,7 @@ Iniciatiwy wsech partnerow wulkeho swjedzenja - + @@ -2043,7 +2043,7 @@ wal serbskeje kultury stac. - + @@ -2322,7 +2322,7 @@ stiwala "Wsu nasu moc za wsostron- - + @@ -2583,7 +2583,7 @@ poskicenjow - + @@ -2616,7 +2616,7 @@ Predsydstwa zaruca wuspechi skupin - + @@ -3321,7 +3321,7 @@ Kono na - + @@ -3354,7 +3354,7 @@ Zamernje so primaja gratu - + @@ -3399,7 +3399,7 @@ Kolektiw "Handrij Zejler" dale mjez pucrubarjemi - + @@ -3870,7 +3870,7 @@ tiw socialistiskeho dzela". - + @@ -5055,7 +5055,7 @@ listiske wubedzowanje dale specho- - + @@ -5682,7 +5682,7 @@ nja Z wuznamnymi wosobinami SO - + @@ -6333,7 +6333,7 @@ hofera. - + @@ -6480,7 +6480,7 @@ sledk cyleho kolektiwa. - + @@ -6705,7 +6705,7 @@ dec je kózdy napominany - + @@ -6774,7 +6774,7 @@ Netko wsu moc festiwalej - + @@ -7431,7 +7431,7 @@ bicy. - + @@ -8106,7 +8106,7 @@ II. Festiwalej serbskeje kultury. - + @@ -8139,7 +8139,7 @@ Koch W cs. rozhtosu - + @@ -8694,7 +8694,7 @@ skim rozhlosu wusylac. - + @@ -8733,7 +8733,7 @@ Serbscy wumelcy W Moskwje pobyli - + @@ -8748,7 +8748,7 @@ I - + @@ -9039,7 +9039,7 @@ Abrasimow. - + @@ -9132,7 +9132,7 @@ (Pokrocowanje na 2. stronje) - + @@ -9459,7 +9459,7 @@ Rozprawu wo wuprawje mace na 3. stronje deleka. - + @@ -9492,7 +9492,7 @@ a serokosc - + @@ -9507,7 +9507,7 @@ prihotow - + @@ -9948,7 +9948,7 @@ hesle zwuraznja. - + @@ -10653,7 +10653,7 @@ ninje wsitko priprawic a planowac. - + diff --git a/textract2page/convert_aws.py b/textract2page/convert_aws.py index f63ad9e..28e2e41 100644 --- a/textract2page/convert_aws.py +++ b/textract2page/convert_aws.py @@ -29,26 +29,13 @@ RolesType, TableRegionType, TableCellRoleType, + ImageRegionType, ) from ocrd_models.ocrd_page import to_xml text_type_map: Final = {"PRINTED": "printed", "HANDWRITING": "handwritten-cursive"} -# Textract layout types -> Page layout types -layout_type_map: Final = { - "LAYOUT_TITLE": "heading", - "LAYOUT_HEADER": "header", - "LAYOUT_FOOTER": "footer", - "LAYOUT_SECTION_HEADER": "heading", - "LAYOUT_PAGE_NUMBER": "page-number", - "LAYOUT_LIST": "other", - "LAYOUT_FIGURE": "other", - "LAYOUT_TABLE": "other", - "LAYOUT_KEY_VALUE_SET": "other", - "LAYOUT_TEXT": "paragraph", -} - class TextractGeometry(ABC): """Abstract geometry class.""" @@ -153,7 +140,21 @@ def __init__( self, aws_layout_block: Dict, textract_words: Dict, textract_lines: Dict ) -> None: super().__init__(aws_block=aws_layout_block) - self.layout_type = layout_type_map.get(aws_layout_block["BlockType"]) + # Textract layout types -> Page layout types + layout_type_map: Final = { + "LAYOUT_TITLE": "heading", + "LAYOUT_HEADER": "header", + "LAYOUT_FOOTER": "footer", + "LAYOUT_SECTION_HEADER": "heading", + "LAYOUT_PAGE_NUMBER": "page-number", + "LAYOUT_LIST": "other", + "LAYOUT_FIGURE": "other", + "LAYOUT_TABLE": "other", + "LAYOUT_KEY_VALUE_SET": "other", + "LAYOUT_TEXT": "paragraph", + } + self.page_layout_type = layout_type_map.get(aws_layout_block["BlockType"]) + self.textract_layout_type = aws_layout_block["BlockType"] child_words = [ textract_words.get(id) @@ -711,7 +712,6 @@ def convert_file( global_reading_order_index = 0 # preserve table positions in reading order visited_tables = {} - visited_layouts = {} for line_id, line in lines.items(): # if line is part of a table @@ -730,25 +730,13 @@ def convert_file( global_ordered_group.add_UnorderedGroupIndexed(local_reading_order) visited_tables[parent_table.id] = local_reading_order global_reading_order_index += 1 - # if line is part of a layout + + # if line is part of a layout do nothing here elif line.parent_layout: continue - # local reading order - parent_layout = line.parent_layout - if ( - not (parent_layout.id in visited_layouts.keys()) - and preserve_reading_order - ): - local_reading_order = UnorderedGroupIndexedType( - index=global_reading_order_index, - id=f"layout_{parent_layout.id}_reading_order", - comments="Reading order of this layout element.", - ) - global_ordered_group.add_UnorderedGroupIndexed(local_reading_order) - visited_layouts[parent_layout.id] = local_reading_order - - global_reading_order_index += 1 + # if line is neither part of a table, nor of a layout, create dummy + # region around the line else: # wrap lines in separate TextRegions to preserve reading order # (ReadingOrder references TextRegions) @@ -807,7 +795,30 @@ def convert_file( for layout in layouts: # ignore layout_type: other - if layout.layout_type == "other": + if layout.textract_layout_type == "LAYOUT_FIGURE": + pagexml_text_region = ImageRegionType( + Coords=CoordsType( + points=points_from_aws_geometry( + layout.geometry, pil_img.width, pil_img.height + ) + ), + id=f"layout-image-region-{layout.id}", + type_=layout.page_layout_type, + custom=f"textract-layout-type: {layout.textract_layout_type.split('LAYOUT_')[1].lower()};", + ) + pagexml_page.add_TextRegion(pagexml_text_region) + + if preserve_reading_order: + global_ordered_group.add_RegionRefIndexed( + RegionRefIndexedType( + index=global_reading_order_index, + regionRef=f"layout-text-region-{layout.id}", + ) + ) + global_reading_order_index += 1 + continue + if layout.textract_layout_type == "LAYOUT_TABLE": + # we cover tables separatly continue pagexml_text_region = TextRegionType( @@ -817,14 +828,11 @@ def convert_file( ) ), id=f"layout-text-region-{layout.id}", - type_=layout.layout_type, + type_=layout.page_layout_type, + custom=f"textract-layout-type: {layout.textract_layout_type.split('LAYOUT_')[1].lower()};", ) pagexml_page.add_TextRegion(pagexml_text_region) - # local_reading_order_index = 0 - # local_reading_order = visited_layouts[layout.id] - - # igonre local reading order if preserve_reading_order: global_ordered_group.add_RegionRefIndexed( RegionRefIndexedType( @@ -836,45 +844,6 @@ def convert_file( for line in layout.child_lines: - # # preserve inner reading order of lines within layout objects - # # create a text region for each line - # line_region_id = f"line-region-{line.id}" - # pagexml_line_region = TextRegionType( - # Coords=CoordsType( - # points=points_from_aws_geometry( - # line.geometry, pil_img.width, pil_img.height - # ) - # ), - # id=line_region_id, - # ) - # pagexml_text_region.add_TextRegion(pagexml_line_region) - - # # store reading order - # if preserve_reading_order: - # local_reading_order.add_RegionRef( - # RegionRefType( - # index=local_reading_order_index, - # regionRef=line_region_id, - # ) - # ) - # local_reading_order_index += 1 - - # # append lines to text regions - # pagexml_text_line = TextLineType( - # Coords=CoordsType( - # points=points_from_aws_geometry( - # line.geometry, pil_img.width, pil_img.height - # ) - # ), - # id=f"line-{line.id}", - # ) - # if line.text: - # pagexml_text_line.add_TextEquiv( - # TextEquivType(conf=line.confidence, Unicode=line.text) - # ) - # pagexml_line_region.add_TextLine(pagexml_text_line) - - # append lines to text region (no inner reading order) pagexml_text_line = TextLineType( Coords=CoordsType( points=points_from_aws_geometry(