From 3761cad41eb5d93ce17df99190f942da672a0b30 Mon Sep 17 00:00:00 2001
From: David Manthey <david.manthey@kitware.com>
Date: Tue, 10 Dec 2024 12:59:42 -0500
Subject: [PATCH] Improve handling of ome-tiff files generated by bioformats

Specifically, we finally have samples of how associated images are
handled.  As part of this, the _populatedLevels value is more correct.
---
 CHANGELOG.md                                  |  1 +
 .../large_image_source_ometiff/__init__.py    | 70 ++++++++++++++++++-
 .../pil/large_image_source_pil/__init__.py    |  5 +-
 .../tiff/large_image_source_tiff/__init__.py  | 30 +++++---
 .../large_image_source_tiff/tiff_reader.py    |  8 ++-
 test/datastore.py                             |  4 ++
 test/test_converter.py                        |  4 +-
 test/test_source_base.py                      |  4 +-
 8 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d80c27c60..50b76432b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 - Harden the geojson annotation parser ([#1743](../../pull/1743))
 - Add more color palettes ([#1746](../../pull/1746))
 - Improve the list of extensions the bioformats source reports ([#1748](../../pull/1748))
+- Improve handling of ome-tiff files generated by bioformats ([#1750](../../pull/1750))
 
 ### Changes
 
diff --git a/sources/ometiff/large_image_source_ometiff/__init__.py b/sources/ometiff/large_image_source_ometiff/__init__.py
index 29126bc7a..04a88aa5d 100644
--- a/sources/ometiff/large_image_source_ometiff/__init__.py
+++ b/sources/ometiff/large_image_source_ometiff/__init__.py
@@ -105,6 +105,7 @@ def __init__(self, path, **kwargs):
             msg = 'Not a recognized OME Tiff'
             raise TileSourceError(msg)
         info = getattr(base, '_description_record', None)
+        self._associatedImages = {}
         if not info or not info.get('OME'):
             msg = 'Not an OME Tiff'
             raise TileSourceError(msg)
@@ -115,6 +116,7 @@ def __init__(self, path, **kwargs):
         except KeyError:
             msg = 'Not a recognized OME Tiff'
             raise TileSourceError(msg)
+        usesSubIfds = self._checkForSubIfds(base)
         omeimages = [
             entry['Pixels'] for entry in self._omeinfo['Image'] if
             len(entry['Pixels']['TiffData']) == len(self._omebase['TiffData'])]
@@ -125,10 +127,16 @@ def __init__(self, path, **kwargs):
         omebylevel = dict(zip(levels, omeimages))
         self._omeLevels = [omebylevel.get(key) for key in range(max(omebylevel.keys()) + 1)]
         if base._tiffInfo.get('istiled'):
+            if usesSubIfds:
+                self._omeLevels = [None] * max(usesSubIfds) + [self._omeLevels[-1]]
             self._tiffDirectories = [
                 self.getTiffDir(int(entry['TiffData'][0].get('IFD', 0)))
                 if entry else None
                 for entry in self._omeLevels]
+            if usesSubIfds:
+                for lvl in usesSubIfds:
+                    if self._tiffDirectories[lvl] is None:
+                        self._tiffDirectories[lvl] = False
         else:
             self._tiffDirectories = [
                 self.getTiffDir(0, mustBeTiled=None)
@@ -149,7 +157,6 @@ def __init__(self, path, **kwargs):
         # We can get the embedded images, but we don't currently use non-tiled
         # images as associated images.  This would require enumerating tiff
         # directories not mentioned by the ome list.
-        self._associatedImages = {}
         self._checkForInefficientDirectories()
 
     def _checkForOMEZLoop(self):
@@ -199,6 +206,40 @@ def _checkForOMEZLoop(self):
         info['Image']['Pixels']['PlanesFromZloop'] = 'true'
         info['Image']['Pixels']['SizeZ'] = str(zloop)
 
+    def _checkForSubIfds(self, base):
+        """
+        Check if the first ifd has sub-ifds.  If so, expect lower resolutions
+        to be in subifds, not in primary ifds.
+
+        :param base: base tiff directory
+        :returns: either False if no subifds are lower resolution, or a
+            dictionary of levels (keys) and values that are subifd numbers.
+        """
+        try:
+            levels = int(max(0, math.ceil(max(  # power-of-two levels down to a single tile
+                math.log(float(base.imageWidth) / base.tileWidth),
+                math.log(float(base.imageHeight) / base.tileHeight)) / math.log(2))) + 1)
+            filled = {}  # level -> subifd number, for levels that were found
+            for z in range(levels - 2, -1, -1):  # levels below full resolution
+                subdir = levels - 1 - z  # subifd to probe; 1 pairs with a 2x reduction
+                scale = int(2 ** subdir)  # reduction factor expected at this subifd
+                try:
+                    dir = self.getTiffDir(0, mustBeTiled=True, subDirectoryNum=subdir)
+                except Exception:
+                    continue  # any failure opening the subifd leaves this level unfilled
+                if (dir is not None and  # accept: same tiling (or one tile), expected size
+                        (dir.tileWidth == base.tileWidth or dir.tileWidth == dir.imageWidth) and
+                        (dir.tileHeight == base.tileHeight or dir.tileHeight == dir.imageHeight) and
+                        abs(dir.imageWidth * scale - base.imageWidth) <= scale and
+                        abs(dir.imageHeight * scale - base.imageHeight) <= scale):
+                    filled[z] = subdir
+            if not len(filled):  # no probed subifd was accepted
+                return False
+            filled[levels - 1] = 0  # top level maps to 0 (presumably the main ifd itself)
+            return filled
+        except TiffError:
+            return False  # reported the same way as finding no usable subifds
+
     def _parseOMEInfo(self):  # noqa
         if isinstance(self._omeinfo['Image'], dict):
             self._omeinfo['Image'] = [self._omeinfo['Image']]
@@ -241,6 +282,33 @@ def _parseOMEInfo(self):  # noqa
                     for entry in self._omebase['TiffData']}) > 1:
                 msg = 'OME Tiff references multiple files'
                 raise TileSourceError(msg)
+            if (len(self._omebase['TiffData']) ==
+                    int(self._omebase['SizeT']) * int(self._omebase['SizeZ'])):
+                self._omebase['SizeC'] = 1
+                # DWM:: others are probably associated images
+                for img in self._omeinfo['Image'][1:]:
+                    try:
+                        if img['Name'] and img['Pixels']['TiffData'][0]['IFD']:
+                            self._addAssociatedImage(
+                                int(img['Pixels']['TiffData'][0]['IFD']),
+                                None, None, img['Name'].split()[0])
+                    except Exception:
+                        pass
+            elif len(self._omeinfo['Image']) > 1:
+                multiple = False
+                for img in self._omeinfo['Image'][1:]:
+                    try:
+                        bpix = self._omeinfo['Image'][0]['Pixels']
+                        imgpix = img['Pixels']
+                        if imgpix['SizeX'] == bpix['SizeX'] and imgpix['SizeY'] == bpix['SizeY']:
+                            multiple = True
+                            break
+                    except Exception:
+                        multiple = True
+                if multiple:
+                    # We should handle this as SizeXY
+                    msg = 'OME Tiff references multiple images'
+                    raise TileSourceError(msg)
             if (len(self._omebase['TiffData']) != int(self._omebase['SizeC']) *
                     int(self._omebase['SizeT']) * int(self._omebase['SizeZ']) or
                     len(self._omebase['TiffData']) != len(
diff --git a/sources/pil/large_image_source_pil/__init__.py b/sources/pil/large_image_source_pil/__init__.py
index f8ee1bbe7..17b150f06 100644
--- a/sources/pil/large_image_source_pil/__init__.py
+++ b/sources/pil/large_image_source_pil/__init__.py
@@ -19,6 +19,7 @@
 import math
 import os
 import threading
+import warnings
 
 import numpy as np
 import PIL.Image
@@ -56,6 +57,8 @@
     # package is not installed
     pass
 
+warnings.filterwarnings('ignore', category=UserWarning, module='.*PIL.*')
+
 # Default to ignoring files with some specific extensions.
 config.ConfigValues['source_pil_ignored_names'] = \
     r'(\.mrxs|\.vsi)$'
@@ -138,7 +141,7 @@ def __init__(self, path, maxSize=None, **kwargs):  # noqa
         if self._pilImage is None:
             try:
                 self._pilImage = PIL.Image.open(largeImagePath)
-            except OSError:
+            except (OSError, ValueError):
                 if not os.path.isfile(largeImagePath):
                     raise TileSourceFileNotFoundError(largeImagePath) from None
                 msg = 'File cannot be opened via PIL.'
diff --git a/sources/tiff/large_image_source_tiff/__init__.py b/sources/tiff/large_image_source_tiff/__init__.py
index 4b2bb91b6..ab349d64e 100644
--- a/sources/tiff/large_image_source_tiff/__init__.py
+++ b/sources/tiff/large_image_source_tiff/__init__.py
@@ -327,12 +327,19 @@ def _initWithTiffTools(self):  # noqa
         self._info = info
         frames = []
         associated = []  # for now, a list of directories
+        used_subifd = False
         for idx, ifd in enumerate(info['ifds']):
             # if not tiles, add to associated images
             if tifftools.Tag.tileWidth.value not in ifd['tags']:
-                associated.append(idx)
+                associated.append((idx, False))
                 continue
-            level = self._levelFromIfd(ifd, info['ifds'][0])
+            try:
+                level = self._levelFromIfd(ifd, info['ifds'][0])
+            except TileSourceError:
+                if idx and used_subifd:
+                    associated.append((idx, True))
+                    continue
+                raise
             # if the same resolution as the main image, add a frame
             if level == self.levels - 1:
                 frames.append({'dirs': [None] * self.levels})
@@ -371,9 +378,13 @@ def _initWithTiffTools(self):  # noqa
                             tifftools.Tag.TileOffsets.value not in subifds[0]['tags']):
                         msg = 'Subifd has no strip or tile offsets.'
                         raise TileSourceMalformedError(msg)
-                    level = self._levelFromIfd(subifds[0], info['ifds'][0])
+                    try:
+                        level = self._levelFromIfd(subifds[0], info['ifds'][0])
+                    except Exception:
+                        break
                     if level < self.levels - 1 and frames[-1]['dirs'][level] is None:
                         frames[-1]['dirs'][level] = (idx, subidx + 1)
+                        used_subifd = True
                     else:
                         msg = 'Tile layers are in a surprising order'
                         raise TileSourceError(msg)
@@ -407,8 +418,8 @@ def _initWithTiffTools(self):  # noqa
                 self._iccprofiles.append(ifd['tags'][
                     tifftools.Tag.ICCProfile.value]['data'])
         self._associatedImages = {}
-        for dirNum in associated:
-            self._addAssociatedImage(dirNum)
+        for dirNum, isTiled in associated:
+            self._addAssociatedImage(dirNum, isTiled)
         self._frames = frames
         self._tiffDirectories = [
             self.getTiffDir(
@@ -490,7 +501,7 @@ def _checkForVendorSpecificTags(self):
                 frame.setdefault('frame', {})
                 frame['frame']['IndexC'] = idx
 
-    def _addAssociatedImage(self, directoryNum, mustBeTiled=False, topImage=None):
+    def _addAssociatedImage(self, directoryNum, mustBeTiled=False, topImage=None, imageId=None):
         """
         Check if the specified TIFF directory contains an image with a sensible
         image description that can be used as an ID.  If so, and if the image
@@ -501,6 +512,7 @@ def _addAssociatedImage(self, directoryNum, mustBeTiled=False, topImage=None):
            untiled images.
         :param topImage: if specified, add image-embedded metadata to this
            image.
+        :param imageId: if specified, use this as the image name.
         """
         try:
             associated = self.getTiffDir(directoryNum, mustBeTiled)
@@ -514,6 +526,8 @@ def _addAssociatedImage(self, directoryNum, mustBeTiled=False, topImage=None):
                 id = 'dir%d' % directoryNum
                 if not len(self._associatedImages):
                     id = 'macro'
+            if imageId:
+                id = imageId
             if not id and not mustBeTiled:
                 id = {1: 'label', 9: 'macro'}.get(associated._tiffInfo.get('subfiletype'))
             if not isinstance(id, str):
@@ -765,7 +779,7 @@ def getAssociatedImagesList(self):
         """
         imageList = set(self._associatedImages)
         for td in self._tiffDirectories:
-            if td is not None:
+            if td is not None and td is not False:
                 imageList |= set(td._embeddedImages)
         return sorted(imageList)
 
@@ -784,7 +798,7 @@ def _getAssociatedImage(self, imageKey):
         # with seemingly bad associated images, we may need to read them with a
         # more complex process than read_image.
         for td in self._tiffDirectories:
-            if td is not None and imageKey in td._embeddedImages:
+            if td is not None and td is not False and imageKey in td._embeddedImages:
                 return PIL.Image.open(io.BytesIO(base64.b64decode(td._embeddedImages[imageKey])))
         if imageKey in self._associatedImages:
             return PIL.Image.fromarray(self._associatedImages[imageKey])
diff --git a/sources/tiff/large_image_source_tiff/tiff_reader.py b/sources/tiff/large_image_source_tiff/tiff_reader.py
index a97e174c0..61be88945 100644
--- a/sources/tiff/large_image_source_tiff/tiff_reader.py
+++ b/sources/tiff/large_image_source_tiff/tiff_reader.py
@@ -788,11 +788,13 @@ def getTile(self, x, y, asarray=False):
 
         if (not self._tiffInfo.get('istiled') or
                 self._tiffInfo.get('compression') not in {
-                    libtiff_ctypes.COMPRESSION_JPEG, 33003, 33005, 34712} or
+                    libtiff_ctypes.COMPRESSION_JPEG, 33003, 33004, 33005, 34712} or
                 self._tiffInfo.get('bitspersample') != 8 or
                 self._tiffInfo.get('sampleformat') not in {
                     None, libtiff_ctypes.SAMPLEFORMAT_UINT} or
-                (asarray and self._tiffInfo.get('compression') not in {33003, 33005, 34712} and (
+                (asarray and self._tiffInfo.get('compression') not in {
+                    33003, 33004, 33005, 34712,
+                } and (
                     self._tiffInfo.get('compression') != libtiff_ctypes.COMPRESSION_JPEG or
                     self._tiffInfo.get('photometric') != libtiff_ctypes.PHOTOMETRIC_YCBCR))):
             return self._getUncompressedTile(tileNum)
@@ -811,7 +813,7 @@ def getTile(self, x, y, asarray=False):
         # Get the whole frame, which is in a JPEG or JPEG 2000 format
         frame = self._getJpegFrame(tileNum, True)
         # For JP2K, see if we can convert it faster than PIL
-        if self._tiffInfo.get('compression') in {33003, 33005}:
+        if self._tiffInfo.get('compression') in {33003, 33004, 33005, 34712}:
             try:
                 import openjpeg
 
diff --git a/test/datastore.py b/test/datastore.py
index ecfebc834..500b645c4 100644
--- a/test/datastore.py
+++ b/test/datastore.py
@@ -123,6 +123,10 @@
     # Synthetic Indica Labs tiff; subifds missing tile/strip data and unmarked
     # float32 pixels rather than uint32
     'synthetic_indica.tiff': 'sha512:fba7eb2fb5fd12ac242d8b0760440f170f48f9e2434a672cbf230bd8a9ff02fad8f9bdf7225edf2de244f412edfc5205e695031a1d43dd99fe31c3aca11909a1',  # noqa
+    # Converted from the TCGA svs file using bioformats java program and
+    # --rgb --quality=0.015 --compression='JPEG-2000 Lossy' parameters to make
+    # the file small
+    'TCGA-55-8207-01Z-00-DX1.ome.tiff': 'sha512:50cf63f0e8bfa3054d3532b7dd0237b66aeb4c7609da874639a28bc068dbd157f786e84d3eb76a3b0e6636a042c56c3b96d3be2ad66f7589d0542a5d20cecdb4',  # noqa
 }
 
 
diff --git a/test/test_converter.py b/test/test_converter.py
index 7588b410c..64a3f91d7 100644
--- a/test/test_converter.py
+++ b/test/test_converter.py
@@ -146,8 +146,8 @@ def testConvertJp2kCompression(tmpdir):
     image, _ = source.getRegion(
         output={'maxWidth': 200, 'maxHeight': 200}, format=constants.TILE_FORMAT_NUMPY)
     # Without or with icc adjustment
-    assert ((image[12][167] == [215, 135, 172]).all() or
-            (image[12][167] == [216, 134, 172]).all())
+    assert ((image[12][167][:3] == [215, 135, 172]).all() or
+            (image[12][167][:3] == [216, 134, 172]).all())
 
     outputPath2 = os.path.join(tmpdir, 'out2.tiff')
     large_image_converter.convert(imagePath, outputPath2, compression='jp2k', psnr=50)
diff --git a/test/test_source_base.py b/test/test_source_base.py
index f5657c50f..421868b1e 100644
--- a/test/test_source_base.py
+++ b/test/test_source_base.py
@@ -64,12 +64,12 @@
     'openslide': {
         'read': r'\.(ptif|svs|ndpi|tif.*|qptiff|dcm)$',
         'noread': r'(oahu|DDX58_AXL|huron\.image2_jpeg2k|landcover_sample|d042-353\.crop|US_Geo\.|extraoverview|imagej|bad_axes|synthetic_untiled|indica|tcia.*dcm)',  # noqa
-        'skip': r'nokeyframe\.ome\.tiff$',
+        'skip': r'nokeyframe\.ome\.tiff|TCGA-55.*\.ome\.tiff$',
         'skipTiles': r'one_layer_missing',
     },
     'pil': {
         'read': r'(\.(jpg|jpeg|png|tif.*)|18[-0-9a-f]{34}\.dcm)$',
-        'noread': r'(G10-3|JK-kidney|d042-353.*tif|huron|one_layer_missing|US_Geo|extraoverview|indica)',  # noqa
+        'noread': r'(G10-3|JK-kidney|d042-353.*tif|huron|one_layer_missing|US_Geo|extraoverview|indica|TCGA-55.*\.ome\.tiff)',  # noqa
     },
     'rasterio': {
         'read': r'(\.(jpg|jpeg|jp2|ptif|scn|svs|ndpi|tif.*|qptiff)|18[-0-9a-f]{34}\.dcm)$',