From 3761cad41eb5d93ce17df99190f942da672a0b30 Mon Sep 17 00:00:00 2001 From: David Manthey Date: Tue, 10 Dec 2024 12:59:42 -0500 Subject: [PATCH] Improve handling of ome-tiff files generated by bioformats Specifically, we finally have samples of how associated images are handled. As part of this, the _populatedLevels value is more correct. --- CHANGELOG.md | 1 + .../large_image_source_ometiff/__init__.py | 70 ++++++++++++++++++- .../pil/large_image_source_pil/__init__.py | 5 +- .../tiff/large_image_source_tiff/__init__.py | 30 +++++--- .../large_image_source_tiff/tiff_reader.py | 8 ++- test/datastore.py | 4 ++ test/test_converter.py | 4 +- test/test_source_base.py | 4 +- 8 files changed, 109 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d80c27c60..50b76432b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Harden the geojson annotation parser ([#1743](../../pull/1743)) - Add more color palettes ([#1746](../../pull/1746)) - Improve the list of extensions the bioformats source reports ([#1748](../../pull/1748)) +- Improve handling of ome-tiff files generated by bioformats ([#1750](../../pull/1750)) ### Changes diff --git a/sources/ometiff/large_image_source_ometiff/__init__.py b/sources/ometiff/large_image_source_ometiff/__init__.py index 29126bc7a..04a88aa5d 100644 --- a/sources/ometiff/large_image_source_ometiff/__init__.py +++ b/sources/ometiff/large_image_source_ometiff/__init__.py @@ -105,6 +105,7 @@ def __init__(self, path, **kwargs): msg = 'Not a recognized OME Tiff' raise TileSourceError(msg) info = getattr(base, '_description_record', None) + self._associatedImages = {} if not info or not info.get('OME'): msg = 'Not an OME Tiff' raise TileSourceError(msg) @@ -115,6 +116,7 @@ def __init__(self, path, **kwargs): except KeyError: msg = 'Not a recognized OME Tiff' raise TileSourceError(msg) + usesSubIfds = self._checkForSubIfds(base) omeimages = [ entry['Pixels'] for entry in self._omeinfo['Image'] if 
len(entry['Pixels']['TiffData']) == len(self._omebase['TiffData'])] @@ -125,10 +127,16 @@ def __init__(self, path, **kwargs): omebylevel = dict(zip(levels, omeimages)) self._omeLevels = [omebylevel.get(key) for key in range(max(omebylevel.keys()) + 1)] if base._tiffInfo.get('istiled'): + if usesSubIfds: + self._omeLevels = [None] * max(usesSubIfds) + [self._omeLevels[-1]] self._tiffDirectories = [ self.getTiffDir(int(entry['TiffData'][0].get('IFD', 0))) if entry else None for entry in self._omeLevels] + if usesSubIfds: + for lvl in usesSubIfds: + if self._tiffDirectories[lvl] is None: + self._tiffDirectories[lvl] = False else: self._tiffDirectories = [ self.getTiffDir(0, mustBeTiled=None) @@ -149,7 +157,6 @@ def __init__(self, path, **kwargs): # We can get the embedded images, but we don't currently use non-tiled # images as associated images. This would require enumerating tiff # directories not mentioned by the ome list. - self._associatedImages = {} self._checkForInefficientDirectories() def _checkForOMEZLoop(self): @@ -199,6 +206,40 @@ def _checkForOMEZLoop(self): info['Image']['Pixels']['PlanesFromZloop'] = 'true' info['Image']['Pixels']['SizeZ'] = str(zloop) + def _checkForSubIfds(self, base): + """ + Check if the first ifd has sub-ifds. If so, expect lower resolutions + to be in subifds, not in primary ifds. + + :param base: base tiff directory + :returns: either False if no subifds are lower resolution, or a + dictionary of levels (keys) and values that are subifd numbers. 
+ """ + try: + levels = int(max(0, math.ceil(max( + math.log(float(base.imageWidth) / base.tileWidth), + math.log(float(base.imageHeight) / base.tileHeight)) / math.log(2))) + 1) + filled = {} + for z in range(levels - 2, -1, -1): + subdir = levels - 1 - z + scale = int(2 ** subdir) + try: + dir = self.getTiffDir(0, mustBeTiled=True, subDirectoryNum=subdir) + except Exception: + continue + if (dir is not None and + (dir.tileWidth == base.tileWidth or dir.tileWidth == dir.imageWidth) and + (dir.tileHeight == base.tileHeight or dir.tileHeight == dir.imageHeight) and + abs(dir.imageWidth * scale - base.imageWidth) <= scale and + abs(dir.imageHeight * scale - base.imageHeight) <= scale): + filled[z] = subdir + if not len(filled): + return False + filled[levels - 1] = 0 + return filled + except TiffError: + return False + def _parseOMEInfo(self): # noqa if isinstance(self._omeinfo['Image'], dict): self._omeinfo['Image'] = [self._omeinfo['Image']] @@ -241,6 +282,33 @@ def _parseOMEInfo(self): # noqa for entry in self._omebase['TiffData']}) > 1: msg = 'OME Tiff references multiple files' raise TileSourceError(msg) + if (len(self._omebase['TiffData']) == + int(self._omebase['SizeT']) * int(self._omebase['SizeZ'])): + self._omebase['SizeC'] = 1 + # DWM:: others are probably associated images + for img in self._omeinfo['Image'][1:]: + try: + if img['Name'] and img['Pixels']['TiffData'][0]['IFD']: + self._addAssociatedImage( + int(img['Pixels']['TiffData'][0]['IFD']), + None, None, img['Name'].split()[0]) + except Exception: + pass + elif len(self._omeinfo['Image']) > 1: + multiple = False + for img in self._omeinfo['Image'][1:]: + try: + bpix = self._omeinfo['Image'][0]['Pixels'] + imgpix = img['Pixels'] + if imgpix['SizeX'] == bpix['SizeX'] and imgpix['SizeY'] == bpix['SizeY']: + multiple = True + break + except Exception: + multiple = True + if multiple: + # We should handle this as SizeXY + msg = 'OME Tiff references multiple images' + raise TileSourceError(msg) if 
(len(self._omebase['TiffData']) != int(self._omebase['SizeC']) * int(self._omebase['SizeT']) * int(self._omebase['SizeZ']) or len(self._omebase['TiffData']) != len( diff --git a/sources/pil/large_image_source_pil/__init__.py b/sources/pil/large_image_source_pil/__init__.py index f8ee1bbe7..17b150f06 100644 --- a/sources/pil/large_image_source_pil/__init__.py +++ b/sources/pil/large_image_source_pil/__init__.py @@ -19,6 +19,7 @@ import math import os import threading +import warnings import numpy as np import PIL.Image @@ -56,6 +57,8 @@ # package is not installed pass +warnings.filterwarnings('ignore', category=UserWarning, module='.*PIL.*') + # Default to ignoring files with some specific extensions. config.ConfigValues['source_pil_ignored_names'] = \ r'(\.mrxs|\.vsi)$' @@ -138,7 +141,7 @@ def __init__(self, path, maxSize=None, **kwargs): # noqa if self._pilImage is None: try: self._pilImage = PIL.Image.open(largeImagePath) - except OSError: + except (OSError, ValueError): if not os.path.isfile(largeImagePath): raise TileSourceFileNotFoundError(largeImagePath) from None msg = 'File cannot be opened via PIL.' 
diff --git a/sources/tiff/large_image_source_tiff/__init__.py b/sources/tiff/large_image_source_tiff/__init__.py index 4b2bb91b6..ab349d64e 100644 --- a/sources/tiff/large_image_source_tiff/__init__.py +++ b/sources/tiff/large_image_source_tiff/__init__.py @@ -327,12 +327,19 @@ def _initWithTiffTools(self): # noqa self._info = info frames = [] associated = [] # for now, a list of directories + used_subifd = False for idx, ifd in enumerate(info['ifds']): # if not tiles, add to associated images if tifftools.Tag.tileWidth.value not in ifd['tags']: - associated.append(idx) + associated.append((idx, False)) continue - level = self._levelFromIfd(ifd, info['ifds'][0]) + try: + level = self._levelFromIfd(ifd, info['ifds'][0]) + except TileSourceError: + if idx and used_subifd: + associated.append((idx, True)) + continue + raise # if the same resolution as the main image, add a frame if level == self.levels - 1: frames.append({'dirs': [None] * self.levels}) @@ -371,9 +378,13 @@ def _initWithTiffTools(self): # noqa tifftools.Tag.TileOffsets.value not in subifds[0]['tags']): msg = 'Subifd has no strip or tile offsets.' 
raise TileSourceMalformedError(msg) - level = self._levelFromIfd(subifds[0], info['ifds'][0]) + try: + level = self._levelFromIfd(subifds[0], info['ifds'][0]) + except Exception: + break if level < self.levels - 1 and frames[-1]['dirs'][level] is None: frames[-1]['dirs'][level] = (idx, subidx + 1) + used_subifd = True else: msg = 'Tile layers are in a surprising order' raise TileSourceError(msg) @@ -407,8 +418,8 @@ def _initWithTiffTools(self): # noqa self._iccprofiles.append(ifd['tags'][ tifftools.Tag.ICCProfile.value]['data']) self._associatedImages = {} - for dirNum in associated: - self._addAssociatedImage(dirNum) + for dirNum, isTiled in associated: + self._addAssociatedImage(dirNum, isTiled) self._frames = frames self._tiffDirectories = [ self.getTiffDir( @@ -490,7 +501,7 @@ def _checkForVendorSpecificTags(self): frame.setdefault('frame', {}) frame['frame']['IndexC'] = idx - def _addAssociatedImage(self, directoryNum, mustBeTiled=False, topImage=None): + def _addAssociatedImage(self, directoryNum, mustBeTiled=False, topImage=None, imageId=None): """ Check if the specified TIFF directory contains an image with a sensible image description that can be used as an ID. If so, and if the image @@ -501,6 +512,7 @@ def _addAssociatedImage(self, directoryNum, mustBeTiled=False, topImage=None): untiled images. :param topImage: if specified, add image-embedded metadata to this image. + :param imageId: if specified, use this as the image name. 
""" try: associated = self.getTiffDir(directoryNum, mustBeTiled) @@ -514,6 +526,8 @@ def _addAssociatedImage(self, directoryNum, mustBeTiled=False, topImage=None): id = 'dir%d' % directoryNum if not len(self._associatedImages): id = 'macro' + if imageId: + id = imageId if not id and not mustBeTiled: id = {1: 'label', 9: 'macro'}.get(associated._tiffInfo.get('subfiletype')) if not isinstance(id, str): @@ -765,7 +779,7 @@ def getAssociatedImagesList(self): """ imageList = set(self._associatedImages) for td in self._tiffDirectories: - if td is not None: + if td is not None and td is not False: imageList |= set(td._embeddedImages) return sorted(imageList) @@ -784,7 +798,7 @@ def _getAssociatedImage(self, imageKey): # with seemingly bad associated images, we may need to read them with a # more complex process than read_image. for td in self._tiffDirectories: - if td is not None and imageKey in td._embeddedImages: + if td is not None and td is not False and imageKey in td._embeddedImages: return PIL.Image.open(io.BytesIO(base64.b64decode(td._embeddedImages[imageKey]))) if imageKey in self._associatedImages: return PIL.Image.fromarray(self._associatedImages[imageKey]) diff --git a/sources/tiff/large_image_source_tiff/tiff_reader.py b/sources/tiff/large_image_source_tiff/tiff_reader.py index a97e174c0..61be88945 100644 --- a/sources/tiff/large_image_source_tiff/tiff_reader.py +++ b/sources/tiff/large_image_source_tiff/tiff_reader.py @@ -788,11 +788,13 @@ def getTile(self, x, y, asarray=False): if (not self._tiffInfo.get('istiled') or self._tiffInfo.get('compression') not in { - libtiff_ctypes.COMPRESSION_JPEG, 33003, 33005, 34712} or + libtiff_ctypes.COMPRESSION_JPEG, 33003, 33004, 33005, 34712} or self._tiffInfo.get('bitspersample') != 8 or self._tiffInfo.get('sampleformat') not in { None, libtiff_ctypes.SAMPLEFORMAT_UINT} or - (asarray and self._tiffInfo.get('compression') not in {33003, 33005, 34712} and ( + (asarray and self._tiffInfo.get('compression') not in { + 
33003, 33004, 33005, 34712, + } and ( self._tiffInfo.get('compression') != libtiff_ctypes.COMPRESSION_JPEG or self._tiffInfo.get('photometric') != libtiff_ctypes.PHOTOMETRIC_YCBCR))): return self._getUncompressedTile(tileNum) @@ -811,7 +813,7 @@ def getTile(self, x, y, asarray=False): # Get the whole frame, which is in a JPEG or JPEG 2000 format frame = self._getJpegFrame(tileNum, True) # For JP2K, see if we can convert it faster than PIL - if self._tiffInfo.get('compression') in {33003, 33005}: + if self._tiffInfo.get('compression') in {33003, 33004, 33005, 34712}: try: import openjpeg diff --git a/test/datastore.py b/test/datastore.py index ecfebc834..500b645c4 100644 --- a/test/datastore.py +++ b/test/datastore.py @@ -123,6 +123,10 @@ # Synthetic Indica Labs tiff; subifds missing tile/strip data and unmarked # float32 pixels rather than uint32 'synthetic_indica.tiff': 'sha512:fba7eb2fb5fd12ac242d8b0760440f170f48f9e2434a672cbf230bd8a9ff02fad8f9bdf7225edf2de244f412edfc5205e695031a1d43dd99fe31c3aca11909a1', # noqa + # Converted from the TCGA svs file using bioformats java program and + # --rgb --quality=0.015 --compression='JPEG-2000 Lossy' parameters to make + # the file small + 'TCGA-55-8207-01Z-00-DX1.ome.tiff': 'sha512:50cf63f0e8bfa3054d3532b7dd0237b66aeb4c7609da874639a28bc068dbd157f786e84d3eb76a3b0e6636a042c56c3b96d3be2ad66f7589d0542a5d20cecdb4', # noqa } diff --git a/test/test_converter.py b/test/test_converter.py index 7588b410c..64a3f91d7 100644 --- a/test/test_converter.py +++ b/test/test_converter.py @@ -146,8 +146,8 @@ def testConvertJp2kCompression(tmpdir): image, _ = source.getRegion( output={'maxWidth': 200, 'maxHeight': 200}, format=constants.TILE_FORMAT_NUMPY) # Without or with icc adjustment - assert ((image[12][167] == [215, 135, 172]).all() or - (image[12][167] == [216, 134, 172]).all()) + assert ((image[12][167][:3] == [215, 135, 172]).all() or + (image[12][167][:3] == [216, 134, 172]).all()) outputPath2 = os.path.join(tmpdir, 'out2.tiff') 
large_image_converter.convert(imagePath, outputPath2, compression='jp2k', psnr=50) diff --git a/test/test_source_base.py b/test/test_source_base.py index f5657c50f..421868b1e 100644 --- a/test/test_source_base.py +++ b/test/test_source_base.py @@ -64,12 +64,12 @@ 'openslide': { 'read': r'\.(ptif|svs|ndpi|tif.*|qptiff|dcm)$', 'noread': r'(oahu|DDX58_AXL|huron\.image2_jpeg2k|landcover_sample|d042-353\.crop|US_Geo\.|extraoverview|imagej|bad_axes|synthetic_untiled|indica|tcia.*dcm)', # noqa - 'skip': r'nokeyframe\.ome\.tiff$', + 'skip': r'(nokeyframe|TCGA-55.*)\.ome\.tiff$', 'skipTiles': r'one_layer_missing', }, 'pil': { 'read': r'(\.(jpg|jpeg|png|tif.*)|18[-0-9a-f]{34}\.dcm)$', - 'noread': r'(G10-3|JK-kidney|d042-353.*tif|huron|one_layer_missing|US_Geo|extraoverview|indica)', # noqa + 'noread': r'(G10-3|JK-kidney|d042-353.*tif|huron|one_layer_missing|US_Geo|extraoverview|indica|TCGA-55.*\.ome\.tiff)', # noqa }, 'rasterio': { 'read': r'(\.(jpg|jpeg|jp2|ptif|scn|svs|ndpi|tif.*|qptiff)|18[-0-9a-f]{34}\.dcm)$',