# Source code for straditize.colnames

# -*- coding: utf-8 -*-
"""Module for text recognition


Copyright (C) 2018-2019  Philipp S. Sommer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
import re
import xarray as xr
from PIL import ImageOps, Image
from straditize.common import rgba2rgb
import numpy as np
import subprocess as spr
from collections import namedtuple
from import safe_list

from functools import partial

# Check the tesseract version and import tesserocr. If the tesseract version
# is 4.0.*, we have to call locale.setlocale(locale.LC_ALL, 'C') before
# tesserocr is imported.
#
# After this block, two module-level names are defined:
#   tesseract_version: None if the ``tesseract`` executable was not found,
#       otherwise the version string parsed from ``tesseract --version``
#       (an empty string if no version number could be parsed)
#   tesserocr: the imported tesserocr module, or None if the executable or
#       the python package is not available
try:
    tesseract_version = spr.check_output('tesseract --version'.split())
except FileNotFoundError:
    # no tesseract executable on the PATH: text recognition is disabled
    tesseract_version = tesserocr = None
else:
    _version_matches = re.findall(
        r'\d+\.\d+\.*', tesseract_version.decode('utf-8'))
    # fall back to an empty string instead of failing the whole import with
    # an IndexError when the output does not contain a version number
    tesseract_version = _version_matches[0] if _version_matches else ''
    del _version_matches
    if tesseract_version.startswith('4.0.'):
        import locale
        locale.setlocale(locale.LC_ALL, 'C')
    try:
        import tesserocr
    except ImportError:
        tesserocr = None

# Plain (x, y, w, h) record that :class:`Bbox` extends with named edges
_Bbox = namedtuple('_Bbox', tuple('xywh'))


class Bbox(_Bbox):
    """A bounding box for a column name

    The box is stored as ``(x, y, w, h)``. The :attr:`top` is ``y`` and the
    :attr:`bottom` is ``y + h``, the :attr:`left` is ``x`` and the
    :attr:`right` is ``x + w``."""

    @property
    def top(self):
        """The top of the box"""
        return self.y

    @property
    def bottom(self):
        """The bottom of the box"""
        return self.y + self.h

    @property
    def right(self):
        """The right edge of the box"""
        return self.x + self.w

    @property
    def left(self):
        """The left edge of the box"""
        return self.x

    @property
    def bounds(self):
        """A list ``[x, y, width, height]``"""
        return list(self)

    @property
    def extents(self):
        """A list ``[x0, x1, y0, y1]`` with ``x0 <= x1`` and ``y0 <= y1``"""
        return sorted([self.x0, self.x1]) + sorted([self.y0, self.y1])

    @property
    def crop_extents(self):
        """The extents necessary for PIL.Image.crop"""
        return self.left, self.top, self.right, self.bottom

    @property
    def corners(self):
        """A np.ndarray of shape (4, 2) with the corners of the box"""
        return np.array([
            [self.left, self.bottom],
            [self.left, self.top],
            [self.right, self.bottom],
            [self.right, self.top],
        ])

    @property
    def x0(self):
        """The left edge"""
        return self.left

    @property
    def height(self):
        """The (positive) height"""
        return abs(self.h)

    @property
    def width(self):
        """The (positive) width"""
        return abs(self.w)

    @property
    def x1(self):
        """The right edge"""
        return self.right

    @property
    def y0(self):
        """The lower (bottom) edge"""
        return self.bottom

    @property
    def y1(self):
        """The upper (top) edge"""
        return self.top

    @classmethod
    def from_dict(cls, d):
        """Construct a box from the dictionary

        Parameters
        ----------
        d: dict
            A mapping with the keys ``'x'``, ``'y'``, ``'w'`` and ``'h'``

        Returns
        -------
        Bbox
            The box made from the values in `d`"""
        return cls(**d)
class ColNamesReader(object):
    """A class to recognize the text in an image

    This object handles the column names in the :attr:`column_names`
    attribute. It also implements several algorithms to automatically read in
    the column names using the tesserocr package. In particular these are the
    :meth:`recognize_text` method to read in one small image and the
    :meth:`find_colnames` method to find the column names automatically."""

    _images = None

    _boxes = None

    #: The RGBA :class:`PIL.Image.Image` that stores the column names
    image = None

    #: Boolean flag. If True, the data part is masked out in the
    #: :attr:`highres_image`
    ignore_data_part = True

    #: The vertical data limits of the data part that shall be excluded in the
    #: :attr:`highres_image` if the :attr:`ignore_data_part` is True
    data_ylim = None

    @property
    def highres_image(self):
        """The :attr:`image` attribute with higher resolution and with masked
        out data part if the :attr:`ignore_data_part` attribute is True and
        the :attr:`data_ylim` attribute is not None. The data part is then set
        to white with 0 alpha"""
        ret = (self.image if self._highres_image is None else
               self._highres_image)
        if self.data_ylim is not None and self.ignore_data_part:
            arr = np.array(ret)
            # scale the limits from the resolution of the image to the
            # resolution of the returned image
            ylim = self.data_ylim * ret.size[1] / self.image.size[1]
            arr[slice(*ylim.astype(int)), :, :-1] = 255
            arr[slice(*ylim.astype(int)), :, -1] = 0
            ret = Image.fromarray(arr)
        return ret

    @highres_image.setter
    def highres_image(self, value):
        """The :attr:`image` attribute with higher resolution and with masked
        out data part if the :attr:`ignore_data_part` attribute is True and
        the :attr:`data_ylim` attribute is not None. The data part is then set
        to white with 0 alpha"""
        self._highres_image = value

    _highres_image = None

    @property
    def column_names(self):
        """The names of the columns"""
        nnames = len(self._column_names)
        ncols = len(self.column_bounds)
        if nnames < ncols:
            # columns without a name are named after their column number
            self._column_names += list(map(str, range(nnames, ncols)))
        return self._column_names[:ncols]

    @column_names.setter
    def column_names(self, value):
        """The names of the columns"""
        self._column_names = value

    @property
    def colpics(self):
        """The pictures of the column names"""
        npics = len(self._colpics)
        ncols = len(self.column_bounds)
        if npics < ncols:
            # columns without a picture are filled up with None
            self._colpics += [None] * (ncols - npics)
        return self._colpics[:ncols]

    @colpics.setter
    def colpics(self, value):
        """The pictures of the column names"""
        self._colpics = value

    @property
    def rotated_image(self):
        """The rotated :attr:`image` based on the :meth:`rotate_image` method
        """
        return self.rotate_image(self.image)

    def __init__(self, image, bounds, rotate=45, mirror=False, flip=False,
                 highres_image=None, data_ylim=None):
        """
        Parameters
        ----------
        image: PIL.Image.Image
            The RGBA image that has the same shape as the original
            stratigraphic diagram
        bounds: np.ndarray of shape (N, 2)
            The boundaries for each column. These are essential for the
            :meth:`find_colnames` and the :meth:`highlight_column` methods
        rotate: float
            An angle between 0 and 90 that corresponds to the rotation of the
            column names
        mirror: bool
            If True, the image is mirrored (horizontally)
        flip: bool
            If True, the image is flipped (vertically)
        highres_image: PIL.Image.Image
            A high resolution version of the `image` with the same
            width-to-height ratio
        data_ylim: tuple (y0, y1)
            The vertical data limits of the data part that should be ignored
            in the :meth:`find_colnames` method if the
            :attr:`ignore_data_part` is True
        """
        from PIL import Image
        try:
            mode = image.mode
        except AttributeError:
            # not a PIL image (no ``mode`` attribute): interpret it as an
            # RGBA array
            image = Image.fromarray(image, mode='RGBA')
        else:
            if mode != 'RGBA':
                image = image.convert('RGBA')
        self.image = image
        self.column_bounds = bounds
        self.rotate = rotate
        self.mirror = mirror
        self.flip = flip
        if highres_image is not None:
            try:
                mode = highres_image.mode
            except AttributeError:
                highres_image = Image.fromarray(highres_image, mode='RGBA')
            else:
                if mode != 'RGBA':
                    highres_image = highres_image.convert('RGBA')
            self.highres_image = highres_image
        self.data_ylim = None if data_ylim is None else np.asarray(data_ylim)
        self._column_names = []
        self._colpics = []

    def __reduce__(self):
        """Pickle support

        Returns the class, the positional arguments for :meth:`__init__`
        and a dictionary with the remaining state (column pictures, column
        names, high resolution image and vertical data limits)"""
        return (
            self.__class__,
            (self.image, self.column_bounds, self.rotate, self.mirror,
             self.flip),
            {'_colpics': self._colpics,
             '_column_names': self._column_names,
             '_highres_image': self._highres_image,
             'data_ylim': self.data_ylim})

    def close(self):
        """Close the column names reader

        This clears the column pictures and names, closes the :attr:`image`
        and, if set, the high resolution image, and deletes the
        corresponding instance attributes"""
        self._colpics.clear()
        self._column_names.clear()
        self.image.close()
        del self.image
        if self._highres_image is not None:
            self._highres_image.close()
            del self._highres_image

    #: Dimensions and attributes of the variables that are created by the
    #: :meth:`create_variable` method
    nc_meta = {
        'colnames_image': {
            'dims': ('ycolname', 'xcolname', 'rgba'),
            'long_name': 'RGBA images for column names reader',
            'units': 'color'},
        'colnames_hr_image': {
            'dims': ('ycolname_hr', 'xcolname_hr', 'rgba'),
            'long_name': "Highres image for column names reader",
            'units': 'color'},
        'colnames_bounds': {
            'dims': ('column', 'limit'),
            'units': 'px',
            'long_name': ('The boundaries of the columns for the column names '
                          'reader')},
        'colname': {
            'dims': 'column',
            'long_name': 'Name of the columns'},
        'colpic': {
            'dims': ('column', 'colpic_y', 'colpic_x', 'rgba'),
            'long_name': 'The pictures of the column names',
            'units': 'color'},
        'colpic_extents': {
            'dims': ('column', 'limit'),
            'long_name': 'The limits of the column names pictures',
            'units': 'px'},
        'rotate_colnames': {
            'dims': (),
            'long_name': 'The rotation angle for column names'},
        'mirror_colnames': {
            'dims': (),
            'long_name': "Mirror the column names picture (horizontally)"},
        'flip_colnames': {
            'dims': (),
            'long_name': "Flip the column names picture (vertically)"},
        }

    def create_variable(self, ds, vname, data, **kwargs):
        """Insert the data into a variable in an :class:`xr.Dataset`

        Parameters
        ----------
        ds: xarray.Dataset
            The dataset to insert the data into
        vname: str
            The name of the variable. It must be a key in the :attr:`nc_meta`
            dictionary
        data: array-like
            The data for the variable
        ``**kwargs``
            Used as the key to index the variable if `vname` is already in
            `ds`. Ignored if a new variable is created

        Returns
        -------
        str
            The given `vname`"""
        attrs = self.nc_meta[vname].copy()
        dims = safe_list(attrs.pop('dims', vname))
        if vname in ds:
            ds.variables[vname][kwargs] = data
        else:
            v = xr.Variable(dims, np.asarray(data), attrs=attrs)
            ds[vname] = v
        return vname

    def get_colpic(self, x0, y0, x1, y1):
        """Extract the picture of the column name

        Parameters
        ----------
        x0: int
            The left edge
        y0: int
            The upper edge
        x1: int
            The right edge
        y1: int
            The lower edge

        Returns
        -------
        PIL.Image.Image
            The part of the rotated :attr:`highres_image` cropped out from
            the given parameters"""
        hr = self.highres_image
        image = self.rotate_image(hr)
        xs_hr, ys_hr = hr.size
        xs, ys = self.image.size
        # transform back to the unrotated image, scale to the resolution of
        # the high resolution image and rotate again
        x01, y01 = self.transform_point(x0, y0, invert=True)
        x11, y11 = self.transform_point(x1, y1, invert=True)
        x02, y02 = self.transform_point(
            x01 * xs_hr / xs, y01 * ys_hr / ys, image=hr)
        x12, y12 = self.transform_point(
            x11 * xs_hr / xs, y11 * ys_hr / ys, image=hr)
        return image.crop([x02, y02, x12, y12])

    def to_dataset(self, ds=None):
        """All the necessary data as a :class:`xarray.Dataset`

        Parameters
        ----------
        ds: xarray.Dataset
            The dataset in which to insert the data. If None, a new one will
            be created

        Returns
        -------
        xarray.Dataset
            Either the given `ds` or a new :class:`xarray.Dataset` instance"""
        if ds is None:
            ds = xr.Dataset()
        self.create_variable(ds, 'colnames_image', self.image)
        if self._highres_image is not None:
            self.create_variable(ds, 'colnames_hr_image', self._highres_image)
        self.create_variable(ds, 'colnames_bounds', self.column_bounds)
        self.create_variable(ds, 'colname', self.column_names)
        self.create_variable(ds, 'rotate_colnames', self.rotate)
        self.create_variable(ds, 'mirror_colnames', self.mirror)
        self.create_variable(ds, 'flip_colnames', self.flip)
        pics = self.colpics
        if any(pics):
            # (height, width) of each picture, (0, 0) for missing pictures
            extents = np.array([pic.size[::-1] if pic else (0, 0)
                                for pic in pics])
            self.create_variable(ds, 'colpic_extents', extents)
            colpics_shp = (len(extents), ) + tuple(extents.max(axis=0)) + (4, )
            # take the dtype from the first picture that is available. The
            # first entry in the list might be None, which would otherwise
            # result in an array of dtype object
            dtype = next(np.asarray(pic).dtype for pic in pics if pic)
            colpics = np.zeros(colpics_shp, dtype=dtype)
            for i, (pic, (ys, xs)) in enumerate(zip(pics, extents)):
                if pic:
                    colpics[i, :ys, :xs, :] = np.asarray(pic)
            self.create_variable(ds, 'colpic', colpics)
        return ds

    @classmethod
    def from_dataset(cls, ds):
        """Create a :class:`ColNamesReader` for a xarray.Dataset

        Parameters
        ----------
        ds: xarray.Dataset
            The dataset as obtained from the :meth:`to_dataset` method

        Returns
        -------
        ColNamesReader
            The reader that has been created from the variables in `ds`"""
        from PIL import Image
        ret = cls(ds['colnames_image'].values, ds['colnames_bounds'].values,
                  rotate=ds['rotate_colnames'].values,
                  mirror=bool(ds['mirror_colnames'].values),
                  flip=bool(ds['flip_colnames'].values))
        if 'colnames_hr_image' in ds:
            ret.highres_image = Image.fromarray(
                ds['colnames_hr_image'].values, mode='RGBA')
        ret._column_names = list(ds['colname'].values)
        if 'colpic' in ds:
            # pictures with a zero extent have been missing (None) when the
            # dataset has been created
            ret._colpics = [
                Image.fromarray(arr[:ys, :xs].values, mode='RGBA')
                if xs and ys else None
                for arr, (ys, xs) in zip(ds['colpic'],
                                         ds['colpic_extents'].values)]
        if 'data_lims' in ds:
            ret.data_ylim = ds['data_lims'].sel(axis='y').values
        return ret

    def transform_point(self, x, y, invert=False, image=None):
        """Transform a point between un-rotated and rotated coordinate system

        Parameters
        ----------
        x: float
            The x-coordinate of the point in the source coordinate system
        y: float
            The y-coordinate of the point in the source coordinate system
        invert: bool
            If True, the source coordinate system is the rotated one (i.e.
            this method transform from the :attr:`rotated_image` to the
            coordinate system of the :attr:`image`), other wise from the
            :attr:`image` to the :attr:`rotated_image`
        image: PIL.Image.Image
            The unrotated source image. If None, the :attr:`image` is used.
            This image defines the source coordinate system (or the target
            coordinate system if `invert` is True)

        Returns
        -------
        float
            The transformed `x`-coordinate
        float
            The transformed `y`-coordinate
        """
        import matplotlib.transforms as mt

        def apply(transformation, px, py):
            # ``Transform.transform`` on a (1, 2) array is used instead of
            # ``transform_point``, which is not available in every
            # matplotlib version
            points = np.array([[px, py]], dtype=float)
            return transformation.transform(points)[0]

        angle = np.deg2rad(self.rotate)
        if image is None:
            image = self.image
        xs, ys = image.size
        trans = mt.Affine2D().rotate(angle).translate(ys*np.sin(angle), 0)
        if invert:
            x, y = apply(trans.inverted(), x, y)
        if self.mirror:
            x = xs - x
        if self.flip:
            y = ys - y
        if invert:
            return x, y
        else:
            return apply(trans, x, y)

    def navigate_to_col(self, col, ax):
        """Navigate to the specified column

        Change the x- and y-limits of the `ax` to display the given `col`
        based on the :attr:`column_bounds`

        Parameters
        ----------
        col: int
            The column number
        ax: matplotlib.axes.Axes
            The matplotlib axes for which to update the limits. This `ax` is
            expected to show the :attr:`rotated_image`"""
        xmin, xmax = ax.get_xlim()
        ymin, ymax = ax.get_ylim()
        dx = (xmax - xmin) / 2.
        dy = (ymax - ymin) / 2.
        xc = xmin + dx
        yc = ymin + dy
        # keep the vertical position of the current center (in the unrotated
        # image) and move horizontally to the center of the column
        xc_t, yc_t = self.transform_point(xc, yc, True)
        xc_col = np.mean(self.column_bounds[col])
        xc_new, yc_new = self.transform_point(xc_col, yc_t)
        ax.set_xlim(xc_new - dx, xc_new + dx)
        ax.set_ylim(yc_new - dy, yc_new + dy)

    def highlight_column(self, col, ax):
        """Highlight the column in the given axes displaying the
        :attr:`rotated_image`

        This method draws a rotated rectangle highlighting the given column
        `col` in the given `ax`.

        Parameters
        ----------
        col: int
            The column number
        ax: matplotlib.axes.Axes
            The matplotlib axes on which to plot the rectangle. This `ax` is
            expected to show the :attr:`rotated_image`

        Returns
        -------
        matplotlib.patches.Rectangle
            The rectangle that has been added to the `ax`"""
        import matplotlib.patches as patches
        import matplotlib as mpl
        xmin, xmax = self.column_bounds[col]
        xs, ys = self.image.size
        if self.mirror:
            xmin, xmax = xs - xmax, xs - xmin
        angle = np.deg2rad(self.rotate)
        x = ys * np.sin(angle) + xmin * np.cos(angle)
        y = xmin * np.sin(angle)
        patch = patches.Rectangle((x, y), xmax-xmin, ys, color="red",
                                  alpha=0.50)
        tr = mpl.transforms.Affine2D().rotate_around(x, y, angle)
        patch.set_transform(tr + ax.transData)
        ax.add_patch(patch)
        return patch

    def rotate_image(self, image):
        """Modify an image with :attr:`rotate`, :attr:`flip`, :attr:`mirror`

        This method rotated, mirrors and/or flips the given `image` based on
        the :attr:`rotate`, :attr:`mirror` and :attr:`flip` attributes

        Parameters
        ----------
        image: PIL.Image.Image
            The source image

        Returns
        -------
        PIL.Image.Image
            The target image
        """
        ret = image
        if self.mirror:
            ret = ImageOps.mirror(ret)
        if self.flip:
            ret = ImageOps.flip(ret)
        ret = ret.rotate(-self.rotate, expand=True)
        return ret

    def recognize_text(self, image):
        """Recognize the text in an image using tesserocr

        This method uses the :func:`tesserocr.image_to_text` to read in the
        text in a given `image`

        Parameters
        ----------
        image: PIL.Image.Image
            The image to read in

        Returns
        -------
        str
            The text found in it without newline characters

        Raises
        ------
        ImportError
            If the tesserocr module is not available"""
        if tesserocr is None:
            raise ImportError("tesserocr module not found!")
        if image.mode == 'RGBA':
            image = rgba2rgb(image)
        return tesserocr.image_to_text(image).strip().replace('\n', ' ')

    def find_colnames(self, extents=None):
        """Find the names for the columns using tesserocr

        Parameters
        ----------
        extents: list of floats (x0, y0, x1, y1)
            The extents to crop the :attr:`rotated_image`. We only look for
            column names in this image. If None, the entire image is used.
            The given object is not modified

        Returns
        -------
        dict
            A mapping from column number to a string (the column name)
        dict
            A mapping from column number to a :class:`PIL.Image.Image` (the
            image of the column name)
        dict
            A mapping from column number to a :class:`Bbox` (the bounding box
            of the corresponding column name)

        Raises
        ------
        ImportError
            If the tesserocr module is not available"""

        def get_overlap(col, box):
            # horizontal overlap (in the unrotated high resolution image) of
            # the left edge of the box with the given column
            s, e = bounds[col]
            xmin = self.transform_point(
                box.x0 + x_off, box.y0 + y_off, invert=True, image=hr)[0]
            xmax = self.transform_point(
                box.x0 + x_off, box.y1 + y_off, invert=True, image=hr)[0]
            xmin, xmax = sorted([xmin, xmax])
            return max(min(e, xmax) - max(s, xmin), 0)

        def vbox_distance(b1, b2):
            # vertical distance of two boxes that overlap horizontally
            if b1.left > b2.right or b1.right < b2.left:
                return np.inf  # no overlap
            return min(abs(b1.top - b2.bottom), abs(b2.top - b1.bottom))

        if tesserocr is None:
            raise ImportError("tesserocr module not found!")

        bounds = self.column_bounds
        cols = list(range(len(bounds)))
        rotated = self.rotated_image
        hr = self.highres_image
        rotated_hr = self.rotate_image(hr)
        # scaling factors from the rotated image to the rotated highres image
        fx, fy = np.round(
            np.array(rotated_hr.size) / rotated.size).astype(int)
        bounds = bounds * fx
        if extents is None:
            image = rotated_hr
            # nothing is cropped, so the boxes have no offset
            x_off = y_off = 0
        else:
            # work on a copy to not change the array of the caller
            extents = np.array(extents)
            extents[::2] *= fx
            extents[1::2] *= fy
            image = rotated_hr.crop(extents)
            # offset of the cropped image within the rotated highres image
            x_off, y_off = extents[:2]

        if tesseract_version and tesseract_version.startswith('4.0.'):
            # LC_ALL might have been changed by some other module, so we set
            # it here again to "C"
            import locale
            locale.setlocale(locale.LC_ALL, 'C')

        texts = {}
        images = {}
        with tesserocr.PyTessBaseAPI() as api:
            api.SetImage(rgba2rgb(image))
            im_boxes = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
            for im, d, _, _ in im_boxes:
                box = Bbox(**d)
                if not any(get_overlap(col, box) for col in cols):
                    continue
                # expand the image to improve text recognition
                im = ImageOps.expand(
                    rgba2rgb(image.crop(box.crop_extents)),
                    int(im.size[1] / 2.), (255, 255, 255))
                text = tesserocr.image_to_text(im).strip()
                if len(text) >= 3:
                    texts[box] = text
                    images[box] = im.convert('RGBA')
        if not texts:
            return {}, {}, {}

        # merge boxes that are vertically closer than half an em. This is
        # repeated until no more boxes have been merged
        em = min(b.h for b in texts)
        merged = {None}
        while merged:
            merged = set()
            for b1, t in list(texts.items()):
                if b1 in merged:
                    continue
                col = max(cols, key=partial(get_overlap, box=b1))
                for b2, t in list(texts.items()):
                    if (b1 is b2 or b2 in merged or
                            not get_overlap(col, b2) or
                            vbox_distance(b1, b2) > 0.5*em):
                        continue
                    merged.update([b1, b2])
                    box = Bbox(min(b1.x, b2.x), min(b1.y, b2.y),
                               max(b1.x1, b2.x1) - min(b1.x0, b2.x0),
                               max(b1.y0, b2.y0) - min(b1.y1, b2.y1))
                    # words that end with a hyphen are joined without space
                    texts[box] = texts[b1] + (
                        ' ' if not texts[b1].endswith('-') else '') + texts[b2]
                    images[box] = image.crop(box.crop_extents)
                    b1 = box
            for b in merged:
                del texts[b], images[b]

        # get a mapping from box to column from the overlap
        boxes = dict(filter(
            lambda t: get_overlap(*t),
            ((col, max(texts, key=partial(get_overlap, col)))
             for col in range(len(bounds)))))

        return (
            {col: texts[box] for col, box in boxes.items()},
            {col: images[box] for col, box in boxes.items()},
            {col: Bbox((x_off + b.x0) / fx, (y_off + b.y) / fy,
                       b.w / fx, b.h / fy)
             for col, b in boxes.items()})