Source code for straditize.colnames

# -*- coding: utf-8 -*-
"""Module for text recognition

**Disclaimer**

Copyright (C) 2018-2019  Philipp S. Sommer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import re
import xarray as xr
from PIL import ImageOps, Image
from straditize.common import rgba2rgb
import numpy as np
import subprocess as spr
from collections import namedtuple
from psyplot.data import safe_list

from functools import partial


# check tesseract version and import tesserocr. If the tesseract version
# is 4.0.*, then we have to locale.setlocale(locale.LC_ALL, 'C')
# (see https://github.com/sirfz/tesserocr/issues/137)
try:
    tesseract_version = spr.check_output('tesseract --version'.split())
except FileNotFoundError:
    tesseract_version = tesserocr = None
else:
    tesseract_version = re.findall(
        '\d+\.\d+\.*', tesseract_version.decode('utf-8'))[0]
    if tesseract_version.startswith('4.0.'):
        import locale
        locale.setlocale(locale.LC_ALL, 'C')
    try:
        import tesserocr
    except ImportError:
        tesserocr = None


_Bbox = namedtuple('_Bbox', tuple('xywh'))


[docs]class Bbox(_Bbox):
    """A bounding box for a column name"""

    @property
    def top(self):
        """The top of the box"""
        return self.y

    @property
    def bottom(self):
        """The bottom of the box"""
        return self.y + self.h

    @property
    def right(self):
        """The right edge of the box"""
        return self.x + self.w

    @property
    def left(self):
        """The left edge of the box"""
        return self.x

    @property
    def bounds(self):
        """A list ``[x, y, width, height]``"""
        return list(self)

    @property
    def extents(self):
        """A list ``[x0, x1, y0, y1]`` with ``x0 <= x1`` and ``y0 <= y1``"""
        return sorted([self.x0, self.x1]) + sorted([self.y0, self.y1])

    @property
    def crop_extents(self):
        """The extents necessary for PIL.Image.crop"""
        return self.left, self.top, self.right, self.bottom

    @property
    def corners(self):
        """A np.ndarray of shape (4, 2) with the corners of the box"""
        return np.array([
            [self.left, self.bottom],
            [self.left, self.top],
            [self.right, self.bottom],
            [self.right, self.top]
            ])

    @property
    def x0(self):
        """The left edge"""
        return self.left

    @property
    def height(self):
        """The (positive) height"""
        return abs(self.h)

    @property
    def width(self):
        """The (positive) width"""
        return abs(self.w)

    @property
    def x1(self):
        """The right edge"""
        return self.right

    @property
    def y0(self):
        """The lower (bottom) edge"""
        return self.bottom

    @property
    def y1(self):
        """The upper (top) edge"""
        return self.top

[docs]    @classmethod
    def from_dict(cls, d):
        """Construct a box from the dictionary"""
        return cls(**d)


[docs]class ColNamesReader(object):
    """A class to recognize the text in an image

    This object handles the column names in the :attr:`column_names` attribute.
    It also implements several algorithms to automatically read in the column
    names using the tesserocr package. In particular these are the
    :meth:`recognize_text` method to read in one small image and the
    :meth:`find_colnames` method to find the column names automatically."""

    _images = None

    _boxes = None

    #: The RGBA :class:`PIL.Image.Image` that stores the column names
    image = None

    #: Boolean flag. If True, the data part is masked out in the
    #: :attr:`highres_image`
    ignore_data_part = True

    #: The vertical data limits of the data part that shall be exluded in the
    #: :attr:`highres_image` if the :attr:`ignore_data_part` is True
    data_ylim = None

    @property
    def highres_image(self):
        """The :attr:`image` attribute with higher resolution and with masked
        out data part if the :attr:`ignore_data_part` attribute is True and the
        :attr:`data_ylim` attribute is not None. The data part is then set to
        white with 0 alpha"""
        ret = (self.image if self._highres_image is None else
               self._highres_image)
        if self.data_ylim is not None and self.ignore_data_part:
            arr = np.array(ret)
            ylim = self.data_ylim * ret.size[1] / self.image.size[1]
            arr[slice(*ylim.astype(int)), :, :-1] = 255
            arr[slice(*ylim.astype(int)), :, -1] = 0
            ret = Image.fromarray(arr)
        return ret

    @highres_image.setter
    def highres_image(self, value):
        """The :attr:`image` attribute with higher resolution and with masked
        out data part if the :attr:`ignore_data_part` attribute is True and the
        :attr:`data_ylim` attribute is not None. The data part is then set to
        white with 0 alpha"""
        self._highres_image = value

    _highres_image = None

    @property
    def column_names(self):
        """The names of the columns"""
        nnames = len(self._column_names)
        ncols = len(self.column_bounds)
        if nnames < ncols:
            self._column_names += list(map(str, range(nnames, ncols)))
        return self._column_names[:ncols]

    @column_names.setter
    def column_names(self, value):
        """The names of the columns"""
        self._column_names = value

    @property
    def colpics(self):
        """The pictures of the column names"""
        npics = len(self._colpics)
        ncols = len(self.column_bounds)
        if npics < ncols:
            self._colpics += [None] * (ncols - npics)
        return self._colpics[:ncols]

    @colpics.setter
    def colpics(self, value):
        """The pictures of the column names"""
        self._colpics = value

    @property
    def rotated_image(self):
        """The rotated :attr:`image` based on the :meth:`rotate_image` method
        """
        return self.rotate_image(self.image)

    def __init__(self, image, bounds, rotate=45, mirror=False, flip=False,
                 highres_image=None, data_ylim=None):
        """
        Parameters
        ----------
        image: PIL.Image.Image
            The RGBA image that has the same shape as the original
            stratigraphic diagram
        bounds: np.ndarray of shape (N, 2)
            The boundaries for each column. These are essential for the
            :meth:`find_colnames` and the :meth:`highlight_column` methods
        rotate: float
            An angle between 0 and 90 that corresponds to the rotation of the
            column names
        mirror: bool
            If True, the image is mirrored (horizontally)
        flip: bool
            If True, the image is flipped (vertically)
        highres_image: PIL.Image.Image
            A high resolution version of the `image` with the same
            width-to-height ratio
        data_ylim: tuple (y0, y1)
            The vertical data limits of the data part that should be ignored
            in the :meth:`find_colnames` method if the :attr:`ignore_data_part`
            is True
        """
        from PIL import Image

        try:
            mode = image.mode
        except AttributeError:
            image = Image.fromarray(image, mode='RGBA')
        else:
            if mode != 'RGBA':
                image = image.convert('RGBA')
        self.image = image
        self.column_bounds = bounds
        self.rotate = rotate
        self.mirror = mirror
        self.flip = flip
        if highres_image is not None:
            try:
                mode = highres_image.mode
            except AttributeError:
                highres_image = Image.fromarray(highres_image, mode='RGBA')
            else:
                if mode != 'RGBA':
                    highres_image = highres_image.convert('RGBA')
        self.highres_image = highres_image
        self.data_ylim = None if data_ylim is None else np.asarray(data_ylim)
        self._column_names = []
        self._colpics = []

    def __reduce__(self):
        return (
            self.__class__,
            (self.image, self.column_bounds, self.rotate, self.mirror,
             self.flip),
            {'_colpics': self._colpics,
             '_column_names': self._column_names,
             '_highres_image': self._highres_image,
             'data_ylim': self.data_ylim})

[docs]    def close(self):
        """Close the column names reader"""
        self._colpics.clear()
        self._column_names.clear()
        self.image.close()
        del self.image
        if self._highres_image is not None:
            self._highres_image.close()
            del self._highres_image

    nc_meta = {
        'colnames_image': {
            'dims': ('ycolname', 'xcolname', 'rgba'),
            'long_name': 'RGBA images for column names reader',
            'units': 'color'},
        'colnames_hr_image': {
            'dims': ('ycolname_hr', 'xcolname_hr', 'rgba'),
            'long_name': "Highres image for column names reader",
            'units': 'color'},
        'colnames_bounds': {
            'dims': ('column', 'limit'), 'units': 'px',
            'long_name': ('The boundaries of the columns for the column names '
                          'reader')},
        'colname': {
            'dims': 'column', 'long_name': 'Name of the columns'},
        'colpic': {
            'dims': ('column', 'colpic_y', 'colpic_x', 'rgba'),
            'long_name': 'The pictures of the column names', 'units': 'color'},
        'colpic_extents': {
            'dims': ('column', 'limit'),
            'long_name': 'The limits of the column names pictures',
            'units': 'px'},
        'rotate_colnames': {
            'dims': (), 'long_name': 'The rotation angle for column names'},
        'mirror_colnames': {
            'dims': (),
            'long_name': "Mirror the column names picture (horizontally)"},
        'flip_colnames': {
            'dims': (),
            'long_name': "Flip the column names picture (vertically)"},
        }

[docs]    def create_variable(self, ds, vname, data, **kwargs):
        """Insert the data into a variable in an :class:`xr.Dataset`"""
        attrs = self.nc_meta[vname].copy()
        dims = safe_list(attrs.pop('dims', vname))
        if vname in ds:
            ds.variables[vname][kwargs] = data
        else:
            v = xr.Variable(dims, np.asarray(data), attrs=attrs)
            ds[vname] = v
        return vname

[docs]    def get_colpic(self, x0, y0, x1, y1):
        """Extract the picture of the column name

        Parameters
        ----------
        x0: int
            The left edge
        y0: int
            The upper edge
        x1: int
            The right edge
        y1: int
            The lower edge

        Returns
        -------
        PIL.Image.Image
            The part of the rotated :attr:`highres_image` cropped out from the
            given parameters"""
        hr = self.highres_image
        image = self.rotate_image(hr)
        xs_hr, ys_hr = hr.size
        xs, ys = self.image.size
        x01, y01 = self.transform_point(x0, y0, invert=True)
        x11, y11 = self.transform_point(x1, y1, invert=True)
        x02, y02 = self.transform_point(
            x01 * xs_hr / xs, y01 * ys_hr / ys, image=hr)
        x12, y12 = self.transform_point(
            x11 * xs_hr / xs, y11 * ys_hr / ys, image=hr)
        return image.crop([x02, y02, x12, y12])

[docs]    def to_dataset(self, ds=None):
        """All the necessary data as a :class:`xarray.Dataset`

        Parameters
        ----------
        ds: xarray.Dataset
            The dataset in which to insert the data. If None, a new one will be
            created

        Returns
        -------
        xarray.Dataset
            Either the given `ds` or a new :class:`xarray.Dataset` instance"""
        if ds is None:
            ds = xr.Dataset()
        self.create_variable(ds, 'colnames_image', self.image)
        if self._highres_image is not None:
            self.create_variable(ds, 'colnames_hr_image', self._highres_image)
        self.create_variable(ds, 'colnames_bounds', self.column_bounds)
        self.create_variable(ds, 'colname', self.column_names)
        self.create_variable(ds, 'rotate_colnames', self.rotate)
        self.create_variable(ds, 'mirror_colnames', self.mirror)
        self.create_variable(ds, 'flip_colnames', self.flip)
        if any(self.colpics):
            extents = np.array([colpic.size[::-1] if colpic else (0, 0)
                                for colpic in self.colpics])
            self.create_variable(ds, 'colpic_extents', extents)
            colpics_shp = (len(extents), ) + tuple(extents.max(axis=0)) + (4, )
            colpics = np.zeros(
                colpics_shp,
                dtype=next(np.asarray(pic).dtype for pic in self.colpics))
            for i, (pic, (ys, xs)) in enumerate(zip(self.colpics, extents)):
                colpics[i, :ys, :xs, :] = np.asarray(pic)
            self.create_variable(ds, 'colpic', colpics)
        return ds

[docs]    @classmethod
    def from_dataset(cls, ds):
        """Create a :class:`ColNamesReader` for a xarray.Dataset

        Parameters
        ----------
        ds: xarray.Dataset
            The dataset as obtained from the :meth:`to_dataset` method"""
        from PIL import Image
        ret = cls(ds['colnames_image'].values, ds['colnames_bounds'].values,
                  rotate=ds['rotate_colnames'].values,
                  mirror=bool(ds['mirror_colnames'].values),
                  flip=bool(ds['flip_colnames'].values))
        if 'colnames_hr_image' in ds:
            ret.highres_image = Image.fromarray(ds['colnames_hr_image'].values,
                                                mode='RGBA')
        ret._column_names = list(ds['colname'].values)
        if 'colpic' in ds:
            ret._colpics = [
                Image.fromarray(arr[:ys, :xs].values, mode='RGBA')
                if xs and ys else None
                for arr, (ys, xs) in zip(ds['colpic'],
                                         ds['colpic_extents'].values)]
        if 'data_lims' in ds:
            ret.data_ylim = ds['data_lims'].sel(axis='y').values
        return ret

[docs]    def transform_point(self, x, y, invert=False, image=None):
        """Transform a point between un-rotated and rotated coordinate system

        Parameters
        ----------
        x: float
            The x-coordinate of the point in the source coordinate system
        y: float
            The y-coordinate of the point in the source coordinate system
        invert: bool
            If True, the source coordinate system is the rotated one (i.e.
            this method transform from the :attr:`rotated_image` to the
            coordinate system of the :attr:`image`), other wise from the
            :attr:`image` to the :attr:`rotated_image`
        image: PIL.Image.Image
            The unrotated source image. If None, the :attr:`image` is used.
            This image defines the source coordinate system (or the target
            coordinate system if `invert` is True)

        Returns
        -------
        float
            The transformed `x`-coordinate
        float
            The transformed `y`-coordinate
        """
        import matplotlib.transforms as mt
        angle = np.deg2rad(self.rotate)
        if image is None:
            image = self.image
        xs, ys = image.size
        trans = mt.Affine2D().rotate(angle).translate(ys*np.sin(angle), 0)
        if invert:
            x, y = trans.inverted().transform_point([x, y])
        if self.mirror:
            x = xs - x
        if self.flip:
            y = ys - y
        if invert:
            return x, y
        else:
            return trans.transform_point([x, y])

[docs]    def navigate_to_col(self, col, ax):
        """Navigate to the specified column

        Change the x- and y-limits of the `ax` to display the given `col` based
        on the :attr:`column_bounds`

        Parameters
        ----------
        col: int
            The column number
        ax: matplotlib.axes.Axes
            The matplotlib axes for which to update the limits. This `ax` is
            expected to show the :attr:`rotated_image`"""
        xmin, xmax = ax.get_xlim()
        ymin, ymax = ax.get_ylim()
        dx = (xmax - xmin) / 2.
        dy = (ymax - ymin) / 2.
        xc = xmin + dx
        yc = ymin + dy
        xc_t, yc_t = self.transform_point(xc, yc, True)
        xc_col = np.mean(self.column_bounds[col])
        xc_new, yc_new = self.transform_point(xc_col, yc_t)
        ax.set_xlim(xc_new - dx, xc_new + dx)
        ax.set_ylim(yc_new - dy, yc_new + dy)

[docs]    def highlight_column(self, col, ax):
        """Highlight the column in the given axes displaying the
        :attr:`rotated_image`

        This method draws a rotated rectangle highlighting the given column
        `col` in the given `ax`.

        Parameters
        ----------
        col: int
            The column number
        ax: matplotlib.axes.Axes
            The matplotlib axes on which to plot the rectangle. This `ax` is
            expected to show the :attr:`rotated_image`"""
        import matplotlib.patches as patches
        import matplotlib as mpl
        xmin, xmax = self.column_bounds[col]
        xs, ys = self.image.size
        if self.mirror:
            xmin, xmax = xs - xmax, xs - xmin
        angle = np.deg2rad(self.rotate)
        x = ys * np.sin(angle) + xmin * np.cos(angle)
        y = xmin * np.sin(angle)
        patch = patches.Rectangle((x, y), xmax-xmin, ys, color="red",
                                  alpha=0.50)
        tr = mpl.transforms.Affine2D().rotate_around(x, y, angle)
        patch.set_transform(tr + ax.transData)
        ax.add_patch(patch)

        return patch

[docs]    def rotate_image(self, image):
        """Modify an image with :attr:`rotate`, :attr:`flip`, :attr:`mirror`

        This method rotated, mirrors and/or flips the given `image` based on
        the :attr:`rotate`, :attr:`mirror` and :attr:`flip` attributes

        Parameters
        ----------
        image: PIL.Image.Image
            The source image

        Returns
        -------
        PIL.Image.Image
            The target image
        """
        ret = image
        if self.mirror:
            ret = ImageOps.mirror(ret)
        if self.flip:
            ret = ImageOps.flip(ret)
        ret = ret.rotate(-self.rotate, expand=True)
        return ret

[docs]    def recognize_text(self, image):
        """Recognize the text in an image using tesserocr

        This method uses the :func:`tesserocr.image_to_text` to read in the
        text in a given `image`

        Parameters
        ----------
        image: PIL.Image.Image
            The image to read in

        Returns
        -------
        str
            The text found in it without newline characters"""
        if tesserocr is None:
            raise ImportError("tesserocr module not found!")

        if image.mode == 'RGBA':
            image = rgba2rgb(image)

        return tesserocr.image_to_text(image).strip().replace('\n', ' ')

[docs]    def find_colnames(self, extents=None):
        """Find the names for the columns using tesserocr

        Parameters
        ----------
        extents: list of floats (x0, y0, x1, y1)
            The extents to crop the :attr:`rotated_image`. We only look for
            column names in this image

        Returns
        -------
        dict
            A mapping from column number to a string (the column name)
        dict
            A mapping from column number to a :class:`PIL.Image.Image` (the
            image of the column name)
        dict
            A mapping from column number to a :class:`Bbox` (the bounding box
            of the corresponding column name)"""

        def get_overlap(col, box):
            s, e = bounds[col]
            x0, y0 = extents[:2]
            xmin = self.transform_point(
                box.x0 + x0, box.y0 + y0, invert=True, image=hr)[0]
            xmax = self.transform_point(
                box.x0 + x0, box.y1 + y0, invert=True, image=hr)[0]
            xmin, xmax = sorted([xmin, xmax])
            return max(min(e, xmax) - max(s, xmin), 0)

        def vbox_distance(b1, b2):
            if b1.left > b2.right or b1.right < b2.left:
                return np.inf  # no overlap
            return min(abs(b1.top - b2.bottom), abs(b2.top - b1.bottom))

        if tesserocr is None:
            raise ImportError("tesserocr module not found!")

        bounds = self.column_bounds
        cols = list(range(len(bounds)))
        rotated = self.rotated_image
        hr = self.highres_image
        rotated_hr = self.rotate_image(hr)
        fx, fy = np.round(
            np.array(rotated_hr.size) / rotated.size).astype(int)
        bounds = bounds * fx

        if extents is None:
            image = rotated_hr
            x0 = y0 = 0
        else:
            extents = np.asarray(extents)
            extents[::2] *= fx
            extents[1::2] *= fy
            image = rotated_hr.crop(extents)

            x0, y0 = self.transform_point(
                *extents[:2], image=hr,
                invert=True)

        if tesseract_version.startswith('4.0.'):
            # LC_ALL might have been changed by some other module, so we set
            # it here again to "C"
            import locale
            locale.setlocale(locale.LC_ALL, 'C')

        with tesserocr.PyTessBaseAPI() as api:
            api.SetImage(rgba2rgb(image))
            im_boxes = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
            texts = {}
            images = {}
            for i, (im, d, _, _) in enumerate(im_boxes):
                box = Bbox(**d)
                if not any(get_overlap(col, box) for col in cols):
                    continue
                # expand the image to improve text recognition
                im = ImageOps.expand(rgba2rgb(image.crop(box.crop_extents)),
                                     int(im.size[1] / 2.), (255, 255, 255))
                text = tesserocr.image_to_text(im).strip()
                if len(text) >= 3:
                    texts[box] = text
                    images[box] = im.convert('RGBA')

        if not texts:
            return {}, {}, {}

        # merge boxes that are closer than one 1em
        em = min(b.h for b in texts)
        merged = {None}
        while merged:
            merged = set()
            for b1, t in list(texts.items()):
                if b1 in merged:
                    continue
                col = max(cols, key=partial(get_overlap, box=b1))
                for b2, t in list(texts.items()):
                    if (b1 is b2 or b2 in merged or not get_overlap(col, b2) or
                            vbox_distance(b1, b2) > 0.5*em):
                        continue
                    merged.update([b1, b2])
                    box = Bbox(min(b1.x, b2.x), min(b1.y, b2.y),
                               max(b1.x1, b2.x1) - min(b1.x0, b2.x0),
                               max(b1.y0, b2.y0) - min(b1.y1, b2.y1))
                    texts[box] = texts[b1] + (
                        ' ' if not texts[b1].endswith('-') else '') + texts[b2]
                    images[box] = image.crop(box.crop_extents)
                    b1 = box
            for b in merged:
                del texts[b], images[b]

        # get a mapping from box to column from the overlap
        boxes = dict(filter(
            lambda t: get_overlap(*t),
            ((col, max(texts, key=partial(get_overlap, col)))
             for col in range(len(bounds)))))
        x0, y0 = extents[:2]

        return (
            {col: texts[box] for col, box in boxes.items()},
            {col: images[box] for col, box in boxes.items()},
            {col: Bbox((x0 + b.x0) / fx, (y0 + b.y) / fy, b.w / fx, b.h / fy)
             for col, b in boxes.items()})