Source code for pycognaize.document.tag.table_tag

import bson
import string

import pandas as pd
from typing import Optional, Tuple

from tabulate import tabulate

from pycognaize.common.enums import (
    IqCellKeyEnum,
    IqTableTagEnum,
    IqTagKeyEnum,
    ID
)
from pycognaize.document.tag import ExtractionTag
from pycognaize.document.tag.tag import BoxTag
from pycognaize.document.tag.cell import Cell
from pycognaize.common.utils import convert_coord_to_num
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from pycognaize.document.page import Page



[docs]
class TableTag(BoxTag):
    """Base class for all pycognaize table fields"""

    def __init__(self, left, right, top, bottom,
                 page: 'Page',
                 cell_data: dict):
        """Create a table tag and eagerly parse its cells.

        :param left: left coordinate of the table, forwarded to `BoxTag`
        :param right: right coordinate of the table, forwarded to `BoxTag`
        :param top: top coordinate of the table, forwarded to `BoxTag`
        :param bottom: bottom coordinate of the table, forwarded to `BoxTag`
        :param page: `Page` to which the table belongs
        :param cell_data: raw cells, keyed by ``'<left_col>:<top_row>'``
            strings, each value being the dict describing one cell
        :raises Exception: if `cell_data` is empty
        :raises KeyError: if a cell dict lacks a required key
        :raises ValueError: if several cells resolve to the same position
            of the table grid
        """
        super().__init__(left=left, right=right, top=top, bottom=bottom,
                         page=page)
        self._cell_data = cell_data
        self._cells = {}
        self._populate_cells()
        self._df = None
        # Building the frame up front validates the cell layout at
        # construction time. Keep the result instead of discarding it, so
        # the first access to `raw_df` does not rebuild the same frame.
        self._raw_df = self._build_df()

    def __getitem__(self, val: Tuple[int, int]) -> Cell:
        """Gets the cell through index"""
        if len(val) == 2:
            if any([isinstance(i, slice) for i in val]):
                raise NotImplementedError("Slice lookup not implemented")
            elif val in self.cells.keys():
                return self.cells[val]
            raise IndexError(
                f"No cell with the following index in the table: {val}")
        else:
            raise ValueError(f"Invalid argument {val}")

    @property
    def cell_data(self) -> dict:
        if not self._cell_data:
            raise Exception('Cell data is empty')
        return self._cell_data

    @property
    def cells(self) -> dict:
        return self._cells

    @property
    def raw_df(self) -> pd.DataFrame:
        if self._raw_df is None:
            self._raw_df = self._build_df()
        return self._raw_df

    @property
    def df(self) -> pd.DataFrame:
        if self._df is None:
            if "map" not in dir(self.raw_df):
                self._df = self.raw_df.applymap(
                    lambda x: self._extract_raw_ocr(x))
            else:
                self._df = self.raw_df.map(
                    lambda x: self._extract_raw_ocr(x))
        return self._df

    @staticmethod
    def _extract_raw_ocr(x):
        """Returns OCR extraction data"""
        try:
            return x.raw_ocr_value
        except AttributeError:
            return ''


[docs]
    @classmethod
    def construct_from_raw(cls, raw: dict, page: 'Page') -> 'TableTag':
        """Build a table tag from pycognaize raw data

        :param raw: pycognaize field's tag info; the table description is
            read from its `IqTableTagEnum.table` entry
        :param page: `Page` to which the tag belongs
        :return: a new instance of `cls` whose right and bottom edges are
            derived as ``left + width`` and ``top + height``
        """
        table_info = raw[IqTableTagEnum.table.value]
        x_start, y_start, box_height, box_width = (
            convert_coord_to_num(table_info[name])
            for name in ('left', 'top', 'height', 'width'))
        cell_map = table_info[IqTableTagEnum.cells.value]
        return cls(left=x_start,
                   right=x_start + box_width,
                   top=y_start,
                   bottom=y_start + box_height,
                   page=page,
                   cell_data=cell_map)



[docs]
    def to_dict(self) -> dict:
        """Serialize the table tag into a dictionary.

        The table geometry (with a ``%`` suffix on every coordinate) and the
        raw cell data are nested under the `IqTableTagEnum.table` key; the
        outer dictionary gets a freshly generated `bson.ObjectId`.

        :return: dictionary describing the tag
        :raises Exception: if the cell data is empty
        """
        page_number = self.page.page_number
        geometry = {
            IqTableTagEnum.page.value: page_number,
            IqTableTagEnum.left.value: f"{self.left}%",
            IqTableTagEnum.top.value: f"{self.top}%",
            IqTableTagEnum.height.value: f"{self.height}%",
            IqTableTagEnum.width.value: f"{self.width}%",
            IqTableTagEnum.cells.value: self.cell_data,
        }
        # The same caption is used for both the OCR value and the value.
        caption = f"table on page {page_number}"
        return {
            ID: bson.ObjectId(),
            IqTagKeyEnum.page.value: page_number,
            IqTagKeyEnum.ocr_value.value: caption,
            IqTagKeyEnum.value.value: caption,
            IqTagKeyEnum.is_table.value: True,
            IqTableTagEnum.table.value: geometry,
        }



[docs]
    def to_string(self) -> str:
        """Render the table's `df` as text via `tabulate` ('psql' format).

        :return: the formatted table, or an empty string if `df` is None
        """
        frame = self.df
        return '' if frame is None else tabulate(
            frame, headers='keys', tablefmt='psql')


    def _populate_cells(self):
        for left_col_top_row, cell_dict in self.cell_data.items():
            keys = tuple((int(i) for i in left_col_top_row.split(':')))
            self._cells[keys] = (
                self._populate_cell(keys=keys, cell_dict=cell_dict))

    @staticmethod
    def _populate_cell(keys: tuple, cell_dict: dict) -> Cell:
        """Create the `Cell` object for one raw table cell.

        :param keys: cell position; the first element is used as the left
            column and the second as the top row
        :param cell_dict: raw description of the cell
        :return: `Cell` whose right and bottom edges are derived as
            ``left + width`` and ``top + height``
        :raises KeyError: if `cell_dict` lacks any `IqCellKeyEnum` key other
            than `left_col_top_row`
        """
        required = (field for field in IqCellKeyEnum
                    if field != IqCellKeyEnum.left_col_top_row)
        for field in required:
            if field.value not in cell_dict:
                raise KeyError(
                    f"Required key '{field.value}' not in cell: {cell_dict}")

        text = cell_dict[IqCellKeyEnum.text.value]
        top_edge = cell_dict[IqCellKeyEnum.top.value]
        left_edge = cell_dict[IqCellKeyEnum.left.value]
        cell_width = cell_dict[IqCellKeyEnum.width.value]
        cell_height = cell_dict[IqCellKeyEnum.height.value]
        left_col, top_row = keys[0], keys[1]
        return Cell(
            value=text,
            left_col=left_col,
            top_row=top_row,
            top=top_edge,
            right=left_edge + cell_width,
            bottom=top_edge + cell_height,
            left=left_edge,
            col_span=cell_dict[IqCellKeyEnum.col_span.value],
            row_span=cell_dict[IqCellKeyEnum.row_span.value]
        )

    def _build_df(self, use_ocr_text: bool = False,
                  duplicate_text_for_spanned_cells: bool = True
                  ) -> pd.DataFrame:
        """Build pandas data frame using `TableTag` Cells

        The distinct `left` values of the cells define the columns and the
        distinct `top` values define the rows, both in ascending order. A
        cell occupies `col_span` x `row_span` positions starting at its own
        column and row.

        :param use_ocr_text: If true, the raw OCR data will be used for
            the content of the cells.
        :param duplicate_text_for_spanned_cells: If true, the text will be
            duplicated for spanned cells. Otherwise only the first position
            of a spanned cell keeps the text and the others get ''.
        :return: DataFrame object, where each cell contains an ExtractionTag
            object with the coordinates and values from the annotated
            document.
        :raises ValueError: if more than one cell maps to the same position
        """
        image_width = self.page.image_width
        image_height = self.page.image_height
        region_in_pixels = dict(page=self.page)
        # NOTE(review): the smaller dimension is always used as the width;
        # presumably this normalizes landscape images - confirm.
        if image_width > image_height:
            image_width, image_height = image_height, image_width

        cols = sorted({cell_.left for cell_ in self.cells.values()})
        rows = sorted({cell_.top for cell_ in self.cells.values()})
        # Coordinate -> grid position lookups, replacing a linear
        # `list.index` scan per cell.
        col_positions = {left: n for n, left in enumerate(cols)}
        row_positions = {top: n for n, top in enumerate(rows)}

        df = pd.DataFrame(columns=list(range(len(cols))),
                          index=list(range(len(rows))))

        for cell_ in self.cells.values():
            text = cell_.value
            top_index = row_positions[cell_.top]
            left_index = col_positions[cell_.left]
            for col_n in range(left_index, left_index + cell_.col_span):
                for row_n in range(top_index, top_index + cell_.row_span):
                    if use_ocr_text:
                        # Every entry must be a plain number: stray trailing
                        # commas previously turned y, w and h into 1-tuples.
                        region_in_pixels.update(
                            x=cell_.left * image_width / 100,
                            y=cell_.top * image_height / 100,
                            w=(cell_.right - cell_.left) * image_width / 100,
                            h=(cell_.bottom - cell_.top) * image_height / 100,
                            width_scale=image_width,
                            height_scale=image_height)
                        # FIXME: Define get_ocr_for_region and use it here
                        text = self.page.get_ocr_for_cell(region_in_pixels)
                    if not pd.isnull(df.iloc[row_n, col_n]):
                        raise ValueError(
                            "table_tag provides multiple values"
                            " for the same cell.")
                    if (
                        not duplicate_text_for_spanned_cells
                        and (
                            col_n > left_index
                            or row_n > top_index
                            )
                    ):
                        # Not the first position of a spanned cell.
                        text = ''
                    df.loc[row_n, col_n] = ExtractionTag(
                        left=cell_.left, right=cell_.right,
                        top=cell_.top, bottom=cell_.bottom,
                        page=self.page,
                        raw_value=cell_.value,
                        raw_ocr_value=text)
        return df

    @staticmethod
    def _is_ascii(str_) -> bool:
        return not any((i for i in str_ if i not in string.ascii_letters))


[docs]
    def letter_2_num(self, letters) -> int:
        """Convert excel style coordinates into zero index coordinate"""
        letters = letters.upper()
        res = 0
        if self._is_ascii(letters):
            weight = len(letters) - 1
            for i, c in enumerate(letters):
                res += (ord(c) - 64) * 26 ** (weight - i)
        return res



[docs]
    @staticmethod
    def split_excel_letters_numbers(
            str_coord: str) -> Optional[Tuple[str, int]]:
        """Return the letters and numbers of the Excel coordinate as a tuple.
            If the string is not a valid Excel coordinate, return None"""
        for n, ch in enumerate(str_coord):
            if ch.isdigit():
                letters, numbers = str_coord[:n], str_coord[n:]
                if letters.isalpha() and numbers.isdigit():
                    return letters, int(numbers)
                else:
                    return