Source code for pycognaize.document.tag.html_tag

import abc
import bson
import pandas as pd
import logging

from typing import Optional, List, Union
from pycognaize.common.enums import (IqRecipeEnum, XBRLCellEnum,
                                     XBRLTableTagEnum, XBRLTagEnum, ID)
from pycognaize.document.html_info import HTML
from pycognaize.document.tag.tag import Tag



[docs]
class HTMLTagABC(Tag, metaclass=abc.ABCMeta):
    """Base class for XBRL document tags

    Holds the three attributes every HTML-backed tag in this module
    shares (`html_id`, `xpath`, `tag_id`) and exposes them read-only.
    """

    def __init__(self, html_id: Union[str, List[str]], xpath: str,
                 tag_id: Optional[str] = None):
        """Store the attributes shared by all HTML tags.

        :param html_id: a single id or a list of ids
        :param xpath: xpath string associated with the tag
        :param tag_id: optional identifier of the tag itself
        """
        self._tag_id = tag_id
        self._xpath = xpath
        self._html_id = html_id

    @property
    def html_id(self):
        """Id (or list of ids) given at construction."""
        return self._html_id

    @property
    def xpath(self):
        """Xpath given at construction."""
        return self._xpath

    @property
    def tag_id(self):
        """Tag identifier given at construction, `None` if omitted."""
        return self._tag_id

    @classmethod
    def construct_from_raw(cls, raw: dict, html: HTML) -> 'HTMLTagABC':
        """Placeholder constructor; this base implementation does nothing.

        Subclasses in this module override it to build themselves from
        raw data.

        :param raw: raw tag data
        :param html: HTML
        :return: `None` in this base implementation
        """




[docs]
class HTMLTableTag(HTMLTagABC):
    """Represents table's coordinate data in XBRL document

    On construction the raw cell mapping is turned into `HTMLCell`
    objects. The table is exposed as a DataFrame of `HTMLTag` objects
    (`raw_df`) and as a DataFrame of their text values (`df`).
    """

    def __init__(self, tag_id: str,  value: str, ocr_value: str,
                 xpath: str, title: str, html_id: Union[str, List[str]],
                 cell_data: dict, html: HTML, source_ids,
                 is_table: bool = True):
        """Create the table tag and eagerly build its cells and DataFrame.

        :param tag_id: identifier of the tag
        :param value: value of the tag
        :param ocr_value: OCR value of the tag
        :param xpath: xpath associated with the table
        :param title: title of the table
        :param html_id: a single id or a list of ids
        :param cell_data: mapping of ``"<col>:<row>"`` keys to cell dicts;
            missing bold / left-indentation entries are added in place
        :param html: HTML
        :param source_ids: source ids, stored and returned unchanged
        :param is_table: flag stored on the tag
        :raises Exception: if `cell_data` is empty
        """
        super().__init__(html_id=html_id, xpath=xpath, tag_id=tag_id)
        self._value = value
        self._ocr_value = ocr_value
        self._is_table = is_table
        self._title = title
        self._cell_data = cell_data
        self._cells = {}
        self._html = html
        self._source_ids = source_ids
        self._df = None
        self._populate_cells()
        # Keep the eagerly built frame as the `raw_df` cache. Building it
        # here surfaces malformed cell data at construction time; caching
        # it avoids a second build (and duplicated warnings) on first
        # access to `raw_df`.
        self._raw_df = self._build_df()

    @property
    def title(self):
        """Title given at construction."""
        return self._title

    @property
    def source_ids(self):
        """Source ids given at construction."""
        return self._source_ids

    @property
    def value(self):
        """Value given at construction."""
        return self._value

    @property
    def ocr_value(self):
        """OCR value given at construction."""
        return self._ocr_value

    @property
    def is_table(self):
        """`is_table` flag given at construction."""
        return self._is_table

    @property
    def html(self):
        """HTML object given at construction."""
        return self._html

    @property
    def cells(self):
        """Mapping of ``(col, row)`` integer tuples to `HTMLCell`."""
        return self._cells

    @property
    def cell_data(self) -> dict:
        """Raw cell mapping given at construction.

        :raises Exception: if the mapping is empty
        """
        if not self._cell_data:
            raise Exception('Cell data is empty')
        return self._cell_data

    @property
    def raw_df(self) -> pd.DataFrame:
        """DataFrame whose cells are `HTMLTag` objects (built once)."""
        if self._raw_df is None:
            self._raw_df = self._build_df()
        return self._raw_df

    @property
    def df(self) -> pd.DataFrame:
        """DataFrame of the text values of `raw_df` (built once)."""
        if self._df is None:
            raw = self.raw_df
            # Use `DataFrame.map` when this pandas version provides it,
            # otherwise fall back to `applymap`.
            if hasattr(raw, "map"):
                elementwise = raw.map
            else:
                elementwise = raw.applymap
            self._df = elementwise(self._extract_value)
        return self._df

    @staticmethod
    def _extract_value(x):
        """Returns text value from `HTMLTag` object

        Objects without a `raw_value` attribute yield an empty string.
        """
        try:
            return x.raw_value
        except AttributeError:
            return ''

    def _populate_cells(self):
        """Fill `self._cells` from `cell_data`.

        Each ``"<col>:<row>"`` key is parsed into a tuple of ints. Missing
        bold / left-indentation entries are defaulted in the raw cell dict
        itself, so the defaults also show up in `cell_data`.
        """
        for row_col_index, cell_dict in self.cell_data.items():
            keys = tuple(int(part) for part in row_col_index.split(':'))
            cell_dict.setdefault(XBRLTableTagEnum.is_bold.value, False)
            cell_dict.setdefault(
                XBRLTableTagEnum.left_indentation.value, None)
            self._cells[keys] = self._populate_cell(
                keys=keys, cell_dict=cell_dict)

    @staticmethod
    def _populate_cell(keys: tuple, cell_dict: dict) -> 'HTMLCell':
        """ Creates `HTMLCell` object for each item in Table

        :param keys: ``(col, row)`` tuple parsed from the cell key
        :param cell_dict: raw data of the cell
        """
        return HTMLCell(
            html_id=cell_dict[XBRLCellEnum.id.value],
            xpath=cell_dict[XBRLCellEnum.xpath.value],
            row_index=keys[1],
            col_index=keys[0],
            col_span=cell_dict[XBRLCellEnum.col_span.value],
            row_span=cell_dict[XBRLCellEnum.row_span.value],
            raw_value=cell_dict[XBRLCellEnum.raw_value.value],
            is_bold=cell_dict[XBRLCellEnum.is_bold.value],
            left_indentation=cell_dict[XBRLCellEnum.left_indentation.value]
        )

    def to_dict(self) -> dict:
        """Converts HTMLTableTag to dict

        :raises Exception: if `cell_data` is empty
        """
        table_dict = {
            XBRLTableTagEnum.xpath.value: self.xpath,
            XBRLTableTagEnum.title.value: self.title,
            XBRLTableTagEnum.anchor_id.value: self.html_id,
            XBRLTableTagEnum.cells.value: self.cell_data,
        }
        # NOTE(review): value / ocr_value / is_table are written as fixed
        # constants here rather than taken from the instance - confirm
        # this is intended before relying on a lossless round trip.
        output_dict = {
            XBRLTableTagEnum._id.value: self.tag_id,
            XBRLTableTagEnum.value.value: '',
            XBRLTableTagEnum.ocr_value.value: '',
            XBRLTableTagEnum.is_table.value: True,
            XBRLTableTagEnum.table.value: table_dict,
            XBRLTableTagEnum.source.value: self.source_ids}

        return output_dict

    @classmethod
    def construct_from_raw(cls, raw: dict, html: HTML) -> 'HTMLTableTag':
        """Builds HTMLTableTag object from pycognaize raw data

        :param raw: pycognaize field's tag info
        :param html: HTML
        :return: the constructed `HTMLTableTag`
        :raises KeyError: if an expected key is missing from `raw`
        """
        tag_id = raw[XBRLTableTagEnum._id.value]
        ocr_value = raw[XBRLTableTagEnum.ocr_value.value]
        value = raw[XBRLTableTagEnum.value.value]
        table_raw_data = raw[XBRLTableTagEnum.table.value]
        xpath = table_raw_data[XBRLTableTagEnum.xpath.value]
        title = table_raw_data[XBRLTableTagEnum.title.value]
        html_id = table_raw_data[XBRLTableTagEnum.anchor_id.value]
        cell_data = table_raw_data[XBRLTableTagEnum.cells.value]
        source_ids = raw[XBRLTableTagEnum.source.value]
        return cls(tag_id=tag_id, ocr_value=ocr_value, value=value,
                   is_table=True, xpath=xpath, title=title,
                   html_id=html_id, cell_data=cell_data,
                   html=html, source_ids=source_ids)

    def _build_df(self) -> pd.DataFrame:
        """Build pandas data frame using `HTMLTag` Cells

        Cell indices are shifted down by one to form the frame labels.
        A cell with spans fills every covered position with an `HTMLTag`
        that carries the indices of the cell's own (top-left) position.

        :return: DataFrame object,where each cell contains an HTMLTag
            object with the html_id and values from the annotated
            document
        """
        cols = sorted({cell_.col_index - 1 for cell_ in self.cells.values()})
        rows = sorted({cell_.row_index - 1 for cell_ in self.cells.values()})
        df = pd.DataFrame(columns=cols, index=rows)
        for cell_ in self.cells.values():
            row_index = cell_.row_index - 1
            col_index = cell_.col_index - 1
            for col_n in range(col_index, col_index + cell_.col_span):
                for row_n in range(row_index, row_index + cell_.row_span):
                    df.loc[row_n, col_n] = HTMLTag(
                        is_table=False,
                        html_id=cell_.html_id,
                        xpath=cell_.xpath,
                        raw_value=cell_.raw_value,
                        raw_ocr_value=cell_.raw_value,
                        field_id='',
                        tag_id=self.tag_id,
                        row_index=row_index,
                        col_index=col_index)
        df = self.replace_nans_with_empty_html_tags(df)
        return df

    def replace_nans_with_empty_html_tags(self,
                                          df: pd.DataFrame) -> pd.DataFrame:
        """
            Replaces NaN values in a DataFrame with empty HTML tags.

            The frame is modified in place and also returned; a warning
            is logged for every replaced position.
        """
        for col in df.columns:
            for idx in df.index:
                if pd.isna(df.loc[idx, col]):
                    logging.warning(
                        f'Build df issue: Replacing empty cell at {idx, col} '
                        f'with empty HTMLTag in HTMLTableTag with html id '
                        f'{self.html_id}')
                    df.loc[idx, col] = HTMLTag(is_table=False,
                                               html_id='',
                                               xpath='',
                                               raw_value='',
                                               raw_ocr_value='',
                                               field_id='',
                                               tag_id='',
                                               row_index=idx,
                                               col_index=col)
        return df





[docs]
class HTMLCell:
    """Represents cell tag for XBRL tables"""

    def __init__(self, row_index: int, col_index: int,
                 col_span: int, row_span: int,
                 html_id: Union[str, List[str]], xpath: str,
                 raw_value: str, is_bold: bool = False,
                 left_indentation: Optional[str] = None):
        """Store the position, span and content of a table cell.

        :param row_index: row index of the cell
        :param col_index: column index of the cell
        :param col_span: number of columns the cell covers
        :param row_span: number of rows the cell covers
        :param html_id: a single id or a list of ids
        :param xpath: xpath associated with the cell
        :param raw_value: text value of the cell
        :param is_bold: bold flag, `False` when not given
        :param left_indentation: left indentation, `None` when not given
        """
        self._row_index = row_index
        self._col_index = col_index
        self._col_span = col_span
        self._row_span = row_span
        self._html_id = html_id
        self._xpath = xpath
        self._raw_value = raw_value
        self._is_bold = is_bold
        self._left_indentation = left_indentation

    @property
    def row_index(self) -> int:
        """Row index given at construction."""
        return self._row_index

    @property
    def col_index(self) -> int:
        """Column index given at construction."""
        return self._col_index

    @property
    def col_span(self) -> int:
        """Column span given at construction."""
        return self._col_span

    @property
    def row_span(self) -> int:
        """Row span given at construction."""
        return self._row_span

    @property
    def html_id(self):
        """Id (or list of ids) given at construction."""
        return self._html_id

    @property
    def xpath(self):
        """Xpath given at construction."""
        return self._xpath

    @property
    def raw_value(self) -> str:
        """Text value given at construction."""
        return self._raw_value

    @property
    def is_bold(self) -> bool:
        """Bold flag given at construction."""
        return self._is_bold

    @property
    def left_indentation(self) -> Optional[str]:
        """Left indentation given at construction, `None` if absent."""
        return self._left_indentation

    @classmethod
    def construct_from_raw(cls, raw: dict) -> 'HTMLCell':
        """Build HTMLCell from pycognaize raw data

        The bold flag and left indentation are optional in the source
        data and fall back to `False` and `None` respectively.

        :param raw: pycognaize field's tag info
        :return: the constructed `HTMLCell`
        :raises KeyError: if a required key is missing from `raw`
        """
        source_data = raw[XBRLCellEnum.source.value]
        row_index = source_data[XBRLCellEnum.row_index.value]
        col_index = source_data[XBRLCellEnum.col_index.value]
        col_span = source_data[XBRLCellEnum.col_span.value]
        row_span = source_data[XBRLCellEnum.row_span.value]
        raw_value = raw[XBRLCellEnum.raw_value.value]
        html_id = source_data[XBRLCellEnum.html_id.value]
        xpath = source_data[XBRLCellEnum.xpath.value]
        is_bold = source_data.get(XBRLCellEnum.is_bold.value, False)
        # Look the key up in the source data; testing the truthiness of
        # the key name itself would always index and raise KeyError when
        # the entry is absent.
        left_indentation = source_data.get(
            XBRLCellEnum.left_indentation.value)
        return cls(html_id=html_id, xpath=xpath, row_index=row_index,
                   col_index=col_index, col_span=col_span, row_span=row_span,
                   raw_value=raw_value, is_bold=is_bold,
                   left_indentation=left_indentation)

    def to_dict(self) -> dict:
        """Converts cell to dict

        :return: single-entry dict keyed by ``"<col_index>:<row_index>"``
        """
        cell_dict = {
            XBRLCellEnum.col_span.value: self.col_span,
            XBRLCellEnum.row_span.value: self.row_span,
            XBRLCellEnum.html_id.value: self.html_id,
            XBRLCellEnum.xpath.value: self.xpath,
            XBRLCellEnum.raw_value.value: self.raw_value,
            XBRLCellEnum.left_indentation.value: self.left_indentation,
            XBRLCellEnum.is_bold.value: self.is_bold}
        return {f"{self.col_index}:{self.row_index}": cell_dict}





[docs]
class HTMLTag(HTMLTagABC):
    """Single (non-table) tag of an HTML document.

    In addition to the base attributes it carries the raw and OCR
    values, the field id, a row / column position and an `is_td` flag
    that selects the layout produced by `to_dict`.
    """

    def __init__(self, raw_value: str, raw_ocr_value: str,
                 is_table: bool, html_id: Union[str, List[str]],
                 field_id: Optional[str], tag_id: Optional[str],
                 row_index: int, col_index: int, xpath: str,
                 is_td: bool = True):
        """Store the tag attributes.

        :param raw_value: value of the tag
        :param raw_ocr_value: OCR value of the tag
        :param is_table: flag stored on the tag
        :param html_id: a single id or a list of ids
        :param field_id: identifier of the field, may be `None`
        :param tag_id: identifier of the tag, may be `None`
        :param row_index: row index of the tag
        :param col_index: column index of the tag
        :param xpath: xpath associated with the tag
        :param is_td: selects the serialisation layout, `True` by default
        """
        super().__init__(html_id=html_id, xpath=xpath, tag_id=tag_id)
        self._is_td = is_td
        self._col_index = col_index
        self._row_index = row_index
        self._field_id = field_id
        self._is_table = is_table
        self._raw_ocr_value = raw_ocr_value
        self._raw_value = raw_value

    @property
    def raw_value(self):
        """returns adjusted value"""
        return self._raw_value

    @property
    def raw_ocr_value(self):
        """OCR value given at construction."""
        return self._raw_ocr_value

    @property
    def is_table(self):
        """`is_table` flag given at construction."""
        return self._is_table

    @property
    def field_id(self):
        """Field id given at construction."""
        return self._field_id

    @property
    def row_index(self):
        """Row index given at construction."""
        return self._row_index

    @property
    def col_index(self):
        """Column index given at construction."""
        return self._col_index

    @property
    def is_td(self):
        """`is_td` flag given at construction."""
        return self._is_td

    @classmethod
    def construct_from_raw(cls, raw: dict, html: HTML) -> 'HTMLTag':
        """Build HTMLTag from pycognaize raw data

        When the source data has an html entry, the id is taken from
        that entry's parent id and `is_td` is `False`; otherwise the ids
        entry is used and `is_td` is `True`. The stored column index is
        the raw one minus one.

        :param html: HTML
        :param raw: pycognaize field's tag info
        :return: the constructed `HTMLTag`
        """
        source_data = raw[XBRLTagEnum.source.value]
        is_td = XBRLTagEnum.html.value not in source_data
        if is_td:
            html_id = source_data[XBRLTagEnum.ids.value]
        else:
            html_entry = source_data[XBRLTagEnum.html.value]
            html_id = html_entry[XBRLTagEnum.parent_id.value]
        return cls(
            raw_value=raw[XBRLTagEnum.value.value],
            raw_ocr_value=raw[XBRLTagEnum.ocr_value.value],
            is_table=raw[XBRLTagEnum.is_table.value],
            field_id=source_data[IqRecipeEnum.field_id.value],
            tag_id=source_data[XBRLTagEnum.tag_id.value],
            row_index=source_data[XBRLTagEnum.row_index.value],
            col_index=source_data[XBRLTagEnum.col_index.value] - 1,
            xpath=source_data[XBRLTagEnum.xpath.value],
            html_id=html_id,
            is_td=is_td,
        )

    def to_dict(self) -> dict:
        """Converts tag to dict

        The source section lists the tag's ids when `is_td` is set;
        otherwise it holds an html entry (parent id and value) and an
        empty ids list. The column index is written back plus one, and
        a fresh object id is generated on every call.
        """
        source_info = {}
        if self.is_td:
            ids = self.html_id
        else:
            # The html entry goes first so key order matches the layout
            # used for non-td tags.
            source_info[XBRLTagEnum.html.value] = {
                XBRLTagEnum.parent_id.value: self.html_id,
                XBRLTagEnum.value.value: self.raw_value
            }
            ids = []
        source_info.update({
            XBRLTagEnum.ids.value: ids,
            IqRecipeEnum.field_id.value: self.field_id,
            XBRLTagEnum.tag_id.value: self.tag_id,
            XBRLTagEnum.row_index.value: self.row_index,
            XBRLTagEnum.col_index.value: self.col_index + 1,
            XBRLTagEnum.xpath.value: self.xpath,
        })
        return {
            ID: str(bson.ObjectId()),
            XBRLTagEnum.value.value: self.raw_value,
            XBRLTagEnum.ocr_value.value: self.raw_ocr_value,
            XBRLTagEnum.is_table.value: self.is_table,
            XBRLTagEnum.source.value: source_info,
        }