Source code for pycognaize.document.tag.html_tag

import abc
import bson
import pandas as pd
import logging

from typing import Optional, List, Union
from pycognaize.common.enums import (IqRecipeEnum, XBRLCellEnum,
                                     XBRLTableTagEnum, XBRLTagEnum, ID)
from pycognaize.document.html_info import HTML
from pycognaize.document.tag.tag import Tag


class HTMLTagABC(Tag, metaclass=abc.ABCMeta):
    """Base class for XBRL document tags.

    Holds the three identifiers shared by the tag classes built on top
    of it: ``html_id``, ``xpath`` and ``tag_id``. Each one is exposed
    through a read-only property.
    """

    def __init__(self, html_id: Union[str, List[str]], xpath: str,
                 tag_id: Optional[str] = None):
        """Store the identifiers of the tag.

        :param html_id: a single id or a list of ids
        :param xpath: xpath string stored for the tag
        :param tag_id: optional id of the tag, ``None`` when not given
        """
        self._tag_id = tag_id
        self._xpath = xpath
        self._html_id = html_id

    @property
    def html_id(self):
        """The ``html_id`` value given at construction."""
        return self._html_id

    @property
    def xpath(self):
        """The ``xpath`` value given at construction."""
        return self._xpath

    @property
    def tag_id(self):
        """The ``tag_id`` value given at construction (may be None)."""
        return self._tag_id

    @classmethod
    def construct_from_raw(cls, raw: dict, html: HTML) -> 'HTMLTagABC':
        """Build a tag from raw data.

        This base version has no implementation and returns ``None``;
        subclasses override it.

        :param raw: raw tag data
        :param html: HTML
        """
class HTMLTableTag(HTMLTagABC):
    """Represents table's coordinate data in XBRL document.

    The table is described by ``cell_data``: a dict whose keys are
    ``"<col>:<row>"`` strings and whose values are per-cell dicts.
    From it the tag builds a mapping of `HTMLCell` objects (`cells`)
    and two pandas views of the table (`raw_df` and `df`).
    """

    def __init__(self, tag_id: str, value: str, ocr_value: str, xpath: str,
                 title: str, html_id: Union[str, List[str]],
                 cell_data: dict, html: HTML, source_ids,
                 is_table: bool = True):
        """Store the table attributes and build the cells and data frame.

        :param tag_id: id of the tag
        :param value: value of the tag
        :param ocr_value: OCR value of the tag
        :param xpath: xpath stored for the table
        :param title: title of the table
        :param html_id: a single id or a list of ids
        :param cell_data: dict of ``"<col>:<row>"`` -> cell dict; missing
            ``is_bold`` / ``left_indentation`` entries are added to the
            cell dicts in place
        :param html: HTML
        :param source_ids: source ids stored for the tag
        :param is_table: flag stored on the tag
        :raises Exception: if ``cell_data`` is empty
        """
        super().__init__(html_id=html_id, xpath=xpath, tag_id=tag_id)
        self._value = value
        self._ocr_value = ocr_value
        self._is_table = is_table
        self._title = title
        self._cell_data = cell_data
        self._cells = {}
        self._html = html
        self._source_ids = source_ids
        self._df = None
        self._populate_cells()
        # Build the frame once and keep it. Previously the result of this
        # eager build was thrown away, so the first `raw_df` access rebuilt
        # the whole table and logged every empty-cell warning a second time.
        self._raw_df = self._build_df()

    @property
    def title(self):
        return self._title

    @property
    def source_ids(self):
        return self._source_ids

    @property
    def value(self):
        return self._value

    @property
    def ocr_value(self):
        return self._ocr_value

    @property
    def is_table(self):
        return self._is_table

    @property
    def html(self):
        return self._html

    @property
    def cells(self):
        """Dict mapping ``(col, row)`` int tuples to `HTMLCell` objects."""
        return self._cells

    @property
    def cell_data(self) -> dict:
        """Raw cell dict given at construction.

        :raises Exception: if the cell data is empty
        """
        if not self._cell_data:
            raise Exception('Cell data is empty')
        return self._cell_data

    @property
    def raw_df(self) -> pd.DataFrame:
        """Data frame whose cells are `HTMLTag` objects."""
        if self._raw_df is None:
            self._raw_df = self._build_df()
        return self._raw_df

    @property
    def df(self) -> pd.DataFrame:
        """Data frame whose cells are the text values of `raw_df`."""
        if self._df is None:
            raw = self.raw_df
            # `DataFrame.map` is the newer name of `applymap`; fall back
            # to `applymap` on pandas versions that do not provide it.
            elementwise = raw.map if hasattr(raw, 'map') else raw.applymap
            self._df = elementwise(self._extract_value)
        return self._df

    @staticmethod
    def _extract_value(x):
        """Returns text value from `HTMLTag` object"""
        try:
            return x.raw_value
        except AttributeError:
            return ''

    def _populate_cells(self):
        """Fill `cells` with one `HTMLCell` per entry of `cell_data`.

        Entries lacking ``is_bold`` / ``left_indentation`` get ``False``
        / ``None`` written into the cell dict itself.
        """
        for row_col_index, cell_dict in self.cell_data.items():
            keys = tuple(int(part) for part in row_col_index.split(':'))
            cell_dict.setdefault(XBRLTableTagEnum.is_bold.value, False)
            cell_dict.setdefault(
                XBRLTableTagEnum.left_indentation.value, None)
            self._cells[keys] = self._populate_cell(
                keys=keys, cell_dict=cell_dict)

    @staticmethod
    def _populate_cell(keys: tuple, cell_dict: dict) -> 'HTMLCell':
        """ Creates `HTMLCell` object for each item in Table

        :param keys: ``(col, row)`` tuple parsed from the cell key
        :param cell_dict: dict with the data of a single cell
        """
        return HTMLCell(
            html_id=cell_dict[XBRLCellEnum.id.value],
            xpath=cell_dict[XBRLCellEnum.xpath.value],
            row_index=keys[1],
            col_index=keys[0],
            col_span=cell_dict[XBRLCellEnum.col_span.value],
            row_span=cell_dict[XBRLCellEnum.row_span.value],
            raw_value=cell_dict[XBRLCellEnum.raw_value.value],
            is_bold=cell_dict[XBRLCellEnum.is_bold.value],
            left_indentation=cell_dict[XBRLCellEnum.left_indentation.value]
        )

    def to_dict(self) -> dict:
        """Converts HTMLTableTag to dict

        The value and OCR value entries are always written as empty
        strings and the is-table entry as ``True``, regardless of the
        attributes stored on the tag.

        :raises Exception: if the cell data is empty
        """
        table_dict = {
            XBRLTableTagEnum.xpath.value: self.xpath,
            XBRLTableTagEnum.title.value: self.title,
            XBRLTableTagEnum.anchor_id.value: self.html_id,
            XBRLTableTagEnum.cells.value: self.cell_data,
        }
        output_dict = {
            XBRLTableTagEnum._id.value: self.tag_id,
            XBRLTableTagEnum.value.value: '',
            XBRLTableTagEnum.ocr_value.value: '',
            XBRLTableTagEnum.is_table.value: True,
            XBRLTableTagEnum.table.value: table_dict,
            XBRLTableTagEnum.source.value: self.source_ids}
        return output_dict

    @classmethod
    def construct_from_raw(cls, raw: dict, html: HTML) -> 'HTMLTableTag':
        """Builds HTMLTableTag object from pycognaize raw data

        :param raw: pycognaize field's tag info
        :param html: HTML
        :return: the constructed `HTMLTableTag`
        """
        tag_id = raw[XBRLTableTagEnum._id.value]
        ocr_value = raw[XBRLTableTagEnum.ocr_value.value]
        value = raw[XBRLTableTagEnum.value.value]
        table_raw_data = raw[XBRLTableTagEnum.table.value]
        xpath = table_raw_data[XBRLTableTagEnum.xpath.value]
        title = table_raw_data[XBRLTableTagEnum.title.value]
        html_id = table_raw_data[XBRLTableTagEnum.anchor_id.value]
        cell_data = table_raw_data[XBRLTableTagEnum.cells.value]
        source_ids = raw[XBRLTableTagEnum.source.value]
        return cls(tag_id=tag_id, ocr_value=ocr_value, value=value,
                   is_table=True, xpath=xpath, title=title,
                   html_id=html_id, cell_data=cell_data, html=html,
                   source_ids=source_ids)

    def _build_df(self) -> pd.DataFrame:
        """Build pandas data frame using `HTMLTag` Cells

        Row and column labels are the cell indices minus one. A cell
        with a span greater than one fills every position it covers
        with its own `HTMLTag`; each of those tags carries the indices
        of the spanning cell's first position.

        :return: DataFrame object, where each cell contains an HTMLTag
            object with the html_id and values from the annotated
            document
        """
        cells = list(self.cells.values())
        cols = sorted({cell_.col_index - 1 for cell_ in cells})
        rows = sorted({cell_.row_index - 1 for cell_ in cells})
        df = pd.DataFrame(columns=cols, index=rows)
        for cell_ in cells:
            row_index = cell_.row_index - 1
            col_index = cell_.col_index - 1
            for col_n in range(col_index, col_index + cell_.col_span):
                for row_n in range(row_index, row_index + cell_.row_span):
                    df.loc[row_n, col_n] = HTMLTag(
                        is_table=False,
                        html_id=cell_.html_id,
                        xpath=cell_.xpath,
                        raw_value=cell_.raw_value,
                        raw_ocr_value=cell_.raw_value,
                        field_id='',
                        tag_id=self.tag_id,
                        row_index=row_index,
                        col_index=col_index)
        return self.replace_nans_with_empty_html_tags(df)

    def replace_nans_with_empty_html_tags(self,
                                          df: pd.DataFrame) -> pd.DataFrame:
        """
        Replaces NaN values in a DataFrame with empty HTML tags.

        The frame is modified in place and also returned; a warning is
        logged for every replaced position.

        :param df: data frame to fill
        :return: the same data frame
        """
        for col in df.columns:
            for idx in df.index:
                if pd.isna(df.loc[idx, col]):
                    logging.warning(
                        f'Build df issue: Replacing empty cell at {idx, col} '
                        f'with empty HTMLTag in HTMLTableTag with html id '
                        f'{self.html_id}')
                    df.loc[idx, col] = HTMLTag(is_table=False,
                                               html_id='',
                                               xpath='',
                                               raw_value='',
                                               raw_ocr_value='',
                                               field_id='',
                                               tag_id='',
                                               row_index=idx,
                                               col_index=col)
        return df
class HTMLCell:
    """Represents cell tag for XBRL tables"""

    def __init__(self, row_index: int, col_index: int, col_span: int,
                 row_span: int, html_id: Union[str, List[str]], xpath: str,
                 raw_value: str, is_bold: bool = False,
                 left_indentation: Optional[str] = None):
        """Store the attributes of a single table cell.

        :param row_index: row index of the cell
        :param col_index: column index of the cell
        :param col_span: number of columns the cell spans
        :param row_span: number of rows the cell spans
        :param html_id: a single id or a list of ids
        :param xpath: xpath stored for the cell
        :param raw_value: text value of the cell
        :param is_bold: bold flag of the cell, ``False`` by default
        :param left_indentation: left indentation of the cell, ``None``
            by default
        """
        self._row_index = row_index
        self._col_index = col_index
        self._col_span = col_span
        self._row_span = row_span
        self._html_id = html_id
        self._xpath = xpath
        self._raw_value = raw_value
        self._is_bold = is_bold
        self._left_indentation = left_indentation

    @property
    def row_index(self) -> int:
        return self._row_index

    @property
    def col_index(self) -> int:
        return self._col_index

    @property
    def col_span(self) -> int:
        return self._col_span

    @property
    def row_span(self) -> int:
        return self._row_span

    @property
    def html_id(self):
        return self._html_id

    @property
    def xpath(self):
        return self._xpath

    @property
    def raw_value(self) -> str:
        return self._raw_value

    @property
    def is_bold(self) -> bool:
        return self._is_bold

    @property
    def left_indentation(self) -> Optional[str]:
        return self._left_indentation

    @classmethod
    def construct_from_raw(cls, raw: dict) -> 'HTMLCell':
        """Build HTMLCell from pycognaize raw data

        The bold flag and the left indentation are optional in the
        source data and default to ``False`` and ``None``.

        :param raw: pycognaize field's tag info
        :return: the constructed `HTMLCell`
        :raises KeyError: if a required entry is missing
        """
        source_data = raw[XBRLCellEnum.source.value]
        return cls(
            row_index=source_data[XBRLCellEnum.row_index.value],
            col_index=source_data[XBRLCellEnum.col_index.value],
            col_span=source_data[XBRLCellEnum.col_span.value],
            row_span=source_data[XBRLCellEnum.row_span.value],
            raw_value=raw[XBRLCellEnum.raw_value.value],
            html_id=source_data[XBRLCellEnum.html_id.value],
            xpath=source_data[XBRLCellEnum.xpath.value],
            is_bold=source_data.get(XBRLCellEnum.is_bold.value, False),
            # The old condition tested the key string itself (always
            # truthy) instead of its presence in `source_data`, so a
            # missing key raised KeyError rather than yielding None.
            left_indentation=source_data.get(
                XBRLCellEnum.left_indentation.value),
        )

    def to_dict(self) -> dict:
        """Converts cell to dict

        :return: single-entry dict keyed by ``"<col_index>:<row_index>"``
        """
        cell_dict = {
            XBRLCellEnum.col_span.value: self.col_span,
            XBRLCellEnum.row_span.value: self.row_span,
            XBRLCellEnum.html_id.value: self.html_id,
            XBRLCellEnum.xpath.value: self.xpath,
            XBRLCellEnum.raw_value.value: self.raw_value,
            XBRLCellEnum.left_indentation.value: self.left_indentation,
            XBRLCellEnum.is_bold.value: self.is_bold}
        return {f"{self.col_index}:{self.row_index}": cell_dict}
class HTMLTag(HTMLTagABC):
    """A single tag carrying a value together with its table position."""

    def __init__(self, raw_value: str, raw_ocr_value: str, is_table: bool,
                 html_id: Union[str, List[str]], field_id: Optional[str],
                 tag_id: Optional[str], row_index: int, col_index: int,
                 xpath: str, is_td: bool = True):
        """Store the attributes of the tag.

        :param raw_value: value of the tag
        :param raw_ocr_value: OCR value of the tag
        :param is_table: flag stored on the tag
        :param html_id: a single id or a list of ids
        :param field_id: id of the field, may be None
        :param tag_id: id of the tag, may be None
        :param row_index: row index of the tag
        :param col_index: column index of the tag
        :param xpath: xpath stored for the tag
        :param is_td: selects which of the two `to_dict` layouts is used
        """
        super().__init__(html_id=html_id, xpath=xpath, tag_id=tag_id)
        self._is_td = is_td
        self._col_index = col_index
        self._row_index = row_index
        self._field_id = field_id
        self._is_table = is_table
        self._raw_ocr_value = raw_ocr_value
        self._raw_value = raw_value

    @property
    def raw_value(self):
        """Value of the tag as given at construction."""
        return self._raw_value

    @property
    def raw_ocr_value(self):
        """OCR value of the tag as given at construction."""
        return self._raw_ocr_value

    @property
    def is_table(self):
        """The ``is_table`` flag given at construction."""
        return self._is_table

    @property
    def field_id(self):
        """The ``field_id`` given at construction."""
        return self._field_id

    @property
    def row_index(self):
        """The ``row_index`` given at construction."""
        return self._row_index

    @property
    def col_index(self):
        """The ``col_index`` given at construction."""
        return self._col_index

    @property
    def is_td(self):
        """The ``is_td`` flag given at construction."""
        return self._is_td

    @classmethod
    def construct_from_raw(cls, raw: dict, html: HTML) -> 'HTMLTag':
        """Create an HTMLTag out of pycognaize raw data.

        When the source data has an html entry, the tag id is read from
        that entry's parent id and ``is_td`` is False; otherwise the ids
        entry is used and ``is_td`` is True. The stored column index is
        the raw column index minus one.

        :param raw: pycognaize field's tag info
        :param html: HTML
        :return: the constructed `HTMLTag`
        """
        source = raw[XBRLTagEnum.source.value]
        is_td = XBRLTagEnum.html.value not in source
        if is_td:
            html_id = source[XBRLTagEnum.ids.value]
        else:
            html_entry = source[XBRLTagEnum.html.value]
            html_id = html_entry[XBRLTagEnum.parent_id.value]
        return cls(
            html_id=html_id,
            raw_value=raw[XBRLTagEnum.value.value],
            raw_ocr_value=raw[XBRLTagEnum.ocr_value.value],
            is_table=raw[XBRLTagEnum.is_table.value],
            field_id=source[IqRecipeEnum.field_id.value],
            tag_id=source[XBRLTagEnum.tag_id.value],
            row_index=source[XBRLTagEnum.row_index.value],
            col_index=source[XBRLTagEnum.col_index.value] - 1,
            xpath=source[XBRLTagEnum.xpath.value],
            is_td=is_td,
        )

    def to_dict(self) -> dict:
        """Serialize the tag into a dict.

        A fresh ``bson.ObjectId`` string is generated for the id entry
        on every call. The column index is written back as the stored
        index plus one.
        """
        tag_info = {}
        if self.is_td:
            ids = self.html_id
        else:
            # Non-td tags keep their id and value under the html entry
            # and emit an empty ids list.
            tag_info[XBRLTagEnum.html.value] = {
                XBRLTagEnum.parent_id.value: self.html_id,
                XBRLTagEnum.value.value: self.raw_value,
            }
            ids = []
        tag_info[XBRLTagEnum.ids.value] = ids
        tag_info[IqRecipeEnum.field_id.value] = self.field_id
        tag_info[XBRLTagEnum.tag_id.value] = self.tag_id
        tag_info[XBRLTagEnum.row_index.value] = self.row_index
        tag_info[XBRLTagEnum.col_index.value] = self.col_index + 1
        tag_info[XBRLTagEnum.xpath.value] = self.xpath
        return {
            ID: str(bson.ObjectId()),
            XBRLTagEnum.value.value: self.raw_value,
            XBRLTagEnum.ocr_value.value: self.raw_ocr_value,
            XBRLTagEnum.is_table.value: self.is_table,
            XBRLTagEnum.source.value: tag_info,
        }