Source code for pycognaize.document.field.table_field

import sys
import itertools
import unicodedata
from typing import Optional, Dict, List, Type, Union
if sys.version_info < (3, 11):
    from typing_extensions import Self
else:
    from typing import Self
import logging


from pycognaize.common.enums import (
    IqDocumentKeysEnum,
    ID,
    IqDataTypesEnum,
    IqFieldKeyEnum
)
from pycognaize.common.utils import (
    convert_tag_coords_to_percentages,
    get_index_of_first_non_empty_list, filter_out_nested_lines
)
from pycognaize.document.field import Field
from pycognaize.document.html_info import HTML
from pycognaize.document.tag import TableTag, ExtractionTag
from pycognaize.document.tag.html_tag import HTMLTableTag
from pycognaize.document.tag.tag import BoxTag
from pycognaize.document.page import Page


class TableField(Field):
    """Base class for all pycognaize table fields.

    A table field holds at most one tag: the constructor wraps the optional
    ``tag`` argument in a list, and ``construct_from_raw`` rejects raw data
    that yields more than one tag.
    """
    # Tag class used when page objects are available for the document.
    tag_class: Type[BoxTag] = TableTag
    # Tag class used when no pages are supplied (HTML-backed documents).
    html_tag_class: Type[HTMLTableTag] = HTMLTableTag

    def __init__(self,
                 name: str,
                 tag: Optional[Union[TableTag, HTMLTableTag]] = None,
                 field_id: Optional[str] = None,
                 group_key: Optional[str] = None,
                 confidence: Optional[float] = -1.0,
                 group_name: Optional[str] = None,
                 mapping: Optional[List[Dict[str, str]]] = None
                 ):
        """Create a table field holding zero or one tag.

        :param name: Name of the field, forwarded to ``Field``.
        :param tag: The single table tag of the field; ``None`` produces a
            field with an empty tag list.
        :param field_id: Identifier stored on the instance and written back
            under the ``ID`` key by ``to_dict``.
        :param group_key: Forwarded to ``Field``.
        :param confidence: Forwarded to ``Field``; defaults to ``-1.0``.
        :param group_name: Forwarded to ``Field``.
        :param mapping: Forwarded to ``Field``.
        """
        tags = [] if tag is None else [tag]
        super().__init__(name=name, tags=tags, group_key=group_key,
                         confidence=confidence, group_name=group_name,
                         mapping=mapping)
        self._field_id = field_id

    def get_table_title(self, n_lines_above: int = 8, margin=10) -> str:
        """Return the text found above the table, used as its title.

        :param n_lines_above: How far above the table to look; its exact
            meaning depends on the tag type (see the two helpers).
        :param margin: Tolerance added to the table top; only used for
            PDF (non-HTML) tags.
        :return: The title text, or an empty string when the field has no
            tag or the tag is of an unsupported type.
        """
        # ``construct_from_raw`` can legitimately build a field without a
        # tag, so an empty tag list must not raise here.
        if not self.tags:
            return ''
        tag = self.tags[0]
        if isinstance(tag, HTMLTableTag):
            return self._get_table_title_from_html(
                tag, n_lines_above=n_lines_above)
        if isinstance(tag, (ExtractionTag, TableTag)):
            return self._get_table_title_from_pdf(
                tag, n_lines_above=n_lines_above, margin=margin)
        return ''

    @staticmethod
    def _get_table_title_from_html(tag: HTMLTableTag,
                                   n_lines_above: int
                                   ) -> str:
        """Return the title of the table found in the HTML.

        Walks ``previous_elements`` of the ``<table>`` whose ``id`` equals
        ``tag.html_id`` and collects the distinct, non-empty texts
        (stripped, lower-cased and NFKD-normalized) in iteration order.
        The collected texts are passed through ``filter_out_nested_lines``
        and joined with single spaces.

        :param tag: HTML table tag whose table is looked up.
        :param n_lines_above: Bound on the walk; elements with index
            ``0..n_lines_above`` (inclusive) are examined.
        :return: The joined title text, or an empty string when no table
            with the tag's id exists in the HTML.
        """
        matching_tables = tag.html.html_soup.find_all(
            'table', {'id': tag.html_id})
        if not matching_tables:
            # No such table in the document: report "no title" instead of
            # failing with an IndexError.
            return ''
        table_html = matching_tables[0]
        above_lines = []
        for count, above_line in enumerate(table_html.previous_elements):
            line_text = unicodedata.normalize(
                'NFKD', above_line.text.strip().lower())
            if line_text and line_text not in above_lines:
                above_lines.append(line_text)
            # The bound counts visited elements, not collected lines.
            if count >= n_lines_above:
                break
        filtered_above_lines = filter_out_nested_lines(above_lines)
        return ' '.join(filtered_above_lines)

    @staticmethod
    def _get_table_title_from_pdf(tag: ExtractionTag, n_lines_above=8,
                                  margin=10) -> str:
        """Return the title of the table found on the pdf.

        For every line of ``tag.page.lines`` the OCR text of the words
        whose ``'bottom'`` is smaller than the table top plus ``margin`` is
        collected. The resulting rows are cut after the index reported by
        ``get_index_of_first_non_empty_list`` and the words of the last
        ``n_lines_above`` remaining rows are joined with single spaces.

        :param tag: Table tag located on a page.
        :param n_lines_above: Number of trailing rows to build the title
            from.
        :param margin: Tolerance added to the table top coordinate.
        :return: The joined title text.
        """
        page = tag.page
        tags_converted = convert_tag_coords_to_percentages(
            tag, w=page.image_width, h=page.image_height)
        # NOTE(review): the helper name suggests percentage coordinates;
        # word 'bottom' values are presumably in the same unit - confirm.
        cutoff = tags_converted['top'] + margin
        all_rows_above = [
            [word['ocr_text'] for word in line if word['bottom'] < cutoff]
            for line in page.lines
        ]
        index_of_first_non_empty_line = get_index_of_first_non_empty_list(
            all_rows_above)
        all_rows_above = all_rows_above[:index_of_first_non_empty_line + 1]
        return ' '.join(
            itertools.chain.from_iterable(all_rows_above[-n_lines_above:]))

    @classmethod
    def construct_from_raw(cls, raw: dict, pages: Dict[int, Page],
                           html: Optional[HTML] = None,
                           labels=None) -> 'TableField':
        """Create TableField object from dictionary.

        :param raw: Raw field dictionary; the tag list, name, ``ID`` and
            the optional group key, group and mapping entries are read.
        :param pages: Pages by page number. When empty (or ``None``) the
            tags are built with ``html_tag_class`` from ``html`` instead of
            with ``tag_class`` from the matching page.
        :param html: HTML object handed to the HTML tag constructor.
        :param labels: Not used by this method.
        :return: The constructed field; it has no tag when none could be
            built.
        :raises ValueError: If more than one tag was built.
        """
        tag_dicts: List[dict] = raw[IqDocumentKeysEnum.tags.value]
        use_html = not pages
        used_tag_class = cls.html_tag_class if use_html else cls.tag_class
        tags = []
        for tag_dict in tag_dicts:
            # Best effort: a tag that cannot be built is logged and
            # skipped rather than failing the whole field.
            try:
                if use_html:
                    tags.append(cls.html_tag_class.construct_from_raw(
                        raw=tag_dict, html=html))
                else:
                    tags.append(cls.tag_class.construct_from_raw(
                        raw=tag_dict, page=pages[tag_dict['page']]))
            except Exception as e:
                logging.debug(
                    f"Failed creating tag for field {raw[ID]}: {e}")
        if len(tags) > 1:
            # Name the tag class that was actually used to build the tags.
            raise ValueError(
                f"{cls.__name__} cannot have {len(tags)}"
                f" {used_tag_class.__name__}s")
        return cls(name=raw[IqDocumentKeysEnum.name.value],
                   tag=tags[0] if tags else None,
                   field_id=str(raw[ID]),
                   group_key=raw.get(IqFieldKeyEnum.group_key.value, ''),
                   group_name=raw.get(IqFieldKeyEnum.group.value, ''),
                   mapping=raw.get(IqFieldKeyEnum.mapping.value, [])
                   )

    def to_dict(self) -> dict:
        """Converts TableField object to dictionary.

        Extends the dictionary produced by ``Field.to_dict`` with the field
        id, the table data type and an empty string as value.
        """
        field_dict = super().to_dict()
        field_dict[ID] = self._field_id
        field_dict[
            IqFieldKeyEnum.data_type.value] = IqDataTypesEnum.table.value
        field_dict[IqFieldKeyEnum.value.value] = ''
        return field_dict

    @classmethod
    def parse_table(cls, table_field: Self) -> str:
        """Render the table of ``table_field`` as markdown text.

        The first row of the first tag's dataframe is used as the header
        and the remaining rows as data.

        :param table_field: Field whose first tag provides the dataframe.
        :return: The markdown representation, without the index column.
        """
        # NOTE(review): a field without tags or with an empty dataframe is
        # not handled here and presumably raises IndexError - confirm.
        df = table_field.tags[0].df
        new_header = df.iloc[0]  # grab the first row for the header
        df = df[1:]  # take the data less the header row
        df.columns = new_header  # set the header row as the df header
        return df.to_markdown(index=False)

    def __repr__(self):
        """Return ``<ClassName: field name>``."""
        return f"<{self.__class__.__name__}: {self.name}>"

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()