Source code for pycognaize.document.tag.span_tag

import bson
from pycognaize.common.utils import convert_coord_to_num
from pycognaize.document.tag.tag import BoxTag
from pycognaize.common.enums import IqTagKeyEnum, ID
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from pycognaize.document.page import Page

from pycognaize.common.decorators import module_not_found


class SpanTag(BoxTag):
    """Box tag that belongs to a span field and carries its text.

    On top of the box coordinates and page handed to ``BoxTag``, every
    instance keeps the tag's value (``raw_value``) and its OCR value
    (``raw_ocr_value``).
    """

    def __init__(self, left, right, top, bottom, page: 'Page',
                 raw_value: str, raw_ocr_value: str):
        """Store the box geometry via ``BoxTag`` and keep both text values.

        :param left: left coordinate forwarded to ``BoxTag``
        :param right: right coordinate forwarded to ``BoxTag``
        :param top: top coordinate forwarded to ``BoxTag``
        :param bottom: bottom coordinate forwarded to ``BoxTag``
        :param page: page object forwarded to ``BoxTag``
        :param raw_value: value of the tag
        :param raw_ocr_value: OCR value of the tag
        """
        super().__init__(
            left=left,
            right=right,
            top=top,
            bottom=bottom,
            page=page,
        )
        self.raw_value = raw_value
        self.raw_ocr_value = raw_ocr_value
        # Placeholder for the spaCy document; filled in by
        # ``__create_spacy_doc``.
        self._spacy_doc = None

    @classmethod
    def construct_from_raw(cls, raw: dict, page: 'Page') -> 'SpanTag':
        """Build a tag from its raw dictionary form.

        Reads the ``left``, ``top``, ``height``, ``width``, ``value`` and
        ``ocrValue`` keys of ``raw``. The right and bottom edges are derived
        by adding the width and the height to the left and top edges.

        :param raw: dictionary holding the keys listed above
        :param page: page object the tag is attached to
        :return: the newly created tag
        """
        x_start = convert_coord_to_num(raw['left'])
        y_start = convert_coord_to_num(raw['top'])
        box_height = convert_coord_to_num(raw['height'])
        box_width = convert_coord_to_num(raw['width'])
        return cls(
            left=x_start,
            right=x_start + box_width,
            top=y_start,
            bottom=y_start + box_height,
            page=page,
            raw_value=raw['value'],
            raw_ocr_value=raw['ocrValue'],
        )

    def to_dict(self) -> dict:
        """Serialize the tag into a dictionary.

        A fresh ``bson.ObjectId`` is generated for the ``ID`` entry on every
        call. Position and size are written as strings with a trailing
        ``%``; height and width are computed from the box edges.

        :return: dictionary keyed by ``ID`` and ``IqTagKeyEnum`` values
        """
        serialized = {ID: str(bson.ObjectId())}
        serialized[IqTagKeyEnum.ocr_value.value] = self.raw_ocr_value
        serialized[IqTagKeyEnum.value.value] = str(self.raw_value)
        serialized[IqTagKeyEnum.left.value] = f"{self.left}%"
        serialized[IqTagKeyEnum.top.value] = f"{self.top}%"
        serialized[IqTagKeyEnum.height.value] = f"{self.bottom - self.top}%"
        serialized[IqTagKeyEnum.width.value] = f"{self.right - self.left}%"
        serialized[IqTagKeyEnum.page.value] = self.page.page_number
        serialized[IqTagKeyEnum.confidence.value] = (
            self.confidence.get_confidence()
        )
        return serialized

    # NOTE(review): ``module_not_found`` presumably deals with spaCy not
    # being installed -- confirm against the decorator's implementation.
    @module_not_found
    def __create_spacy_doc(self):
        """Run a blank English spaCy pipeline over ``raw_value``.

        The resulting document is stored on ``self._spacy_doc``.
        """
        # Imported here rather than at module level, so spaCy is only
        # loaded when this method runs.
        import spacy

        self._spacy_doc = spacy.blank("en")(self.raw_value)

    def __getitem__(self, val):
        """Return a new ``SpanTag`` holding an indexed part of the text.

        Both ``raw_value`` and ``raw_ocr_value`` are indexed with ``val``;
        the box coordinates and the page are copied over unchanged.

        :param val: index or slice applied to both text values
        :return: new tag with the selected text
        """
        return SpanTag(
            left=self._left,
            right=self._right,
            top=self._top,
            bottom=self._bottom,
            page=self._page,
            raw_value=self.raw_value[val],
            raw_ocr_value=self.raw_ocr_value[val],
        )