Source code for pycognaize.document.tag.extraction_tag
import math
import bson
from datetime import datetime
from typing import Union
from pycognaize.common.confidence import Confidence
from pycognaize.common.enums import IqTagKeyEnum, ID
from pycognaize.document.tag.tag import BoxTag
from pycognaize.common.utils import convert_coord_to_num
from pycognaize.document.tag.cell import Cell
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from pycognaize.document.page import Page
[docs]
class ExtractionTag(BoxTag):
    """Box tag that also carries a field's raw value and raw OCR text.

    The box edges, page and confidence are handed to ``BoxTag``; this class
    keeps the raw (unparsed) value and the raw OCR value alongside them.
    """

    def __init__(self, left, right, top, bottom, page, raw_value,
                 raw_ocr_value, confidence: Confidence = None):
        """Store the raw values and delegate the box data to ``BoxTag``.

        :param left: left edge of the box
        :param right: right edge of the box
        :param top: top edge of the box
        :param bottom: bottom edge of the box
        :param page: page the tag belongs to
        :param raw_value: unparsed value attached to the tag
        :param raw_ocr_value: unparsed OCR value attached to the tag
        :param confidence: confidence passed through to ``BoxTag``
        """
        super().__init__(left=left, right=right, top=top, bottom=bottom,
                         page=page, confidence=confidence)
        self._raw_value = raw_value
        self._raw_ocr_value = raw_ocr_value

    @classmethod
    def construct_from_raw(cls, raw: dict, page: 'Page') -> 'ExtractionTag':
        """Create a tag from a raw pycognaize tag dictionary.

        The dictionary must contain ``left``, ``top``, ``height``,
        ``width``, ``value`` and ``ocrValue``; the confidence entry is
        optional and defaults to an empty dict.

        :param raw: pycognaize field's tag info
        :param page: `Page` to which the tag belongs
        :return: the constructed tag
        """
        confidence = Confidence(
            raw.get(IqTagKeyEnum.confidence.value, {}))
        x_start = convert_coord_to_num(raw['left'])
        y_start = convert_coord_to_num(raw['top'])
        box_height = convert_coord_to_num(raw['height'])
        box_width = convert_coord_to_num(raw['width'])
        # The raw data stores origin + size; the tag stores all four edges.
        return cls(left=x_start, right=x_start + box_width,
                   top=y_start, bottom=y_start + box_height,
                   page=page, raw_value=raw['value'],
                   raw_ocr_value=raw['ocrValue'],
                   confidence=confidence)

    def hshift(self, by) -> 'ExtractionTag':
        """Return a copy of this tag moved horizontally.

        :param by: amount added to both the left and the right edge
        :return: new tag of the same class; this tag is not modified
        """
        moved_left = self.left + by
        moved_right = self.right + by
        return self.__class__(left=moved_left, right=moved_right,
                              top=self.top, bottom=self.bottom,
                              page=self.page, raw_value=self.raw_value,
                              raw_ocr_value=self.raw_ocr_value,
                              confidence=self.confidence)

    def horizontal_shift(self, by):
        """Alias of :meth:`hshift`."""
        return self.hshift(by)

    def vshift(self, by) -> 'ExtractionTag':
        """Return a copy of this tag moved vertically.

        :param by: amount added to both the top and the bottom edge
        :return: new tag of the same class; this tag is not modified
        """
        moved_top = self.top + by
        moved_bottom = self.bottom + by
        return self.__class__(left=self.left, right=self.right,
                              top=moved_top, bottom=moved_bottom,
                              page=self.page, raw_value=self.raw_value,
                              raw_ocr_value=self.raw_ocr_value,
                              confidence=self.confidence)

    def vertical_shift(self, by):
        """Alias of :meth:`vshift`."""
        return self.vshift(by)

    def __add__(self, other: Union['BoxTag', Cell]) -> 'ExtractionTag':
        """Merge this tag with another one into their bounding box.

        The raw values of both operands are joined with a space, ordered
        by ``(left, top)``. The raw OCR value is rebuilt from the words the
        page reports for the merged area. The result keeps this tag's page
        and confidence.

        :param other: tag to merge with; must be on the same page
        :return: a new ``ExtractionTag`` covering both boxes
        :raises ValueError: if the two tags are on different pages
        """
        same_page = self.page.page_number == other.page.page_number
        if not same_page:
            raise ValueError("Tags are not on the same page.")

        left = min(self.left, other.left)
        right = max(self.right, other.right)
        top = min(self.top, other.top)
        bottom = max(self.bottom, other.bottom)

        ordered = sorted([self, other], key=lambda x: (x.left, x.top))
        raw_value_joined = " ".join([tag.raw_value for tag in ordered])

        # Edges are divided by 100 and scaled by the page image size
        # before being handed to the page's word lookup.
        left_actual = left * self.page.image_width / 100
        right_actual = right * self.page.image_width / 100
        top_actual = top * self.page.image_height / 100
        bottom_actual = bottom * self.page.image_height / 100
        area_words = self.page.extract_area_words(left=left_actual,
                                                  right=right_actual,
                                                  top=top_actual,
                                                  bottom=bottom_actual)
        raw_ocr_value_joined = " ".join(
            [word['ocr_text'] for word in area_words])

        return ExtractionTag(
            left=left, right=right, top=top, bottom=bottom,
            page=self.page, raw_value=raw_value_joined,
            raw_ocr_value=raw_ocr_value_joined,
            confidence=self.confidence)

    @property
    def raw_value(self):
        """Unparsed value given at construction."""
        return self._raw_value

    @property
    def raw_ocr_value(self):
        """Unparsed OCR value given at construction."""
        return self._raw_ocr_value

    def _validate_numeric(self):
        """Parse the raw value and raw OCR value as floats.

        On success ``value`` / ``ocr_value`` hold the float (``nan`` when
        the raw input is ``None``) and the matching ``has_*_exception``
        flag is ``False``. On failure the flag is ``True`` and the error
        text is stored in the matching ``*_exception_message`` attribute.
        """
        try:
            raw = self.raw_value
            self.value = math.nan if raw is None else float(raw)
            self.has_value_exception = False
        except Exception as value_error:
            self.has_value_exception = True
            self.value_exception_message = str(value_error)

        try:
            raw_ocr = self.raw_ocr_value
            self.ocr_value = math.nan if raw_ocr is None else float(raw_ocr)
            self.has_raw_value_exception = False
        except Exception as ocr_error:
            self.has_raw_value_exception = True
            self.raw_value_exception_message = str(ocr_error)

    def _validate_date(self, date_format):
        """Parse the raw value and raw OCR value as dates.

        On success ``value`` / ``ocr_value`` hold the parsed ``datetime``
        and the matching ``has_*_exception`` flag is ``False``. On failure
        the flag is ``True`` and the error text is stored in the matching
        ``*_exception_message`` attribute.

        :param date_format: format string passed to ``datetime.strptime``
        """
        try:
            parsed_value = datetime.strptime(self.raw_value, date_format)
            self.value = parsed_value
            self.has_value_exception = False
        except Exception as value_error:
            self.has_value_exception = True
            self.value_exception_message = str(value_error)

        try:
            parsed_ocr = datetime.strptime(self.raw_ocr_value, date_format)
            self.ocr_value = parsed_ocr
            self.has_raw_value_exception = False
        except Exception as ocr_error:
            self.has_raw_value_exception = True
            self.raw_value_exception_message = str(ocr_error)

    def to_dict(self) -> dict:
        """Serialise the tag into a dictionary.

        A fresh ``ObjectId`` string is generated on every call. The
        position is written as ``left``/``top``/``height``/``width``
        strings with a ``%`` suffix, and the raw value is stringified.

        :return: dictionary describing the tag
        """
        tag_id = str(bson.ObjectId())
        height = self.bottom - self.top
        width = self.right - self.left
        return {
            ID: tag_id,
            IqTagKeyEnum.ocr_value.value: self.raw_ocr_value,
            IqTagKeyEnum.value.value: str(self.raw_value),
            IqTagKeyEnum.left.value: f"{self.left}%",
            IqTagKeyEnum.top.value: f"{self.top}%",
            IqTagKeyEnum.height.value: f"{height}%",
            IqTagKeyEnum.width.value: f"{width}%",
            IqTagKeyEnum.page.value: self.page.page_number,
            IqTagKeyEnum.confidence.value: self.confidence.get_confidence(),
        }