Source code for pycognaize.common.lazy_dict

"""Defines LazyDocumentDict, used for lazy loading the documents
in the snapshot
"""
import logging
import os
from collections.abc import Mapping
from typing import Optional

from bson import json_util

from pycognaize.common.enums import StorageEnum
from pycognaize.common.utils import join_path
from pycognaize.document import Document
from pycognaize.file_storage import get_storage
from pycognaize.login import Login


[docs] class LazyDocumentDict(Mapping): """Contains documents included in the snapshot""" document_filename = StorageEnum.doc_file.value def __init__(self, doc_path: str, data_path: str): login_instance = Login() if login_instance.logged_in: self._storage_config = { 'aws_access_key_id': login_instance.aws_access_key, 'aws_session_token': login_instance.aws_session_token, 'aws_secret_access_key': login_instance.aws_secret_access_key } else: self._storage_config = None self._doc_path = doc_path self._data_path = data_path storage = get_storage(self._doc_path, config=self._storage_config) ids = [] for directory in storage.list_dir(doc_path, include_files=False): if storage.is_file(directory / self.document_filename): ids.append(directory.name) self._ids = sorted(ids) @property def doc_path(self) -> str: """Path of the document's JSON""" return self._doc_path @property def data_path(self) -> str: """Path of the document's OCR and page images""" return self._data_path def __getitem__(self, doc_id) -> Optional[Document]: """The Document object, retrieved from provided path Note: Path can be both local and remote """ storage = get_storage(self.doc_path, config=self._storage_config) path = join_path( storage.is_s3_path(self.doc_path), self.doc_path, doc_id, f"{self.document_filename}" ) try: with storage.open(path, 'r', encoding='utf8') as f: doc_dict = json_util.loads(f.read()) return Document.from_dict(raw=doc_dict, data_path=os.path.join(self.data_path, doc_id)) except FileNotFoundError: logging.error(f'Document at path {path} is not found.') except Exception as e: logging.error(f'Failed reading document {doc_id}: {e}') def __iter__(self): return iter(self._ids) def __len__(self): return len(self._ids)