Source code for pycognaize.document.snapshot

import logging
import os
from typing import Mapping, Tuple

from pycognaize.common.enums import EnvConfigEnum, HASH_FILE
from pycognaize.common.exceptions import AuthenthicationError
from pycognaize.common.lazy_dict import LazyDocumentDict
from pycognaize.common.utils import directory_summary_hash
from pycognaize.document.snapshot_downloader import SnapshotDownloader
from pycognaize.login import Login



[docs]
class Snapshot:
    """A snapshot of annotated documents from one or more collections"""

    def __init__(self, path: str):
        """Create a snapshot backed by the given path.

        :param path: Location of the snapshot; it is stored on the instance
            and passed to ``LazyDocumentDict`` as both ``doc_path`` and
            ``data_path``.
        """
        self._path = path
        # The same location serves as the document path and the data path.
        self._documents = LazyDocumentDict(data_path=path, doc_path=path)

    @property
    def documents(self) -> Mapping:
        """Mapping of document ids to documents
        :return Mapping:  LazyDocumentDict Object
        """
        return self._documents


[docs]
    @classmethod
    def get_by_id(cls, snapshot_id: str) -> 'Snapshot':
        """Return the Snapshot identified by ``snapshot_id``.

        When ``Login()`` reports a logged-in session, the snapshot path is
        ``<login snapshot_root>/<snapshot_id>``. Otherwise the snapshot
        directory is read from the environment variable named by
        ``EnvConfigEnum.SNAPSHOT_PATH`` and joined with ``snapshot_id``.

        :param snapshot_id: Id of the snapshot to open. If it is empty while
            not logged in, the environment variable named by
            ``EnvConfigEnum.SNAPSHOT_ID`` is used instead.
        :return: Snapshot object built from the resolved path
        :raises KeyError: if a required environment variable is not set
        """
        login_instance = Login()
        if login_instance.logged_in:
            # snapshot_root is joined with a literal "/" rather than
            # os.path.join -- presumably it can be a remote URI; TODO confirm.
            snapshot_path = login_instance.snapshot_root + "/" + snapshot_id
            return cls(path=snapshot_path)

        snapshot_dir = os.environ[EnvConfigEnum.SNAPSHOT_PATH.value]
        # Honour the id requested by the caller; the environment variable is
        # only a fallback, it must not silently override the argument.
        if not snapshot_id:
            snapshot_id = os.environ[EnvConfigEnum.SNAPSHOT_ID.value]
        return cls(path=os.path.join(snapshot_dir, snapshot_id))



[docs]
    @classmethod
    def download(cls, snapshot_id: str, destination_dir: str,
                 exclude_images: bool = False,
                 exclude_ocr: bool = False,
                 exclude_pdf: bool = False,
                 exclude_html: bool = False,
                 require_login: bool = True,
                 snapshot_root: str = None
                 ) -> Tuple['Snapshot', str]:
        """Downloads snapshot to specified destination"""
        if require_login:
            if snapshot_root is not None:
                raise ValueError("If the require_login is True, "
                                 "snapshot_root should not be"
                                 " provided")
            login_instance = Login()
            if not login_instance.logged_in:
                raise AuthenthicationError()
            snapshot_path = login_instance.snapshot_root + "/" + snapshot_id
        else:
            if snapshot_root is None:
                raise ValueError("If require_login is False, "
                                 "snapshot_root` should be "
                                 "provided")
            snapshot_path = snapshot_root + "/" + snapshot_id
        exclude = cls._get_exclude_patterns(
            exclude_images=exclude_images,
            exclude_ocr=exclude_ocr,
            exclude_pdf=exclude_pdf,
            exclude_html=exclude_html
        )
        downloader = SnapshotDownloader()
        downloader.download(snapshot_path, destination_dir, exclude)
        summary_hash = directory_summary_hash(destination_dir)
        with open(os.path.join(destination_dir, HASH_FILE), 'w') as f:
            f.write(summary_hash)
        logging.info(f"Snapshot {snapshot_id} downloaded to "
                     f"{destination_dir}. To use the snapshot, check our "
                     f"documentation at: "
                     "http://pycognaize-docs.com."
                     "s3-website.us-east-2.amazonaws.com")

        return cls(path=snapshot_path), \
            os.path.join(destination_dir, snapshot_id)


    @classmethod
    def _snapshot_path(cls) -> str:
        """Resolve the snapshot path from the login session or environment.

        The snapshot id always comes from the environment variable named by
        ``EnvConfigEnum.SNAPSHOT_ID``. The root is the login session's
        ``snapshot_root`` when logged in, otherwise the environment variable
        named by ``EnvConfigEnum.SNAPSHOT_PATH``.

        :return: Path of the snapshot
        """
        session = Login()
        if not session.logged_in:
            # No session: both the root and the id come from the environment
            base_dir = os.environ[EnvConfigEnum.SNAPSHOT_PATH.value]
            env_snapshot_id = os.environ[EnvConfigEnum.SNAPSHOT_ID.value]
            return os.path.join(base_dir, env_snapshot_id)

        # Logged in: root comes from the session, joined with a literal "/"
        session_root = session.snapshot_root
        env_snapshot_id = os.environ[EnvConfigEnum.SNAPSHOT_ID.value]
        return session_root + "/" + env_snapshot_id

    @classmethod
    def _get_exclude_patterns(
            cls,
            exclude_images,
            exclude_ocr,
            exclude_pdf,
            exclude_html
    ):
        exclude = []

        if exclude_images:
            exclude.append('*/images/*.jpeg')

        if exclude_ocr:
            exclude.append('*/data/*.json')

        if exclude_pdf:
            exclude.append('*.pdf')

        if exclude_html:
            exclude.append('*.html')

        return exclude


[docs]
    @classmethod
    def get(cls) -> 'Snapshot':
        """Build the Snapshot located at the path from ``_snapshot_path``.

        :return: Snapshot object for the resolved path
        """
        resolved_path = cls._snapshot_path()
        return cls(path=resolved_path)