import json
from typing import TYPE_CHECKING, List, Optional

from dataikuapi.dss.document_extractor import ManagedFolderDocumentRef


if TYPE_CHECKING:
    from dataiku.core.vector_stores.dku_vector_store import DkuVectorStore
    from langchain_core.documents import Document


class DocumentMetadataFormatter:
    """
    Helper class to format vector store documents metadata for usage within
    Dataiku.

    The ``with_*`` methods record pieces of metadata on the formatter and
    return the formatter itself so that calls can be chained.
    :meth:`format_metadata` then writes the recorded values, JSON encoded,
    into the metadata of a document.

    .. important::
        Do not create this class directly, use
        :meth:`VectorStoreWriter.get_metadata_formatter()` instead.
    """

    def __init__(self, project_key: str, vector_store_implementation):
        """
        :param project_key: The default project key, used when a method is
            not given an explicit one.
        :param vector_store_implementation: The vector store object whose
            ``transform_document_before_load`` method finalizes documents in
            :meth:`format_metadata`.
        """
        self._project_key = project_key
        self._dku_vs: 'DkuVectorStore' = vector_store_implementation
        self._security_tokens: List[str] = []
        self._document_info: Optional[dict] = None
        self._multimodal_content: Optional[dict] = None

    @staticmethod
    def _checked_str_list(value, name: str) -> List[str]:
        """
        Validates that `value` is a list (or tuple) of strings and returns a
        fresh list with the same items.

        The copy keeps the formatter state independent from later mutations
        of the sequence passed by the caller.

        :param value: The value to validate.
        :param name: The parameter name to use in the error message.
        :raises ValueError: If `value` is not a list/tuple of str.
        """
        if not (
            isinstance(value, (list, tuple)) and
            all(isinstance(item, str) for item in value)
        ):
            raise ValueError(f"{name} must be a list[str]")
        return list(value)

    def _ensure_document_info(self) -> dict:
        """
        Returns the document info dict, creating it on first use.
        """
        if self._document_info is None:
            self._document_info = {}
        return self._document_info

    def with_security_tokens(
            self,
            security_tokens: List[str]
    ) -> 'DocumentMetadataFormatter':
        """
        Adds the security tokens in the metadata.

        :param security_tokens: The security tokens. ``None`` is treated as
            "no security tokens".
        :raises ValueError: If `security_tokens` is not a list of str. A bare
            string is rejected: it would otherwise be serialized as a single
            JSON string instead of a list of tokens.
        :return: This formatter, for chaining.
        """
        if security_tokens is None:
            self._security_tokens = []
        else:
            self._security_tokens = self._checked_str_list(
                security_tokens, "security_tokens"
            )
        return self

    def with_original_document(
            self,
            folder_id: str,
            path: str,
            project_key: Optional[str] = None
    ) -> 'DocumentMetadataFormatter':
        """
        Adds the original document information in the metadata.

        :param folder_id: The id of the managed folder that contains the
            original document.
        :param path: The original document path in the managed folder.
        :param project_key: The managed folder project key. Defaults to the
            project key of the knowledge bank.
        :return: This formatter, for chaining.
        """
        if project_key is None:
            project_key = self._project_key

        self._ensure_document_info()["source_file"] = {
            "folder_full_id": f"{project_key}.{folder_id}",
            "path": path
        }
        return self

    def with_original_document_ref(
            self,
            document_ref: 'ManagedFolderDocumentRef',
            project_key: Optional[str] = None
    ) -> 'DocumentMetadataFormatter':
        """
        Adds the original document information in the metadata.

        :param document_ref: The reference to the original document. Its
            ``managed_folder_id`` and ``file_path`` attributes are used.
        :param project_key: The managed folder project key. Defaults to the
            project key of the knowledge bank.
        :return: This formatter, for chaining.
        """
        return self.with_original_document(
            folder_id=document_ref.managed_folder_id,
            path=document_ref.file_path,
            project_key=project_key
        )

    def with_original_document_page_range(
            self,
            page_start: int,
            page_end: int
    ) -> 'DocumentMetadataFormatter':
        """
        Adds the page range in the original document. This metadata is intended
        to start at index 1.

        :param page_start: The original document page where the extract
            started. Must be positive, and lower or equal to `page_end`.
        :param page_end: The original document page where the extract ended.
            Must be positive, and greater or equal to `page_start`.
        :raises ValueError: If a bound is not positive, or if `page_start` is
            greater than `page_end`.
        :return: This formatter, for chaining.
        """
        if page_start <= 0:
            raise ValueError(f"page_start must be positive, received {page_start}")

        if page_end <= 0:
            raise ValueError(f"page_end must be positive, received {page_end}")

        if page_start > page_end:
            raise ValueError(f"page_start ({page_start}) must be lower or equal to page_end ({page_end})")

        # Only touch the state once every check has passed.
        self._ensure_document_info()["page_range"] = {
            "start": page_start,
            "end": page_end
        }
        return self

    def with_original_document_section_outline(
            self,
            section_outline: List[str]
    ) -> 'DocumentMetadataFormatter':
        """
        Adds a section outline in the metadata. Section outlines can be derived
        from the document extracted content. For example, it may contain the
        titles of the sections that contain this part of the original
        document, from top level headers to lower level headers.

        :param section_outline: The section outline.
        :raises ValueError: If `section_outline` is not a list of str.
        :return: This formatter, for chaining.
        """
        outline = self._checked_str_list(section_outline, "section_outline")
        self._ensure_document_info()["section_outline"] = outline
        return self

    def with_retrieval_content(
            self,
            text: Optional[str] = None,
            image_paths: Optional[List[str]] = None
    ) -> 'DocumentMetadataFormatter':
        """
        Adds the retrieval content in the metadata. Exclusively accepts either
        text content or image paths relative to the knowledge bank images
        folder. A later call replaces the content set by an earlier one.

        :param text: The text content.
        :param image_paths: The paths to the images, relative to the managed
            folder that is configured in the knowledge bank.
        :raises ValueError: If both or none of `text` and `image_paths` are
            provided, if `text` is not a str, or if `image_paths` is not a
            list of str.
        :return: This formatter, for chaining.
        """
        # Exactly one of the two arguments must be provided.
        if (text is None) == (image_paths is None):
            raise ValueError("Either text or image_paths must be set")

        if text is not None:
            if not isinstance(text, str):
                raise ValueError("text must be a str")

            self._multimodal_content = {
                "type": "text",
                "content": text
            }
        else:
            self._multimodal_content = {
                "type": "images",
                "content": self._checked_str_list(image_paths, "image_paths")
            }

        return self

    def format_metadata(self, document: 'Document') -> 'Document':
        """
        Formats the metadata in the provided document, so that it can be used
        for retrieval in Dataiku.

        The values recorded on this formatter are written as JSON strings
        under the ``DKU_MULTIMODAL_CONTENT``, ``DKU_DOCUMENT_INFO`` and
        ``DKU_SECURITY_TOKENS`` metadata keys. A key is only written when the
        corresponding value has been set and is not empty. The document
        metadata is modified in place.

        :param document: The Langchain document which metadata must be formatted.
        :return: The document returned by the vector store implementation
            after its own transformation.
        """
        metadata = document.metadata

        if self._multimodal_content:
            metadata["DKU_MULTIMODAL_CONTENT"] = json.dumps(self._multimodal_content)

        if self._document_info:
            metadata["DKU_DOCUMENT_INFO"] = json.dumps(self._document_info)

        if self._security_tokens:
            metadata["DKU_SECURITY_TOKENS"] = json.dumps(self._security_tokens)

        # for debug / troubleshooting
        metadata["DKU_API_GENERATED"] = "true"

        # use vector store specific code to format metadata accordingly
        # takes into account data types from the metadata schema, security tokens, etc...
        return self._dku_vs.transform_document_before_load(document)
