import enum
import json
from typing import Optional, List

from langchain_core.documents import Document

from dataiku.langchain.content_part_types import ImageRefPart, ImageRetrieval, get_image_parts
from dataiku.langchain.document_handler import RetrievalSource
from dataiku.langchain.metadata_generator import DKU_DOCUMENT_INFO
from dataiku.langchain.multimodal_content import MultimodalContent
from dataiku.llm.types import FileRef, SourceItem, SourcesSettings, ImageRef


@enum.unique
class SourcesType(enum.Enum):
    """Kinds of source item; the member value is what gets stored under the item's "type" key."""

    # Used when no file reference could be built for the document.
    SIMPLE_DOCUMENT = "SIMPLE_DOCUMENT"
    # Used when the document carries a file reference (folder id + path).
    FILE_BASED_DOCUMENT = "FILE_BASED_DOCUMENT"


# Maps a sources-settings key (whose value names a document metadata field) to the
# source-item key that receives that field's value. Every settings key is the
# source-item key suffixed with "Metadata":
#   titleMetadata -> title, urlMetadata -> url, thumbnailURLMetadata -> thumbnailURL
SETTINGS_KEY_TO_SOURCE_KEY = {
    f"{source_key}Metadata": source_key
    for source_key in ("title", "url", "thumbnailURL")
}

# Maps the "snippetFormat" setting to the source-item key under which the snippet
# is stored. Each key is the lower-cased format name suffixed with "Snippet":
#   TEXT -> textSnippet, MARKDOWN -> markdownSnippet, HTML -> htmlSnippet
SNIPPET_FORMAT_TO_SOURCE_KEY = {
    snippet_format: f"{snippet_format.lower()}Snippet"
    for snippet_format in ("TEXT", "MARKDOWN", "HTML")
}


def _get_file_based_document_info(doc: "Document") -> Optional[FileRef]:
    """Build a file reference from the DKU document info stored in a document's metadata.

    The value stored under the ``DKU_DOCUMENT_INFO`` metadata key is parsed as JSON and
    its ``source_file`` section is inspected.

    :param doc: document whose ``metadata`` mapping is read.
    :return: a ``FileRef`` (folder id, path, plus the optional ``page_range`` and
        ``section_outline`` entries of the document info) when both ``folder_full_id``
        and ``path`` are present in ``source_file``; ``None`` when the metadata key is
        absent, when either value is missing, or when the parsed JSON does not have
        the expected object shape.
    :raises ValueError: if the metadata value cannot be parsed as JSON.
    """
    if DKU_DOCUMENT_INFO not in doc.metadata:
        return None

    raw_document_info = doc.metadata[DKU_DOCUMENT_INFO]
    try:
        document_info = json.loads(raw_document_info)
    except Exception as e:
        # Any parsing failure is reported as a ValueError; the original error is kept
        # as the cause so the traceback still shows where the payload is malformed.
        raise ValueError(f"Metadata {raw_document_info} is not a valid json: {e}") from e

    # Valid JSON is not necessarily an object (e.g. "null" or "[]"): in that case there
    # is no file information to extract.
    if not isinstance(document_info, dict):
        return None

    # "source_file" may be missing or explicitly null; both mean "no source file".
    source_file = document_info.get("source_file") or {}
    if not isinstance(source_file, dict):
        return None

    full_folder_id = source_file.get("folder_full_id")
    path = source_file.get("path")

    # A file reference is produced only when both the full folder id and the path
    # are present; otherwise the whole section is omitted.
    if full_folder_id is None or path is None:
        return None

    return FileRef(
        folderId=full_folder_id,
        path=path,
        pageRange=document_info.get("page_range"),
        sectionOutline=document_info.get("section_outline"),
    )


class SourcesHandler:
    """Builds ``SourceItem`` entries from retrieved documents, driven by sources settings."""

    def __init__(self,
                 sources_settings: SourcesSettings,
                 full_folder_id: Optional[str] = None,
                 retrieval_source: RetrievalSource = RetrievalSource.EMBEDDING,
                 retrieval_column: Optional[str] = None):
        """Store the configuration used when building source items.

        :param sources_settings: mapping read for "metadataInSources", "titleMetadata",
            "urlMetadata", "thumbnailURLMetadata", "snippetMetadata" and "snippetFormat".
        :param full_folder_id: folder id forwarded to the multimodal content helpers.
        :param retrieval_source: selects where the snippet is taken from.
        :param retrieval_column: metadata key used for the snippet when the retrieval
            source is ``RetrievalSource.CUSTOM``.
        """
        self.sources_settings: SourcesSettings = sources_settings
        self.full_folder_id = full_folder_id
        self.retrieval_source = retrieval_source
        self.retrieval_column = retrieval_column

    def build_role_based_source_from(self, doc: "Document") -> SourceItem:
        """Build the source item describing ``doc``.

        The item may contain, in this order: the selected metadata, the title / url /
        thumbnail values, the file reference, the item type, the snippet (stored under
        the key matching the configured snippet format) and the image references.

        :param doc: retrieved document (its ``metadata`` and ``page_content`` are read).
        :return: the populated ``SourceItem``.
        :raises KeyError: if the configured "snippetFormat" is not a known format.
        :raises ValueError: if the document's DKU document info is not valid JSON.
        """
        source_item: SourceItem = SourceItem()

        selected_metadata = self.sources_settings.get("metadataInSources")
        if selected_metadata is not None:
            source_item["metadata"] = {k: v for k, v in doc.metadata.items() if k in selected_metadata}

        # Each of these settings names the metadata field holding the value to expose.
        for setting_key, source_key in SETTINGS_KEY_TO_SOURCE_KEY.items():
            metadata_key = self.sources_settings.get(setting_key)
            if metadata_key and metadata_key in doc.metadata:
                source_item[source_key] = doc.metadata[metadata_key]

        file_ref = _get_file_based_document_info(doc)
        if file_ref is not None:
            source_item["fileRef"] = file_ref
            source_item["type"] = SourcesType.FILE_BASED_DOCUMENT.value
        else:
            source_item["type"] = SourcesType.SIMPLE_DOCUMENT.value

        multimodal_content = MultimodalContent.from_doc(doc, self.full_folder_id)

        # An explicit null "snippetFormat" bypasses the .get() default, so it is
        # normalized here; an unknown format still raises KeyError below.
        snippet_format = self.sources_settings.get("snippetFormat") or "TEXT"
        source_item[SNIPPET_FORMAT_TO_SOURCE_KEY[snippet_format]] = self._select_snippet_data(doc, multimodal_content)

        if (self.retrieval_source == RetrievalSource.MULTIMODAL
                and multimodal_content
                and multimodal_content.type == MultimodalContent.Type.IMAGES):
            source_item["imageRefs"] = self._build_image_refs(multimodal_content)

        return source_item

    def _select_snippet_data(self, doc: "Document", multimodal_content):
        """Return the snippet for ``doc``; the first matching rule below wins."""
        snippet_metadata = self.sources_settings.get("snippetMetadata")
        if snippet_metadata is not None and snippet_metadata in doc.metadata:
            # An explicitly configured snippet metadata field takes precedence.
            return doc.metadata[snippet_metadata]

        if self.retrieval_source == RetrievalSource.MULTIMODAL and multimodal_content:
            if multimodal_content.type == MultimodalContent.Type.TEXT:
                return multimodal_content.content
            if multimodal_content.type == MultimodalContent.Type.IMAGES:
                # No text snippet for images: only the image references are exposed.
                return None
            # Any other content type previously left the snippet unassigned and crashed
            # with UnboundLocalError; use the generic page-content fallback instead.
            # NOTE(review): confirm this is the desired snippet for other content types.
            return doc.page_content

        if self.retrieval_source == RetrievalSource.CUSTOM and self.retrieval_column is not None:
            # Custom retrieval: the snippet comes from the configured metadata column.
            return doc.metadata.get(self.retrieval_column, "")

        # Fallback to the page content of the document.
        return doc.page_content

    def _build_image_refs(self, multimodal_content) -> List[ImageRef]:
        """Convert the image parts of ``multimodal_content`` into image references."""
        parts = get_image_parts(multimodal_content, None, ImageRetrieval.IMAGE_REF, self.full_folder_id)
        # In case of images we can have multiple parts, all of the same type.
        image_refs: List[ImageRef] = []
        for part in parts:
            assert isinstance(part, ImageRefPart), "We should have an ImageRefPart type"
            image_refs.append({
                "folderId": part.full_folder_id,
                "path": part.path
            })
        return image_refs
