import dataclasses
import logging
from typing import Optional, List, Dict, Any

import pandas as pd

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Key of the multimodal content: read from the row and re-emitted under the same key in the metadata
DKU_MULTIMODAL_CONTENT = "DKU_MULTIMODAL_CONTENT"
# Key of the document info: read from the row and re-emitted under the same key in the metadata
DKU_DOCUMENT_INFO = "DKU_DOCUMENT_INFO"
# Metadata key under which the value of the configured security tokens column is stored
DKU_SECURITY_TOKENS_META = "DKU_SECURITY_TOKENS"


@dataclasses.dataclass(frozen=True)
class MetadataGenerator:
    """
    Builds the metadata dictionary of a row, combining the user-selected columns with the
    dataiku internal information used at retrieval time.
    """

    # Names of the row columns copied into the metadata (columns absent from the row are skipped)
    metadata_columns: List[str]
    # Optional name of the column holding the source id; when set, it must be present on the row
    source_id_column: Optional[str] = None
    # Optional name of the column holding the security tokens
    security_tokens_column: Optional[str] = None

    def to_metadata(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """
        Completes the specified metadata with the dataiku internal information, used at retrieval time.

        :param row: mapping of column name to value for a single record
        :return: a new dict holding, in this merge order: the metadata columns found on the row, the
            source id column (if configured), the multimodal content (if on the row), the security
            tokens (if configured and not missing) and the document info (if on the row).
            On key collision, the later entry wins.
        :raises KeyError: if ``source_id_column`` is configured but absent from ``row``
        """

        # Being extra safe with key access, only get the sources if they are present on the row
        sources = {col_name: row[col_name] for col_name in self.metadata_columns if col_name in row}

        # Direct access on purpose: a configured source id column that is missing from the row raises KeyError
        source_id = {self.source_id_column: row[self.source_id_column]} if self.source_id_column is not None else {}

        multimodal_info = {DKU_MULTIMODAL_CONTENT: row[DKU_MULTIMODAL_CONTENT]} if DKU_MULTIMODAL_CONTENT in row else {}
        document_info = {DKU_DOCUMENT_INFO: row[DKU_DOCUMENT_INFO]} if DKU_DOCUMENT_INFO in row else {}

        security_tokens_info: Dict[str, Any] = {}
        if self.security_tokens_column is not None:
            security_tokens = row.get(self.security_tokens_column, None)
            # pd.isna() is element-wise on list-likes (list, ndarray, Series): negating its result raises
            # "truth value of an array is ambiguous" as soon as the tokens hold more than one element.
            # Only scalars can be "missing" (None, NaN, NA, NaT); any container is a present value.
            tokens_missing = pd.api.types.is_scalar(security_tokens) and pd.isna(security_tokens)
            if not tokens_missing:
                security_tokens_info = {DKU_SECURITY_TOKENS_META: security_tokens}

        return {
            **sources,  # Original metadata
            **source_id,  # Optional column to reference doc id for RecordManager indexing
            **multimodal_info,  # Optional column containing multimodal info
            **security_tokens_info,  # Optional security tokens, skipped when missing
            **document_info,  # Settings to identify the metadata info at loading time
        }

