/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.recipes.nlp.embed_documents;

import com.dataiku.dip.dataflow.JobActivity;
import com.dataiku.dip.docextraction.ScreenshotterService;
import com.dataiku.dip.docextraction.StructuredExtractor;
import com.dataiku.dip.docextraction.VLMExtractor;
import com.dataiku.dip.docextraction.common.InputRefs;
import com.dataiku.dip.llm.EnrichedLLMStructuredRef;
import com.dataiku.dip.managedfolder.ManagedFolder;
import com.dataiku.dip.recipes.nlp.common.doc_extraction.DocExtractionRule;
import com.dataiku.dip.recipes.nlp.common.doc_extraction.DocExtractionRuleApplier;
import com.dataiku.dip.recipes.nlp.common.doc_extraction.ExtractedData;
import com.dataiku.dip.recipes.nlp.common.doc_extraction.chunks.SingleExtractedChunk;
import com.dataiku.dip.recipes.nlp.embed_documents.EmbedDocumentsRecipeParams;
import com.dataiku.dip.security.AuthCtx;
import com.dataiku.dip.security.tickets.APITicketService;
import com.dataiku.dip.shaker.processors.expr.RecursiveCharacterTextSplitter;
import com.dataiku.dip.utils.DKULogger;
import com.google.common.annotations.VisibleForTesting;
import java.util.List;
import javax.annotation.Nullable;

/**
 * {@link DocExtractionRuleApplier} specialisation used by the embed-documents recipe.
 *
 * <p>The extraction rules, the "all other files" rule and the default VLM extraction prompt
 * template are taken from the {@link EmbedDocumentsRecipeParams} handed to the constructor.
 * The overrides below configure the parent's hooks for this recipe and implement the
 * post-extraction splitting of chunks with a {@link RecursiveCharacterTextSplitter}.
 */
public class EmbedDocumentsRuleApplier
extends DocExtractionRuleApplier {
    // NOTE(review): this field is never assigned anywhere in this class (the constructor only
    // forwards its embeddingModelRef argument to the recipe params) — confirm whether it is set
    // from elsewhere in the package or is simply unused.
    EnrichedLLMStructuredRef embeddingModelRef;
    static DKULogger logger = DKULogger.getLogger("dku.recipes.nlp.embed_doc_rule_applier");

    /**
     * Builds the applier by forwarding everything to the parent constructor.
     *
     * @param recipeParams source of the rules, the "all other files" rule and the default VLM
     *                     extraction prompt template
     * @param embeddingModelRef embedding model reference, passed to {@code recipeParams} when
     *                          resolving the rules
     * @param outfolder optional output managed folder (may be {@code null})
     * @param authCtx forwarded as-is to the parent
     * @param projectKey forwarded as-is to the parent
     * @param ticketService forwarded as-is to the parent
     * @param activity forwarded as-is to the parent
     * @param screenshotterService forwarded as-is to the parent
     */
    public EmbedDocumentsRuleApplier(EmbedDocumentsRecipeParams recipeParams, EnrichedLLMStructuredRef embeddingModelRef, @Nullable ManagedFolder outfolder, AuthCtx authCtx, String projectKey, APITicketService ticketService, JobActivity activity, ScreenshotterService screenshotterService) {
        super(recipeParams.getRules(embeddingModelRef), recipeParams.getAllOtherFilesRule(embeddingModelRef), recipeParams.getDefaultVLMExtractionPromptTemplate(), outfolder, authCtx, projectKey, ticketService, activity, screenshotterService);
    }

    /** @return always {@code true} for this applier */
    @Override
    protected boolean useSeparatedChunksForImages() {
        return true;
    }

    /** @return always {@code true} for this applier */
    @Override
    protected boolean prependFullOutlineToEachChunksPriorSplitting() {
        return true;
    }

    /**
     * @param ruleToApply rule being applied to the current document
     * @return {@code true} only when the rule stores {@code IMAGES} in the multimodal column
     */
    @Override
    protected boolean isScreenshotStorageRequired(DocExtractionRule ruleToApply) {
        return DocExtractionRule.MultimodalContentType.IMAGES.equals(ruleToApply.storeInMultimodalColumn);
    }

    /**
     * @param structuredSettings settings of the structured extractor
     * @return {@code true} only when the image handling mode is {@code VLM_ANNOTATE}
     */
    @Override
    protected boolean isImagesStorageRequired(StructuredExtractor.StructuredExtractorSettings structuredSettings) {
        return StructuredExtractor.ImageHandlingMode.VLM_ANNOTATE.equals(structuredSettings.imageHandlingMode);
    }

    /**
     * Derives the VLM extraction prompt from the prompt template and the rule's chunk size.
     *
     * @param ruleToApply rule whose {@code splittingSettings.chunkSizeCharacters} is used as the
     *                    character limit; {@code splittingSettings} is dereferenced without a
     *                    null check
     * @return the prompt produced by {@link VLMExtractor#getExtractionPromptFromCharsLimit}
     */
    @Override
    protected String getVLMExtractionPromptFrom(DocExtractionRule ruleToApply) {
        return VLMExtractor.getExtractionPromptFromCharsLimit(this.vlmExtractionPromptTemplate, ruleToApply.splittingSettings.chunkSizeCharacters);
    }

    /**
     * Derives the default VLM annotation prompt from the rule's chunk size.
     *
     * @param ruleToApply rule whose {@code splittingSettings.chunkSizeCharacters} is used as the
     *                    character limit; {@code splittingSettings} is dereferenced without a
     *                    null check
     * @return the prompt produced by
     *         {@link StructuredExtractor#getVlmDefaultAnnotationPromptFromCharsLimit}
     */
    @Override
    protected String getVLMAnnotationPromptFrom(DocExtractionRule ruleToApply) {
        return StructuredExtractor.getVlmDefaultAnnotationPromptFromCharsLimit(ruleToApply.splittingSettings.chunkSizeCharacters);
    }

    /**
     * Splits the embed value of every extracted chunk according to the rule's splitting settings.
     *
     * <p>When the rule has no splitting settings a warning is logged and the input is returned
     * untouched. Otherwise a new {@link ExtractedData} is built with the same extractor engine,
     * source document and assets storage path, and filled by calling {@code createChunks} once
     * per input chunk with the pieces returned by the text splitter.
     *
     * @param extractedData extraction result whose chunks are to be split
     * @param ruleToApply rule providing the chunk size / overlap and the multimodal column type
     * @param document document being processed, used only to build log messages
     * @return the newly built split data, or {@code extractedData} itself when the rule has no
     *         splitting settings
     */
    @Override
    @VisibleForTesting
    public ExtractedData splitExtractedData(ExtractedData extractedData, DocExtractionRule ruleToApply, InputRefs.ManagedFolderDocumentRef document) {
        if (ruleToApply.splittingSettings == null) {
            // The (Object) casts on logger calls come from the decompiler and are kept so the
            // same logger overload keeps being selected.
            logger.warn((Object)this.buildMessageLogForDocument(document, "No splitting settings found, skipping splitting"));
            return extractedData;
        }

        // Constant-first comparisons: an unset multimodal column type evaluates to false
        // instead of throwing a NullPointerException.
        boolean usePostChunkingValuesForMultimodalCol = DocExtractionRule.MultimodalContentType.FULL_CONTENT.equals(ruleToApply.storeInMultimodalColumn)
                || DocExtractionRule.MultimodalContentType.CHUNKED_PROMPT_OUTPUT.equals(ruleToApply.storeInMultimodalColumn);
        logger.info((Object)this.buildMessageLogForDocument(document, "Performing splitting on embedded chunks (using " + (usePostChunkingValuesForMultimodalCol ? "chunked" : "un-chunked") + " values for content stored for retrieval)"));

        RecursiveCharacterTextSplitter.Parameter param = new RecursiveCharacterTextSplitter.Parameter();
        param.chunkSize = ruleToApply.splittingSettings.chunkSizeCharacters;
        param.chunkOverlap = ruleToApply.splittingSettings.chunkOverlapCharacters;
        param.validate();
        RecursiveCharacterTextSplitter splitter = new RecursiveCharacterTextSplitter(param);

        ExtractedData result = new ExtractedData(extractedData.extractorEngine, extractedData.sourceDocument);
        result.assetsStoragePath = extractedData.assetsStoragePath;
        for (SingleExtractedChunk chunk : extractedData.chunks) {
            List<String> embedValues = splitter.splitText(chunk.embedValue);
            result.createChunks(embedValues, chunk, usePostChunkingValuesForMultimodalCol);
        }

        // Report the number of records produced by the split, not the pre-split chunk count.
        logger.info((Object)this.buildMessageLogForDocument(document, "Splitting expanded the document into " + result.chunks.size() + " records to embed"));
        return result;
    }
}

