/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.docextraction;

import com.dataiku.common.stereotype.PartOfPublicAPI;
import com.dataiku.dip.connections.AbstractSQLConnection;
import com.dataiku.dip.docextraction.StructuredContent;
import com.dataiku.dip.docextraction.StructuredContentDTO;
import com.dataiku.dip.docextraction.common.InputRefs;
import com.dataiku.dip.docextraction.common.chunks.StructuredExtractionChunk;
import com.dataiku.dip.recipes.nlp.rag_embedding.RAGEmbeddingRecipeCreator;
import com.dataiku.dip.utils.ExceptionUtils;
import com.dataiku.dss.shadelib.com.google.common.annotations.VisibleForTesting;
import com.vladsch.flexmark.ast.HardLineBreak;
import com.vladsch.flexmark.ast.Heading;
import com.vladsch.flexmark.ast.SoftLineBreak;
import com.vladsch.flexmark.parser.Parser;
import com.vladsch.flexmark.util.ast.Document;
import com.vladsch.flexmark.util.ast.Node;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import org.apache.commons.lang.StringUtils;

public class StructuredExtractor {
    /**
     * Format template of the default prompt sent to a vision-language model to annotate a document image.
     * It contains exactly one {@code %d} placeholder (the maximum number of characters of the markdown
     * answer), which {@link #getVlmDefaultAnnotationPrompt(Integer)} fills in through a format call.
     */
    private static final String VLM_ANNOTATION_EXTRACTION_PROMPT = "You are analyzing a document image to extract structured information for downstream processing and retrieval (RAG). Your goal is to extract all meaningful, reusable content in a markdown format up to %d characters.\nInstructions:\n1. Classify the image type: diagram, chart, table, handwriting, printed text, or mixed.\n2. Extract and describe based on type:\n- Diagram or Chart: Provide a detailed description of the structure, relationships, labels, axes, values, legends, and overall meaning. Convert all meaningful content into text. Summarize insights if relevant.\n- Handwritten or Printed Text: Transcribe all visible text accurately. Preserve formatting and structure. Use markdown headings, bullets, or code blocks as appropriate.\n- Tables: Reconstruct the table as markdown. Ensure all rows and columns are preserved. Retain headings and numerical precision.\n- Mixed Content: Separate and extract each part individually (e.g., transcribe text, describe chart, and format table).\n3. Preserve structure and context: Use markdown to reflect hierarchy, formatting, and clarity.\n4. Avoid assumptions or hallucinations. If any part is unclear, describe it as such.\n5. Ensure each section is a reusable, standalone chunk.\nIf the image is a logo, signature, or QR code, don't describe it and return an empty string.\n";

    /**
     * Builds the default VLM annotation prompt.
     *
     * @param maxTokensLimit optional token limit, forwarded as-is to
     *        {@code RAGEmbeddingRecipeCreator.adaptDefaultChunkSizeCharacters}; may be {@code null}
     * @return the prompt template with its character-budget placeholder replaced by the value
     *         returned by {@code adaptDefaultChunkSizeCharacters}
     */
    public static String getVlmDefaultAnnotationPrompt(@Nullable Integer maxTokensLimit) {
        return VLM_ANNOTATION_EXTRACTION_PROMPT.formatted(
                RAGEmbeddingRecipeCreator.adaptDefaultChunkSizeCharacters(maxTokensLimit));
    }

    /**
     * Turns raw markdown into a tree of sections driven by the markdown headings.
     *
     * <p>Only the direct children of the parsed flexmark document are visited. A heading whose level is
     * at most {@code maxSectionDepth} opens a new section, attached to the closest open section with a
     * strictly lower level. A deeper heading is kept as plain text (followed by a newline). The source
     * characters of every other node are accumulated and attached as a single text item to the section
     * that is open when the next section starts, or at the end of the input.
     *
     * @param rawMarkdown markdown source to analyze
     * @param maxSectionDepth deepest heading level allowed to open a section; {@code 0} skips parsing
     *        and wraps {@code rawMarkdown} in a single text item
     * @return the root document holding the extracted sections and text items
     */
    public static StructuredContent runMarkdownStructuredExtraction(String rawMarkdown, int maxSectionDepth) {
        StructuredContent.Document root = new StructuredContent.Document();
        if (maxSectionDepth == 0) {
            root.addChild(new StructuredContent.Text(rawMarkdown));
            return root;
        }
        Document parsed = Parser.builder().build().parse(rawMarkdown);
        // Chain of currently open sections, innermost on top. The root is pushed first and never popped,
        // so the deque is never empty and every new section always has a parent to attach to.
        ArrayDeque<StructuredContent.Section> openSections = new ArrayDeque<>();
        openSections.push(root);
        StringBuilder pendingText = new StringBuilder();
        for (Node node : parsed.getChildren()) {
            if (node instanceof Heading) {
                Heading heading = (Heading) node;
                int level = heading.getLevel();
                if (level > maxSectionDepth) {
                    // Too deep to become a section: keep the heading line as regular text.
                    pendingText.append(node.getChars()).append("\n");
                    continue;
                }
                StructuredContent.Section newSection = new StructuredContent.Section(level, heading.getChars().toString());
                // Text seen so far belongs to the section that was open before this heading.
                flushPendingText(pendingText, openSections.peek());
                // Close every section that is not an ancestor of the new one, but always keep the root.
                while (openSections.size() > 1 && openSections.peek().level >= level) {
                    openSections.pop();
                }
                openSections.peek().addChild(newSection);
                openSections.push(newSection);
            } else if (node instanceof SoftLineBreak || node instanceof HardLineBreak) {
                pendingText.append("\n");
            } else {
                pendingText.append(node.getChars());
            }
        }
        flushPendingText(pendingText, openSections.peek());
        return root;
    }

    /** Attaches the accumulated text (if any) to {@code target} as a text item, then empties the buffer. */
    private static void flushPendingText(StringBuilder pendingText, StructuredContent.Section target) {
        if (pendingText.length() > 0) {
            target.addChild(new StructuredContent.Text(pendingText.toString()));
            pendingText.setLength(0);
        }
    }

    /**
     * Flattens a structured content tree into a list of chunks, starting with an empty outline.
     *
     * @param structure root of the tree to flatten; {@code null} yields an empty list
     * @param useSeparatedChunksForImages when {@code true}, images get their own chunk instead of
     *        being merged with the neighbouring text
     * @return the chunks produced by the recursive overload of this method
     */
    public static List<StructuredExtractionChunk> flattenTreeAndMergeContinuousSectionContent(StructuredContent structure, boolean useSeparatedChunksForImages) {
        List<String> rootOutline = Collections.emptyList();
        return flattenTreeAndMergeContinuousSectionContent(structure, rootOutline, useSeparatedChunksForImages);
    }

    /**
     * Recursively flattens {@code item} into chunks, carrying the outline (titles of the enclosing
     * sections) down the tree.
     *
     * <ul>
     *   <li>{@code null} produces no chunk.</li>
     *   <li>A non-section item produces exactly one chunk (an image chunk for images) with the
     *       outline received from the caller.</li>
     *   <li>A section first extends the outline with its title (unless its type is {@code "slide"} or
     *       the title is {@code null}). Without content it produces one empty-text chunk when its title
     *       is not blank, otherwise nothing. With content, consecutive non-section children are merged
     *       and each merged child is turned into chunks; child sections are flattened recursively.</li>
     * </ul>
     *
     * @param item node to flatten, may be {@code null}
     * @param currentOutline titles of the ancestors of {@code item}; never modified
     * @param useSeparatedChunksForImages whether images are kept out of the text merge
     * @return the chunks in document order
     */
    protected static List<StructuredExtractionChunk> flattenTreeAndMergeContinuousSectionContent(StructuredContent item, List<String> currentOutline, boolean useSeparatedChunksForImages) {
        if (item == null) {
            return Collections.emptyList();
        }

        // Leaves: a single chunk, no structure attached.
        if (!(item instanceof StructuredContent.Section)) {
            if (item instanceof StructuredContent.Image) {
                StructuredContent.Image leafImage = (StructuredContent.Image) item;
                return Collections.singletonList(StructuredExtractionChunk.buildForImage(leafImage.toText(), currentOutline, null, leafImage.pageRange, leafImage.caption, leafImage.imageRef));
            }
            return Collections.singletonList(StructuredExtractionChunk.build(item.toText(), currentOutline, null, item.pageRange));
        }

        StructuredContent.Section section = (StructuredContent.Section) item;
        List<String> outline = new ArrayList<>(currentOutline);
        if (!Objects.equals(section.getType(), "slide") && section.title != null) {
            outline.add(section.title);
        }

        // A section without content only survives as a chunk when it has a meaningful title.
        boolean hasContent = section.content != null && !section.content.isEmpty();
        if (!hasContent) {
            if (!StringUtils.isNotBlank(section.title)) {
                return Collections.emptyList();
            }
            return Collections.singletonList(StructuredExtractionChunk.build("", outline, section.cloneWithoutChildren(), section.pageRange));
        }

        List<StructuredExtractionChunk> chunks = new ArrayList<>();
        List<StructuredContent> mergedChildren = mergeNonSectionItems(section.content, useSeparatedChunksForImages);

        // A regular section made only of sub-sections gets an empty-text chunk of its own.
        if (!Objects.equals(section.getType(), "document")
                && !Objects.equals(section.getType(), "slide")
                && mergedChildren.stream().noneMatch(merged -> !(merged instanceof StructuredContent.Section))) {
            chunks.add(StructuredExtractionChunk.build("", outline, section.copyWithImmediateNonSectionChildren(), section.pageRange));
        }

        for (StructuredContent child : mergedChildren) {
            if (child instanceof StructuredContent.Text) {
                StructuredContent.Text textChild = (StructuredContent.Text) child;
                // Prefer the collapsed sub-tree when the section carries one.
                if (section.collapsedSubTree == null) {
                    chunks.add(StructuredExtractionChunk.build(textChild.toText(), outline, section.copyWithImmediateNonSectionChildren(), textChild.pageRange));
                } else {
                    chunks.add(StructuredExtractionChunk.build(textChild.toText(), outline, section.collapsedSubTree, textChild.pageRange));
                }
            } else if (child instanceof StructuredContent.Image) {
                StructuredContent.Image imageChild = (StructuredContent.Image) child;
                chunks.add(StructuredExtractionChunk.buildForImage(imageChild.toText(), outline, section.copyWithImmediateNonSectionChildren(), imageChild.pageRange, imageChild.caption, imageChild.imageRef));
            }
            if (child instanceof StructuredContent.Section) {
                chunks.addAll(flattenTreeAndMergeContinuousSectionContent((StructuredContent.Section) child, outline, useSeparatedChunksForImages));
            }
        }
        return chunks;
    }

    /**
     * Merges every run of consecutive "mergeable" items into a single text item.
     *
     * <p>Sections are never merged; images are not merged either when
     * {@code useSeparatedChunksForImages} is {@code true}. Those items are copied to the result as-is
     * and end the current run. All other items contribute their {@code toText()} to the run; a
     * {@code "\n"} is inserted before a contribution when the previous one does not already end with a
     * newline. The page range of a merged text is the merge of the non-null page ranges of its items.
     *
     * @param items children to process, in document order
     * @param useSeparatedChunksForImages whether images break the runs instead of joining them
     * @return a new list with the merged texts and the untouched items, in document order
     */
    private static List<StructuredContent> mergeNonSectionItems(List<StructuredContent> items, boolean useSeparatedChunksForImages) {
        List<StructuredContent> result = new ArrayList<>();
        List<String> textsToMerge = new ArrayList<>();
        StructuredContent.PageRange currentPageRange = new StructuredContent.PageRange();
        for (StructuredContent item : items) {
            boolean breaksRun = item instanceof StructuredContent.Section
                    || (useSeparatedChunksForImages && item instanceof StructuredContent.Image);
            if (breaksRun) {
                if (!textsToMerge.isEmpty()) {
                    result.add(newMergedText(textsToMerge, currentPageRange));
                    // Start a fresh range: the previous instance now belongs to the emitted text.
                    currentPageRange = new StructuredContent.PageRange();
                    textsToMerge.clear();
                }
                result.add(item);
                continue;
            }
            if (item.pageRange != null) {
                currentPageRange.mergeWith(item.pageRange);
            }
            boolean needsLineBreak = !textsToMerge.isEmpty()
                    && !textsToMerge.get(textsToMerge.size() - 1).endsWith("\n");
            String itemText = item.toText();
            textsToMerge.add(needsLineBreak ? "\n" + itemText : itemText);
        }
        if (!textsToMerge.isEmpty()) {
            result.add(newMergedText(textsToMerge, currentPageRange));
        }
        return result;
    }

    /** Creates the text item for one run: the concatenation of {@code parts}, tagged with {@code pageRange}. */
    private static StructuredContent.Text newMergedText(List<String> parts, StructuredContent.PageRange pageRange) {
        StructuredContent.Text merged = new StructuredContent.Text(String.join("", parts));
        merged.pageRange = pageRange;
        return merged;
    }

    /**
     * Wraps plain text in a document made of a single text item; no structure is inferred.
     *
     * @param txt the text to wrap, passed unchanged to the text item
     * @return a new document whose only child is the text item
     */
    public static StructuredContent runTxtStructuredExtraction(String txt) {
        StructuredContent.Document wrapper = new StructuredContent.Document();
        wrapper.addChild(new StructuredContent.Text(txt));
        return wrapper;
    }

    /**
     * Rewrites, in place, the tree rooted at {@code content} so that no section deeper than
     * {@code maxSectionDepth} remains: each child section whose level is greater than the limit is
     * replaced by a single text item holding the flattened text of its whole sub-tree.
     *
     * <p>Before its children are rewritten, a section whose level equals {@code maxSectionDepth}
     * stores a deep copy of itself in {@code collapsedSubTree}. Nothing happens for {@code null} or
     * non-section content, and a section with {@code null} content is left without children to rewrite.
     *
     * @param content node to process, may be {@code null}
     * @param maxSectionDepth deepest section level to keep as a section
     */
    @VisibleForTesting
    protected static void flattenContentDeeperThanMaxSectionDepth(StructuredContent content, int maxSectionDepth) {
        // instanceof is false for null, so this covers both the null and the non-section case.
        if (!(content instanceof StructuredContent.Section)) {
            return;
        }
        StructuredContent.Section section = (StructuredContent.Section) content;
        if (section.level != null && section.level == maxSectionDepth) {
            // Snapshot taken before the children are flattened below.
            section.collapsedSubTree = section.deepCopy();
        }
        if (section.content == null) {
            // Other methods of this class accept sections with null content; nothing to rewrite here.
            return;
        }
        ArrayList<StructuredContent> newChildren = new ArrayList<>(section.content.size());
        for (StructuredContent child : section.content) {
            if (child instanceof StructuredContent.Section) {
                StructuredContent.Section childSection = (StructuredContent.Section) child;
                if (childSection.level != null && childSection.level > maxSectionDepth) {
                    StringBuilder flattened = new StringBuilder();
                    StructuredContent.PageRange pageRange = flattenToText(child, flattened);
                    StructuredContent.Text text = new StructuredContent.Text(flattened.toString());
                    text.pageRange = pageRange;
                    newChildren.add(text);
                    continue;
                }
            }
            flattenContentDeeperThanMaxSectionDepth(child, maxSectionDepth);
            newChildren.add(child);
        }
        section.content = newChildren;
    }

    /**
     * Appends to {@code sb} the text of {@code structuredContent} and, for a section, of all its
     * descendants (depth-first), each followed by a newline.
     *
     * <p>A non-section item is skipped when its text is blank; the text of a section is always
     * appended. For a section, the page ranges returned by its children are folded into the section's
     * own {@code pageRange}, which is therefore modified: when the section has no range yet, the first
     * non-null child range object is adopted as-is (not copied).
     *
     * @param structuredContent node to flatten, may be {@code null}
     * @param sb buffer receiving the text
     * @return the page range of {@code structuredContent} after flattening, or {@code null} when the
     *         node is {@code null} or no page range is known
     */
    @VisibleForTesting
    private static StructuredContent.PageRange flattenToText(StructuredContent structuredContent, StringBuilder sb) {
        if (structuredContent == null) {
            return null;
        }
        String ownText = structuredContent.toText();
        if (!(structuredContent instanceof StructuredContent.Section)) {
            if (StringUtils.isNotBlank(ownText)) {
                sb.append(ownText).append("\n");
            }
            return structuredContent.pageRange;
        }
        sb.append(ownText).append("\n");
        if (structuredContent.content != null) {
            for (StructuredContent child : structuredContent.content) {
                StructuredContent.PageRange childRange = flattenToText(child, sb);
                if (childRange == null) {
                    // A child without page information contributes nothing; never hand null to mergeWith.
                    continue;
                }
                if (structuredContent.pageRange == null) {
                    structuredContent.pageRange = childRange;
                } else {
                    structuredContent.pageRange.mergeWith(childRange);
                }
            }
        }
        return structuredContent.pageRange;
    }

    @PartOfPublicAPI
    public static class StructuredExtractorSettings {
        public int maxSectionDepth = 6;
        public ImageHandlingMode imageHandlingMode = ImageHandlingMode.IGNORE;
        public OCRSettings ocrSettings;
        public VLMAnnotationSettings vlmAnnotationSettings;
        public String outputManagedFolderId;
        public List<AbstractSQLConnection.CustomDatabaseProperty> dkuProperties = new ArrayList<AbstractSQLConnection.CustomDatabaseProperty>();
        public boolean imageValidation = true;

        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("maxSectionDepth=").append(this.maxSectionDepth).append(", imageHandlingMode=").append(this.imageHandlingMode.name()).append(", outputManagedFolderId=").append(this.outputManagedFolderId).append(", imageValidation=").append(this.imageValidation);
            if (this.ocrSettings != null) {
                sb.append(", ocrEngine=").append(this.ocrSettings.ocrEngine.name()).append(", ocrLanguages=").append(this.ocrSettings.ocrLanguages);
            } else if (this.vlmAnnotationSettings != null) {
                sb.append(", vlmId=").append(this.vlmAnnotationSettings.llmId).append(", llmPrompt=").append(this.vlmAnnotationSettings.llmPrompt);
            }
            if (this.dkuProperties != null && !this.dkuProperties.isEmpty()) {
                sb.append(", recipeProperties={");
                sb.append(this.dkuProperties.stream().map(AbstractSQLConnection.CustomDatabaseProperty::toString).collect(Collectors.joining(", ")));
                sb.append("}");
            }
            return sb.toString();
        }
    }

    @PartOfPublicAPI
    public static enum ImageHandlingMode {
        OCR,
        VLM_ANNOTATE,
        IGNORE;

    }

    /** Settings of the VLM-based image annotation. */
    @PartOfPublicAPI
    public static class VLMAnnotationSettings {
        /** Identifier of the model to use; no default. */
        public String llmId;
        /** Prompt sent to the model; defaults to the default annotation prompt built without a token limit. */
        public String llmPrompt = getVlmDefaultAnnotationPrompt(null);
    }

    /** Settings of the OCR-based image handling. */
    @PartOfPublicAPI
    public static class OCRSettings {
        /** OCR engine to use; no default, so it is {@code null} until explicitly set. */
        public OCREngine ocrEngine;
        /** Languages handed to the OCR engine; defaults to {@code "en"}. */
        public String ocrLanguages = "en";

        /** Selectable OCR engines. */
        public enum OCREngine {
            EASYOCR,
            TESSERACT,
            AUTO
        }
    }

    /** Inputs of a structured extraction request. */
    @PartOfPublicAPI
    public static class StructuredExtractorInputs {
        /** Reference to the document to process; no default, so {@code null} until set. */
        public InputRefs.DocumentRef document;
    }

    /**
     * Outcome of a structured extraction: either the extracted content ({@code ok == true}) or an
     * error message ({@code ok == false}).
     */
    public static class StructuredExtractionResponseOrError {
        /** {@code true} for a successful extraction. */
        public boolean ok;
        /** Extracted content; set by {@link #fromSuccess(StructuredContent)}. */
        public StructuredContent content;
        /** Error description; set by {@link #fromError(Throwable)}. */
        public String errorMessage;

        /**
         * Creates a successful outcome.
         *
         * @param response the extracted content, stored as-is
         * @return an outcome with {@code ok == true}
         */
        public static StructuredExtractionResponseOrError fromSuccess(StructuredContent response) {
            StructuredExtractionResponseOrError success = new StructuredExtractionResponseOrError();
            success.ok = true;
            success.content = response;
            return success;
        }

        /**
         * Creates a failed outcome.
         *
         * @param e the failure; its message is computed by {@code ExceptionUtils.getMessageWithCauses}
         * @return an outcome with {@code ok == false} and no content
         */
        public static StructuredExtractionResponseOrError fromError(Throwable e) {
            StructuredExtractionResponseOrError failure = new StructuredExtractionResponseOrError();
            failure.ok = false;
            failure.errorMessage = ExceptionUtils.getMessageWithCauses(e);
            return failure;
        }

        /**
         * Converts this outcome to its DTO form.
         *
         * @param forExtractContentRecipe forwarded unchanged to {@code StructuredContent.toDTO}
         * @return a DTO with the same {@code ok} and {@code errorMessage}; its content is the converted
         *         content for a successful outcome, and stays {@code null} otherwise (including a
         *         successful outcome that carries no content)
         */
        public StructureExtractionDTOResponseOrError toDTO(boolean forExtractContentRecipe) {
            StructureExtractionDTOResponseOrError dto = new StructureExtractionDTOResponseOrError();
            dto.ok = this.ok;
            dto.errorMessage = this.errorMessage;
            // The fields are public: ok may be true while content was never set.
            if (this.ok && this.content != null) {
                dto.content = this.content.toDTO(forExtractContentRecipe);
            }
            return dto;
        }
    }

    /**
     * DTO counterpart of {@code StructuredExtractionResponseOrError}, as filled in by its
     * {@code toDTO} method.
     */
    public static class StructureExtractionDTOResponseOrError {
        /** Converted content; only assigned by {@code toDTO} when the outcome is a success. */
        public StructuredContentDTO content;
        /** Copied from the source outcome. */
        public boolean ok;
        /** Copied from the source outcome. */
        public String errorMessage;
    }

    /** A structured extraction request: the inputs to process and the settings to apply. */
    public static class StructuredExtractorRequest {
        /** Inputs of the request; initialised with an empty instance. */
        public StructuredExtractorInputs inputs = new StructuredExtractorInputs();
        /** Settings of the request; initialised with the default settings. */
        public StructuredExtractorSettings settings = new StructuredExtractorSettings();

        /** Creates a request with empty inputs and default settings. */
        public StructuredExtractorRequest() {
        }

        /**
         * Creates a request for the given document.
         *
         * @param document stored in {@code inputs.document}
         * @param structuredSettings replaces the default settings, even when {@code null}
         */
        public StructuredExtractorRequest(InputRefs.DocumentRef document, StructuredExtractorSettings structuredSettings) {
            settings = structuredSettings;
            inputs.document = document;
        }
    }
}

