/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.analysis.model.preprocessing;

import com.dataiku.dip.analysis.model.MLTask;
import com.dataiku.dip.analysis.model.preprocessing.FeaturePreprocessingParams;
import com.dataiku.dip.llm.LLMStructuredRef;
import com.dataiku.dip.utils.ErrorContext;
import java.util.Map;
import java.util.Optional;
import org.apache.commons.lang.StringUtils;

/**
 * Preprocessing parameters for a feature whose type is
 * {@link FeaturePreprocessingParams.FeatureType#TEXT} (set by the constructor).
 *
 * <p>Besides holding the settings, this class validates them ({@link #check}),
 * reports whether custom handling code is in use ({@link #usesCustomHandling}),
 * and exposes / rewrites the connection referenced by the sentence embedding
 * model id when that id is a structured LLM reference
 * ({@link #getUsedConnection}, {@link #replaceConnections}).
 *
 * <p>NOTE(review): most tuning fields below are only declared here and never
 * read in this class; their precise semantics are defined by whichever
 * component consumes these params. Field names are kept as-is because they are
 * public and presumably part of a serialized format -- confirm before renaming.
 */
public class TextFeaturePreprocessingParams
extends FeaturePreprocessingParams {
    /** Selected text handling method; must be non-null for INPUT features (see {@link #check}). */
    public TextHandlingMethod text_handling;
    public float minRowsRatio = 0.001f;
    public float maxRowsRatio = 0.8f;
    public int maxWords;
    public int ngramMinSize = 1;
    public int ngramMaxSize = 1;
    public int hashSize = 200000;
    public int hashSVDSVDLimit = 50000;
    public int hashSVDSVDComponents = 100;
    public StopWordsMode stopWordsMode = StopWordsMode.NONE;
    public String customStopWords;
    /**
     * Id of the sentence embedding model. Required (non-blank) when
     * {@link #text_handling} is {@link TextHandlingMethod#SENTENCE_EMBEDDING}.
     * When {@link #isStructuredRef} is true, it is decoded with
     * {@link LLMStructuredRef#decodeId}.
     */
    public String sentenceEmbeddingModel;
    public int maxSequenceLength = 128;
    public int sentenceEmbeddingBatchSize = 32;
    public boolean useCustomVectorizer;
    public String customVectorizerCode;
    public Integer embeddingSize;
    /** Whether {@link #sentenceEmbeddingModel} holds an encoded {@link LLMStructuredRef} id. */
    public boolean isStructuredRef = false;

    /** Creates text preprocessing params, tagging the feature type as TEXT. */
    public TextFeaturePreprocessingParams() {
        this.type = FeaturePreprocessingParams.FeatureType.TEXT;
    }

    /**
     * Validates these params for the given feature. Only features whose role is
     * INPUT are validated; any other role returns immediately.
     *
     * @param featureName name of the feature, used in error messages
     * @param task the ML task being checked (not used by this implementation)
     * @throws Exception if no text handling method is set, or if sentence
     *         embedding is selected and {@code ErrorContext.checkNotEmptyOrBlank}
     *         rejects the model id
     */
    @Override
    public void check(String featureName, MLTask task) throws Exception {
        if (this.role != FeaturePreprocessingParams.Role.INPUT) {
            return;
        }
        if (this.text_handling == null) {
            // The (Object) cast is kept on purpose so overload resolution of iaef stays unchanged.
            throw ErrorContext.iaef("Feature '%s' has no text handling method", (Object)featureName, new Object[0]);
        }
        if (this.text_handling == TextHandlingMethod.SENTENCE_EMBEDDING) {
            ErrorContext.checkNotEmptyOrBlank(this.sentenceEmbeddingModel, " text embedding model id. (Please select a model for Feature '" + featureName + "' preprocessing)");
        }
    }

    /**
     * Tells whether this feature relies on custom handling code.
     *
     * @return {@code true} only when the role is INPUT, the handling method is
     *         {@link TextHandlingMethod#CUSTOM} and {@code customHandlingCode}
     *         (not declared in this class, presumably inherited) is not blank
     */
    @Override
    public boolean usesCustomHandling() {
        if (this.role != FeaturePreprocessingParams.Role.INPUT) {
            return false;
        }
        return this.text_handling == TextHandlingMethod.CUSTOM && !StringUtils.isBlank(this.customHandlingCode);
    }

    /**
     * Returns the connection referenced by the sentence embedding model id.
     *
     * @return the connection of the decoded structured ref, or an empty
     *         {@code Optional} when the model id is not a structured ref, is
     *         blank, or the decoded ref carries no connection
     */
    @Override
    public Optional<String> getUsedConnection() {
        LLMStructuredRef ref = this.decodeStructuredRefOrNull();
        if (ref == null) {
            return Optional.empty();
        }
        // ofNullable: a ref without a connection must yield "no connection", not an NPE.
        return Optional.ofNullable(ref.connection);
    }

    /**
     * Rewrites the connection embedded in the sentence embedding model id, if
     * it appears as a key of the given replacement map.
     *
     * @param connectionsReplacements map from old connection name to new
     *        connection name; {@code null} is treated as "nothing to replace"
     * @return {@code true} if {@link #sentenceEmbeddingModel} was re-encoded
     *         with a replacement connection, {@code false} otherwise
     */
    @Override
    public boolean replaceConnections(Map<String, String> connectionsReplacements) {
        if (connectionsReplacements == null) {
            return false;
        }
        LLMStructuredRef oldRef = this.decodeStructuredRefOrNull();
        if (oldRef == null || !connectionsReplacements.containsKey(oldRef.connection)) {
            return false;
        }
        String newConnection = connectionsReplacements.get(oldRef.connection);
        this.sentenceEmbeddingModel = oldRef.withOtherConnection(newConnection).encodeToId();
        return true;
    }

    /**
     * Decodes {@link #sentenceEmbeddingModel} as a structured LLM reference.
     *
     * @return the decoded ref, or {@code null} when {@link #isStructuredRef} is
     *         false or the model id is null/blank (nothing to decode)
     */
    private LLMStructuredRef decodeStructuredRefOrNull() {
        if (!this.isStructuredRef || StringUtils.isBlank(this.sentenceEmbeddingModel)) {
            return null;
        }
        return LLMStructuredRef.decodeId(this.sentenceEmbeddingModel);
    }

    /**
     * Stop words selection: {@link #NONE}, {@link #CUSTOM}, or a named
     * language list. NOTE(review): presumably CUSTOM pairs with
     * {@code customStopWords}; this file does not show where it is consumed.
     */
    public static enum StopWordsMode {
        NONE,
        CUSTOM,
        AFRIKAANS,
        ALBANIAN,
        ARABIC,
        ARMENIAN,
        BASQUE,
        BENGALI,
        BULGARIAN,
        CATALAN,
        CHINESE,
        CHINESE_TRADITIONAL,
        CROATIAN,
        CZECH,
        DANISH,
        DUTCH,
        ENGLISH,
        ENGLISH_2021,
        ESTONIAN,
        FINNISH,
        FRENCH,
        FRENCH_2021,
        GERMAN,
        GREEK,
        GUJARATI,
        HEBREW,
        HINDI,
        HUNGARIAN,
        ICELANDIC,
        INDONESIAN,
        IRISH,
        ITALIAN,
        JAPANESE,
        KANNADA,
        KOREAN,
        LATVIAN,
        LITHUANIAN,
        LUXEMBOURGISH,
        MACEDONIAN,
        MALAYALAM,
        MARATHI,
        NEPALI,
        NORWEGIAN,
        PERSIAN,
        POLISH,
        PORTUGUESE,
        ROMANIAN,
        RUSSIAN,
        SANSKRIT,
        SERBIAN,
        SINHALA,
        SLOVAK,
        SLOVENIAN,
        SPANISH,
        SWEDISH,
        TAGALOG,
        TAMIL,
        TATAR,
        TELUGU,
        THAI,
        TURKISH,
        UKRAINIAN,
        URDU,
        VIETNAMESE,
        YORUBA;

    }

    /** Available ways of turning a text feature into model inputs. */
    public static enum TextHandlingMethod {
        TOKENIZE_HASHING,
        TOKENIZE_HASHING_SVD,
        TOKENIZE_COUNTS,
        TOKENIZE_TFIDF,
        SENTENCE_EMBEDDING,
        CUSTOM;

    }
}

