/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.processors.expr;

import com.dataiku.dip.ApplicationConfigurator;
import com.dataiku.dip.DKUApp;
import com.dataiku.dip.datalayer.Column;
import com.dataiku.dip.datalayer.Processor;
import com.dataiku.dip.datalayer.Row;
import com.dataiku.dip.datalayer.SingleInputSingleOutputRowProcessor;
import com.dataiku.dip.datalineage.DatasetPairLineage;
import com.dataiku.dip.datalineage.RecipeLineage;
import com.dataiku.dip.shaker.model.ProcessorScriptStep;
import com.dataiku.dip.shaker.model.StepParams;
import com.dataiku.dip.shaker.processors.Category;
import com.dataiku.dip.shaker.processors.ProcessorMeta;
import com.dataiku.dip.shaker.processors.ProcessorTag;
import com.dataiku.dip.shaker.processors.expr.TextSimplifier;
import com.dataiku.dip.shaker.processors.expr.TokenizedText;
import com.dataiku.dip.shaker.server.ProcessorDesc;
import com.dataiku.dip.utils.JSON;
import com.dataiku.dip.utils.Pair;
import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.json.JSONArray;

public class NGramExtract
extends SingleInputSingleOutputRowProcessor {
    public static final ProcessorMeta<NGramExtract, Parameter> META = new ProcessorMeta<NGramExtract, Parameter>(){

        /**
         * Gives the name of this processor; {@link #describe(String)} passes the same
         * value to the {@link ProcessorDesc} it builds.
         *
         * @return the constant {@code "NGramExtract"}
         */
        @Override
        public String getName() {
            final String processorName = "NGramExtract";
            return processorName;
        }

        /**
         * Gives the documentation page key associated with this processor.
         *
         * @return the constant {@code "extract-ngrams"}
         */
        @Override
        public String getDocPage() {
            final String docPage = "extract-ngrams";
            return docPage;
        }

        /**
         * Gives the category this processor is filed under.
         *
         * @return {@link Category#TRANSFORMATION}
         */
        @Override
        public Category getCategory() {
            final Category category = Category.TRANSFORMATION;
            return category;
        }

        /**
         * Gives the tags attached to this processor.
         *
         * @return a new mutable set containing only {@link ProcessorTag#NLP}
         */
        @Override
        public Set<ProcessorTag> getTags() {
            // Pass the enum constant directly: wrapping it in an Object[] (a decompiler
            // artifact) makes the element type Object, which cannot be returned as
            // Set<ProcessorTag>.
            return Sets.newHashSet(ProcessorTag.NLP);
        }

        @Override
        public String getHelp(String language) {
            return this.translate(language, "SHAKER.PROCESSOR.NGramExtract.HELP", "This processor extracts sequences of words, called _ngrams_, from a text column.\n\n# What are ngrams ?\n\nFor example, for text 'the quick brown fox jumps', the ngrams are:\n\n* ngrams of size 2 (also called 2-grams) : the quick, quick brown, brown fox, fox jumps\n* ngrams of size 3 (also called 3-grams): the quick brown, quick brown fox, brown fox jumps\n\n# Example use case\n\nYou want to perform statistics on the sequence of words used in a query log.\n\n# Output\n\nThe NGram extractor offers several output modes:\n\n* Convert to JSON: A JSON array containing the ngrams is generated, either in the input column or in another column.\nThis mode is most useful if you intend to perform some custom processing and need to retain the structure of the original text.\n* One ngram per row: in this mode, for each ngram, a new row is generated. The row contains a copy of all other columns in the original row.\nThis mode is most useful if you intend to group by ngram afterwards.\n* One ngram per column: in this mode, a new column is generated for each ngram. For example, if a column contains 4 words, you ask for 2-grams, and you use 'out_' as prefix, columns 'out_0', 'out_1' and 'out_2' will be generated.\n\n# Simplification\n\nVery often, you'll want to simplify the text to remove some variance in your text corpus.\nThis processor offers several possible simplifications on the text before extracting ngrams.\n\n" + TextSimplifier.getHelp() + "\n\nNote: it is strongly advised to clear stop words before extracting ngrams\n\n# Advanced options\n\n* Split on sentence boundaries: Generally, you don't want to compute cross-sentence ngrams. For example, with text 'The rain falls. 
The sun shines', you don't want to generate 'falls the' as a ngram.\n* Compute skip-grams : In our sample sentence, the skip-grams would be: the brown, the fox, the jumps, quick fox, quick jumps, ...\nEnabling skip-grams computation dramatically increases output size and computation requirements.\n");
        }

        /**
         * Gives the class holding the parameters of this step.
         *
         * @return {@code Parameter.class}
         */
        @Override
        public Class<Parameter> stepParamClass() {
            final Class<Parameter> paramClass = Parameter.class;
            return paramClass;
        }

        @Override
        public ProcessorDesc describe(String language) {
            ProcessorDesc pd = new ProcessorDesc(this.getName(), this.translate(language, "SHAKER.PROCESSOR.NGramExtract.DESCRIPTION", 1.actionVerb("Extract") + " ngrams"), false).withMNEColParam("inCol", this.translate(language, "SHAKER.PROCESSOR.NGramExtract.DESCRIPTION.IN_COL", "Column")).withParam("outCol", "string", false, true, this.translate(language, "SHAKER.PROCESSORS.DESCRIPTION.OUTPUT_COLUMN_EMPTY_FOR_INPLACE", "Output column (empty for in-place)")).withBool("skip", this.translate(language, "SHAKER.PROCESSOR.NGramExtract.DESCRIPTION.SKIP", "Also compute skip ngrams (non adjacent terms)")).withBool("sentenceSplit", this.translate(language, "SHAKER.PROCESSOR.NGramExtract.DESCRIPTION.SENTENCE_SPLIT", "Compute ngrams on blocks separated by characters .,:; and new lines")).withBoundedMandInt("nmin", this.translate(language, "SHAKER.PROCESSOR.NGramExtract.DESCRIPTION.NMIN", "Smallest number of terms in ngrams."), 2, 10).withBoundedMandInt("nmax", this.translate(language, "SHAKER.PROCESSOR.NGramExtract.DESCRIPTION.NMAX", "Largest number of terms in ngrams."), 2, 10);
            TextSimplifier.withParams(language, pd, true);
            return pd;
        }

        /**
         * Produces the self-report of a step: the result of
         * {@code JSON.deepCopyExcept} on the parameters, with the {@code inCol} and
         * {@code outCol} names passed as the fields to leave out.
         *
         * @param parameter parameters of the step being reported
         * @return whatever {@code JSON.deepCopyExcept} returns for these arguments
         */
        @Override
        public Object selfReport(Parameter parameter) {
            final Object source = parameter;
            final String[] excludedFields = new String[]{"inCol", "outCol"};
            return JSON.deepCopyExcept(source, excludedFields);
        }

        /**
         * Creates the processor for the given step parameters.
         *
         * @param params parameters handed to the {@link NGramExtract} constructor
         * @return a new processor instance
         * @throws Exception if the constructor fails
         */
        @Override
        public NGramExtract build(Parameter params) throws Exception {
            final NGramExtract processor = new NGramExtract(params);
            return processor;
        }

        /**
         * Computes the recipe lineage after this step from the lineage before it.
         *
         * <p>Every dataset pair of the previous lineage is copied. For the
         * {@code TO_JSON} and {@code FOLD} operations with a non-blank output column,
         * existing relations on the output column are removed and relations from the
         * input column to the output column are added; with a blank output column the
         * copy is left as is. For {@code SPLIT} the resulting lineage is flagged as
         * uncertain.
         *
         * @param pss step whose {@code params} must be a {@link Parameter}
         * @param previousRecipeLineage lineage before this step
         * @return a new lineage holding the updated dataset pairs
         * @throws IllegalArgumentException if {@code pss.params} is null or not a {@link Parameter}
         */
        @Override
        public RecipeLineage getUpdatedRecipeLineage(ProcessorScriptStep pss, RecipeLineage previousRecipeLineage) {
            if (!(pss.params instanceof Parameter)) {
                // instanceof is also false for null, so guard the getClass() call to
                // report the problem instead of failing with a NullPointerException.
                String actualType = pss.params == null ? "null" : pss.params.getClass().getSimpleName();
                throw new IllegalArgumentException("Unsupported param type: " + actualType);
            }
            Parameter ngramExtractorParam = (Parameter)pss.params;
            RecipeLineage updatedRecipeLineage = new RecipeLineage();
            previousRecipeLineage.getDatasetPairLineages().forEach((datasetPair, previousDatasetPairLineage) -> {
                DatasetPairLineage updatedDatasetPairLineage = new DatasetPairLineage((DatasetPairLineage)previousDatasetPairLineage);
                switch (ngramExtractorParam.operation) {
                    case TO_JSON:
                    case FOLD: {
                        // A blank output column means in-place: the relations are kept unchanged.
                        if (StringUtils.isNotBlank(ngramExtractorParam.outCol)) {
                            updatedDatasetPairLineage.removeRelationsOnColumn(ngramExtractorParam.outCol);
                            updatedDatasetPairLineage.addFactorizedColumnRelations(ngramExtractorParam.inCol, ngramExtractorParam.outCol);
                        }
                        break;
                    }
                    case SPLIT: {
                        updatedRecipeLineage.setUncertain(true);
                        break;
                    }
                }
                updatedRecipeLineage.setDatasetPairLineage((Pair<String, String>)datasetPair, updatedDatasetPairLineage);
            });
            return updatedRecipeLineage;
        }
    };
    // Step parameters received by the constructor.
    Parameter params;
    // Input column, resolved in init() from params.inCol.
    Column inCD;
    // Output column, resolved in init() for the TO_JSON and FOLD operations only.
    Column outCD;
    // Column following the input column, looked up in init() for the SPLIT operation; may be null.
    Column nextColCD;
    // Name of nextColCD, or null when there is no such column.
    String nextColName;
    // Simplifier built from the step parameters and applied to each tokenized text in processRow().
    TextSimplifier textSimplifier;
    // Upper bound on ngrams collected per row; values <= 0 disable the check (see extract/extractRec).
    private final int ngramLimit;

    public NGramExtract(Parameter params) throws Exception {
        this.params = params;
        this.textSimplifier = new TextSimplifier(params);
        this.ngramLimit = !DKUApp.isConfigured() || DKUApp.getRunsWithoutConfig() || ApplicationConfigurator.isInSparkDriver() ? 0 : DKUApp.getParams().getIntParam("dku.processors.ngramExtract.ngramLimit", Integer.valueOf(50000));
    }

    /**
     * Appends every run of adjacent tokens of length {@code nmin} to {@code nmax}
     * to {@code ngrams}, shortest lengths first and, within one length, from left
     * to right.
     *
     * @param tokens tokenized text to read from
     * @param nmin smallest ngram length
     * @param nmax largest ngram length
     * @param ngrams list receiving the ngrams
     * @throws TooManyNgramsException if the limit is positive and adding the ngrams
     *         of some length would bring the list size to the limit or beyond; the
     *         check is made before any ngram of that length is added
     */
    public void extract(TokenizedText tokens, int nmin, int nmax, ArrayList<TokenizedText> ngrams) throws TooManyNgramsException {
        final boolean limited = this.ngramLimit > 0;
        int length = nmin;
        while (length <= nmax) {
            // Last start offset at which a full ngram of this length still fits.
            final int lastStart = tokens.size() - length;
            if (limited && ngrams.size() + lastStart + 1 >= this.ngramLimit) {
                throw new TooManyNgramsException(this.ngramLimit);
            }
            for (int start = 0; start <= lastStart; start++) {
                ngrams.add(tokens.getTokens(start, start + length));
            }
            length++;
        }
    }

    /**
     * Appends the skip-grams of every length from {@code nmin} to {@code nmax} to
     * {@code ngrams} by calling {@code extractRec} once per length with an empty prefix.
     *
     * @param tokens tokenized text to read from
     * @param nmin smallest ngram length
     * @param nmax largest ngram length
     * @param ngrams list receiving the ngrams
     * @throws TooManyNgramsException propagated from {@code extractRec}
     */
    public void extractWithSkip(TokenizedText tokens, int nmin, int nmax, ArrayList<TokenizedText> ngrams) throws TooManyNgramsException {
        int length = nmin;
        while (length <= nmax) {
            final TokenizedText emptyPrefix = new TokenizedText();
            this.extractRec(tokens, length, emptyPrefix, ngrams);
            length++;
        }
    }

    /**
     * Appends to {@code ngrams} every ngram made of {@code prefix} followed by
     * {@code n} tokens picked from {@code tokens} in their original order, not
     * necessarily adjacent.
     *
     * @param tokens tokens still available for selection
     * @param n number of tokens still to pick; values below 1 produce nothing
     * @param prefix tokens already picked; it is copied, never modified
     * @param ngrams list receiving the ngrams
     * @throws TooManyNgramsException if the limit is positive and completing a
     *         prefix would bring the list size to the limit or beyond
     */
    private void extractRec(TokenizedText tokens, int n, TokenizedText prefix, ArrayList<TokenizedText> ngrams) throws TooManyNgramsException {
        this.extractSkipGrams(tokens, 0, n, prefix, ngrams);
    }

    /**
     * Index-based worker for {@link #extractRec}: same contract, but the available
     * tokens are {@code tokens[start..]} instead of a copied tail.
     *
     * <p>Recursion goes one level per picked token, so its depth is at most
     * {@code n} rather than the number of tokens, and no sub-list is allocated.
     * Candidate first tokens are visited from right to left and completions from
     * left to right, which is the order the tail-recursive formulation produced
     * ("skip the first token" branch before "take the first token" branch).
     */
    private void extractSkipGrams(TokenizedText tokens, int start, int n, TokenizedText prefix, ArrayList<TokenizedText> ngrams) throws TooManyNgramsException {
        final int size = tokens.size();
        final int remaining = size - start;
        if (remaining <= 0 || n < 1) {
            return;
        }
        if (n == 1) {
            if (this.ngramLimit > 0 && ngrams.size() + remaining >= this.ngramLimit) {
                throw new TooManyNgramsException(this.ngramLimit);
            }
            for (int i = start; i < size; ++i) {
                TokenizedText ngram = new TokenizedText(prefix);
                ngram.add(tokens.get(i));
                ngrams.add(ngram);
            }
            return;
        }
        // A first token beyond size - n cannot be completed with n - 1 later tokens,
        // so those positions are skipped; they never contributed any ngram.
        for (int first = size - n; first >= start; --first) {
            TokenizedText newPrefix = new TokenizedText(prefix);
            newPrefix.add(tokens.get(first));
            this.extractSkipGrams(tokens, first + 1, n - 1, newPrefix, ngrams);
        }
    }

    /**
     * Resolves the columns used by {@code processRow}.
     *
     * <p>The input column is always looked up. For {@code SPLIT}, the column after
     * the input column and its name are recorded (both null when there is none).
     * For {@code TO_JSON} and {@code FOLD}, the output column is the input column
     * itself when {@code outCol} is blank, otherwise a column named {@code outCol}
     * requested after the input column. Any other operation value leaves the
     * output fields untouched.
     */
    public void init() {
        final TextSimplifier.OperationType mode = this.params.operation;
        final String inputName = this.params.inCol;
        this.inCD = this.getCf().column(inputName, Processor.ProcessorRole.INPUT_COLUMN);

        if (mode == TextSimplifier.OperationType.SPLIT) {
            this.nextColCD = this.getCf().getColumnAfter(inputName);
            if (this.nextColCD == null) {
                this.nextColName = null;
            } else {
                this.nextColName = this.nextColCD.getName();
            }
            return;
        }
        if (mode != TextSimplifier.OperationType.TO_JSON && mode != TextSimplifier.OperationType.FOLD) {
            return;
        }
        if (StringUtils.isBlank(this.params.outCol)) {
            this.outCD = this.getCf().column(inputName, Processor.ProcessorRole.OUTPUT_COLUMN);
        } else {
            this.outCD = this.getCf().columnAfter(inputName, this.params.outCol, Processor.ProcessorRole.OUTPUT_COLUMN);
        }
    }

    /** Clause delimiters used when sentence splitting is enabled; compiled once instead of once per row. */
    private static final Pattern CLAUSE_DELIMITERS = Pattern.compile("[\n\r\u0085\u2028\u2029,.:;()?!]");

    /**
     * Extracts the ngrams of the input cell of {@code row} and emits the result
     * according to the configured operation.
     *
     * <p>The cell is optionally cut into clauses, each clause is simplified, and its
     * ngrams (or skip-grams when {@code skip} is set) are collected into one list.
     * {@code TO_JSON} writes them as a JSON array, {@code FOLD} emits one row per
     * ngram and {@code SPLIT} writes one column per ngram. For any other operation
     * value nothing is emitted.
     *
     * @param row row to process
     * @throws Exception on extraction or emission failure, including
     *         {@link TooManyNgramsException} when the per-row limit is reached
     */
    public void processRow(Row row) throws Exception {
        String cell = row.get(this.inCD);
        // A null cell is treated as empty text in both modes; previously only the
        // sentence-splitting branch guarded against null.
        String text = cell == null ? "" : cell;

        ArrayList<TokenizedText> ngrams = new ArrayList<TokenizedText>();
        for (TokenizedText clause : this.tokenizeClauses(text)) {
            this.textSimplifier.simplify(clause);
            if (this.params.skip) {
                this.extractWithSkip(clause, this.params.nmin, this.params.nmax, ngrams);
            } else {
                this.extract(clause, this.params.nmin, this.params.nmax, ngrams);
            }
        }

        TextSimplifier.OperationType mode = this.params.operation;
        if (mode == TextSimplifier.OperationType.TO_JSON) {
            this.emitAsJson(row, ngrams);
        } else if (mode == TextSimplifier.OperationType.FOLD) {
            this.emitFolded(row, ngrams);
        } else if (mode == TextSimplifier.OperationType.SPLIT) {
            this.emitSplit(row, ngrams);
        }
    }

    /** Tokenizes the text as a whole, or clause by clause when sentence splitting is enabled. */
    private ArrayList<TokenizedText> tokenizeClauses(String text) {
        ArrayList<TokenizedText> clauses = new ArrayList<TokenizedText>();
        if (this.params.sentenceSplit) {
            for (String clause : CLAUSE_DELIMITERS.split(text)) {
                clauses.add(new TokenizedText(clause));
            }
        } else {
            clauses.add(new TokenizedText(text));
        }
        return clauses;
    }

    /** TO_JSON: stores all ngrams as one JSON array in the output column and emits the row. */
    private void emitAsJson(Row row, ArrayList<TokenizedText> ngrams) throws Exception {
        JSONArray jsonArray = new JSONArray();
        for (TokenizedText ngram : ngrams) {
            jsonArray.put((Object)ngram.toJSONArray());
        }
        row.put(this.outCD, jsonArray.toString());
        this.getProcessorOutput().emitRow(row);
    }

    /** FOLD: emits one copy of the row per ngram, or the row itself with an empty output cell when there is none. */
    private void emitFolded(Row row, ArrayList<TokenizedText> ngrams) throws Exception {
        if (ngrams.isEmpty()) {
            row.put(this.outCD, "");
            this.getProcessorOutput().emitRow(row);
            return;
        }
        for (TokenizedText ngram : ngrams) {
            Row copy = this.getRf().row();
            for (Column column : this.getCf().columns()) {
                String value = row.get(column);
                // Blank cells are not carried over to the copy.
                if (StringUtils.isBlank(value)) {
                    continue;
                }
                copy.put(column, value);
            }
            copy.put(this.outCD, ngram.toString());
            this.getProcessorOutput().emitRow(copy);
        }
    }

    /** SPLIT: writes ngram i to column prefix + i, requested before the column that followed the input column, and emits the row. */
    private void emitSplit(Row row, ArrayList<TokenizedText> ngrams) throws Exception {
        // The prefix does not depend on the ngram, so it is computed once per row.
        String columnPrefix = this.params.prefix == null ? this.params.inCol + "_" : this.params.prefix;
        for (int i = 0; i < ngrams.size(); ++i) {
            row.put(this.getCf().columnBefore(this.nextColName, columnPrefix + i, Processor.ProcessorRole.OUTPUT_COLUMN), ngrams.get(i).toString());
        }
        this.getProcessorOutput().emitRow(row);
    }

    /**
     * Signals the processor output that no further row will be emitted.
     *
     * @throws Exception if the output fails to handle the notification
     */
    public void postProcess() throws Exception {
        getProcessorOutput().lastRowEmitted();
    }

    /**
     * Parameters of the ngram extraction step, on top of those of
     * {@link TextSimplifier.Parameter}.
     */
    public static class Parameter
    extends TextSimplifier.Parameter
    implements StepParams {
        private static final long serialVersionUID = -1L;
        // Name of the column to read the text from.
        public String inCol;
        // Name of the output column; blank means the input column is used (TO_JSON and FOLD operations).
        public String outCol;
        // Smallest ngram length, 2 by default.
        public int nmin = 2;
        // Largest ngram length, 2 by default.
        public int nmax = 2;
        // When true, skip-grams (non adjacent terms) are computed instead of adjacent ngrams only.
        boolean skip;
        // When true (the default), the text is cut on clause delimiters before extraction.
        boolean sentenceSplit = true;

        /**
         * Creates parameters with {@code normalize} and {@code clearStopWords}
         * switched on (fields presumably inherited from {@link TextSimplifier.Parameter}).
         */
        public Parameter() {
            this.normalize = true;
            this.clearStopWords = true;
        }

        /**
         * Performs no check: every parameter combination is accepted here.
         */
        @Override
        public void validate() throws IllegalArgumentException {
        }
    }

    /**
     * Raised when the number of ngrams collected for one row reaches the
     * configured limit.
     */
    static class TooManyNgramsException
    extends Exception {
        /**
         * @param limit the limit that was reached, reported in the message
         */
        TooManyNgramsException(int limit) {
            super(TooManyNgramsException.describe(limit));
        }

        /** Formats the exception message for the given limit. */
        private static String describe(int limit) {
            return String.format("Too many ngrams generated (>= %d). Processing aborted.", Integer.valueOf(limit));
        }
    }
}

