/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.processors.join;

import com.dataiku.dip.DKUApp;
import com.dataiku.dip.DatasetDependency;
import com.dataiku.dip.ProcessorWithSingleCopyAdditionalInputs;
import com.dataiku.dip.SingleCopyAdditionalInputsLoader;
import com.dataiku.dip.datalayer.Column;
import com.dataiku.dip.datalayer.ColumnFactory;
import com.dataiku.dip.datalayer.Processor;
import com.dataiku.dip.datalayer.Row;
import com.dataiku.dip.datalayer.RowInputStream;
import com.dataiku.dip.datalayer.SingleRowProcessor;
import com.dataiku.dip.datalayer.streamimpl.StreamRow;
import com.dataiku.dip.shaker.model.StepParams;
import com.dataiku.dip.shaker.processors.Category;
import com.dataiku.dip.shaker.processors.ProcessorMeta;
import com.dataiku.dip.shaker.processors.ProcessorTag;
import com.dataiku.dip.shaker.processors.expr.TextSimplifier;
import com.dataiku.dip.shaker.processors.join.AutomatonSearchEngine;
import com.dataiku.dip.shaker.processors.join.BasicSimplifySearchEngine;
import com.dataiku.dip.shaker.processors.join.BlockSearchEngine;
import com.dataiku.dip.shaker.processors.join.FuzzySearchEngine;
import com.dataiku.dip.shaker.processors.join.RawLevenshteinSearchEngine;
import com.dataiku.dip.shaker.server.AdditionalInputAccessor;
import com.dataiku.dip.shaker.server.ProcessorDesc;
import com.dataiku.dip.util.ParamDesc;
import com.dataiku.dip.utils.DKULogger;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.gson.JsonObject;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import org.apache.commons.lang.StringUtils;

public class MemoryEquiJoiner
extends SingleRowProcessor
implements ProcessorWithSingleCopyAdditionalInputs<RightDataLoader>,
Processor {
    public static final ProcessorMeta<MemoryEquiJoiner, Parameter> META = new ProcessorMeta<MemoryEquiJoiner, Parameter>(){

        @Override
        public String getName() {
            return "MemoryEquiJoiner";
        }

        @Override
        public String getDocPage() {
            return "join";
        }

        @Override
        public Category getCategory() {
            return Category.MISC;
        }

        @Override
        public Set<ProcessorTag> getTags() {
            return Sets.newHashSet();
        }

        @Override
        public Class<Parameter> stepParamClass() {
            return Parameter.class;
        }

        @Override
        public String getHelp(String language) {
            return this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.HELP", "This processor performs a left join with another (small) dataset.\n\n# Example use case\n\nYou are processing a dataset of events. The events contain a reference to a product id. You have another dataset which contains details about the products, and you want to retrieve the product details for each event.\n# Limitations\n\nThe 'other' dataset must be small (a few thousand records maximum). Prefer using a Join recipe\n\n# Behaviour details\nThe processor performs a deduplicated left join:\n\n* If no rows in the 'other' dataset match, joined columns are left empty\n* If multiple rows match in the 'other' dataset, the 'last' one is selected (but ordering is not guaranteed)\n\n# Parameters\nThe processor needs the following parameters:\n\n* Column containing the join key in the current dataset (which may have been generated by a previous step)\n* Name of the dataset to join with. Note that the dataset to join with must be in the same project.\n* Column containing the join key in the joined dataset.\n* Columns from the joined dataset that should be copied to the local dataset, for the matched row.\n\n# Output\nThe processor outputs selected columns from the joined dataset. For each row of the current dataset, the columns will contain the data from the matching row in the joined dataset.\n\nIf no row matched in the joined dataset, the output columns will be left empty.\n\n");
        }

        @Override
        public ProcessorDesc describe(String language) {
            ParamDesc columnsParam = new ParamDesc("copyColumns", "columns").withDefaultValue(new ArrayList());
            ProcessorDesc pd = new ProcessorDesc(this.getName(), this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION", 1.actionVerb("Join") + " with other dataset (memory-based)"), false).withMNEColParam("leftCol", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.LEFT_COL", "Join column (here)")).withMNESParam("rightInput", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.RIGHT_INPUT", "Dataset to join with")).withMNESParam("rightCol", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.RIGHT_COL", "Join column (in other dataset)")).withParam(columnsParam).withParam(ParamDesc.string("copyPrefix", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.COPY_PREFIX", "Prefix column names"), this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.COPY_PREFIX.TOOLTIP", "Prefix all copied column names with this"), "join_").withMandatory(false)).withBool("fuzzy", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.FUZZY_WITH_LEVENSTEIN", "Fuzzy join within a Levenstein Automaton")).deprecate().withReplacementDocLink("other_recipes/join").withReplacementName("Join recipe");
            TextSimplifier.withParams(language, pd, false);
            return pd;
        }

        @Override
        public Object selfReport(Parameter parameter) {
            JsonObject out = new JsonObject();
            out.addProperty("maxLevenshtein", (Number)parameter.maxLevenshtein);
            out.addProperty("fuzzy", Boolean.valueOf(parameter.fuzzy));
            out.addProperty("numCopyColumns", (Number)parameter.copyColumns.size());
            return out;
        }

        @Override
        public MemoryEquiJoiner build(Parameter param) throws Exception {
            return new MemoryEquiJoiner(param);
        }
    };
    public static final ProcessorMeta<MemoryEquiJoiner, Parameter> META_FUZZY = new ProcessorMeta<MemoryEquiJoiner, Parameter>(){

        @Override
        public String getName() {
            return "MemoryEquiJoinerFuzzy";
        }

        @Override
        public String getDocPage() {
            return "fuzzy-join";
        }

        @Override
        public Category getCategory() {
            return Category.MISC;
        }

        @Override
        public Set<ProcessorTag> getTags() {
            return Sets.newHashSet();
        }

        @Override
        public Class<Parameter> stepParamClass() {
            return Parameter.class;
        }

        @Override
        public String getHelp(String language) {
            return this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoinerFuzzy.HELP", "This processor performs a fuzzy left join with another (small) dataset.\n\n'Fuzzy' means that the join can match even if the two strings being matched are not exactly equal, but close.\n\n# Example use case\nYou are processing a dataset of search queries. Many queries target the name of a product, but with lots of variations and typos. You also have a dataset with all your products, and you want to add some product details info to each query, when we can identify which product it is about.\n\nFuzzy join can help you find the correct product, even when the product name is not exact.\n# Limitations\n\nThe 'other' dataset must be small (a few thousand records maximum). Prefer using a Fuzzy-join recipe\n\n# Behaviour details\nThe processor performs a deduplicated left join:\n\n* If no rows in the 'other' dataset match, joined columns are left empty\n* If multiple rows match in the 'other' dataset, the 'closest' one in terms of edit distanceis selected\n\n# Requirements and limitations\n\nThe 'other' dataset must fit in RAM. A good limit would be that it should not be more than ~500 000 rows. If this is not the case, you should use a recipe to join the datasets (for example, a Pig, Hive, Python or SQL recipe).\n\nBoth the dataset being processed and the 'other' dataset must contain a column containing the join key.\n# Fuzziness and simplification\nThe processor performs a fuzzy search by computing the 'distance' between two string (roughly speaking, the number of differing characters between them). In order to increase the recall (ie, the number of times we find a match), it is generally recommended to first 'simplify' the text in both datasets, to remove some variance. 
This processor has built-in simplification options.\n\n" + TextSimplifier.getHelp() + "# Parameters\nThe processor needs the following parameters:\n\n* Column containing the join key in the current dataset (which may have been generated by a previous step)\n* Name of the dataset to join with. Note that the dataset to join with must be in the same project.\n* Column containing the join key in the joined dataset.\n* Columns from the joined dataset that should be copied to the local dataset, for the matched row.\n* Simplification options\n* Maximum Damerau-Levenstein distance between the simplified strings so that they are considered a match.\n\n# Output\nThe processor outputs selected columns from the joined dataset. For each row of the current dataset, the columns will contain the data from the matching row in the joined dataset.\n\nIf no row matched in the joined dataset, the output columns will be left empty.\n\n");
        }

        @Override
        public ProcessorDesc describe(String language) {
            ParamDesc columnsParam = new ParamDesc("copyColumns", "columns").withDefaultValue(new ArrayList());
            ProcessorDesc pd = new ProcessorDesc(this.getName(), this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.FUZZY.DESCRIPTION", 2.actionVerb("Fuzzy join") + " with other dataset (memory-based)"), false).withMNEColParam("leftCol", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.LEFT_COL", "Join column (here)")).withMNESParam("rightInput", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.RIGHT_INPUT", "Dataset to join with")).withMNESParam("rightCol", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.RIGHT_COL", "Join column (in other dataset)")).withParam(columnsParam).withParam(ParamDesc.string("copyPrefix", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.COPY_PREFIX", "Prefix column names"), this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.COPY_PREFIX.TOOLTIP", "Prefix all copied column names with this"), "join_").withMandatory(false)).withBoolDefaultTrue("fuzzy", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.FUZZY_JOIN", "Fuzzy join"), "").withBoundedMandInt("maxLevenshtein", this.translate(language, "SHAKER.PROCESSOR.MemoryEquiJoiner.DESCRIPTION.MAX_LEVENSHTEIN", "Maximal Damerau-Levenstein distance"), 0, 10).deprecate().withReplacementDocLink("other_recipes/fuzzy-join").withReplacementName("Fuzzy join recipe");
            TextSimplifier.withParams(language, pd, false);
            return pd;
        }

        @Override
        public MemoryEquiJoiner build(Parameter param) throws Exception {
            return new MemoryEquiJoiner(param);
        }
    };
    /** Step parameters; assigned once by the constructor and never replaced. */
    private final Parameter param;
    /** Fuzzy lookup engine handed over by the loader; null when the loader was built with fuzzy disabled. */
    private FuzzySearchEngine fuzzySearchEngine;
    /** Number of rows whose match was found through the fuzzy engine rather than by exact key. */
    private int countFuzzyRetrieval = 0;
    /** Join key of the other dataset mapped to the values of its copied columns, in copyColumns order. */
    Map<String, String[]> hashJoin;
    /** Column of the processed dataset that holds the join key; resolved in init(). */
    Column leftDesc;
    /** Output columns receiving the copied values, index-aligned with the arrays in hashJoin. */
    List<Column> copyTargets = new ArrayList<Column>();
    /** Shared logger; final so it cannot be swapped at runtime. */
    private static final DKULogger logger = DKULogger.getLogger("dku.shaker.join");

    /**
     * Creates a joiner bound to the given step parameters. Nothing is loaded or resolved here.
     *
     * @param param step parameters; stored by reference
     * @throws Exception declared by the signature; the visible body does not throw
     */
    public MemoryEquiJoiner(Parameter param) throws Exception {
        this.param = param;
    }

    /**
     * Creates a new loader for the dataset to join with.
     *
     * @return a fresh {@link RightDataLoader} tied to this joiner's parameters
     */
    @Override
    public RightDataLoader buildLoader() {
        return new RightDataLoader();
    }

    /**
     * Adopts the lookup structures held by the given loader: its key-to-values map and its
     * fuzzy search engine (which is null when the loader was built with fuzzy disabled).
     *
     * @param rightDataLoader loader whose map and engine are shared by reference, not copied
     */
    @Override
    public void setAdditionalInputs(RightDataLoader rightDataLoader) {
        this.hashJoin = rightDataLoader.localHashJoin;
        this.fuzzySearchEngine = rightDataLoader.fuzzySearchEngine;
    }

    /**
     * Resolves the columns this step works on: the join-key column of the processed dataset and
     * one output column per entry of {@code copyColumns}. Each output column is named with
     * {@code copyPrefix} prepended, or with the bare source name when the prefix is null.
     * Resolved output columns are appended to {@code copyTargets}.
     */
    public void init() {
        this.leftDesc = this.getColumnFactory().column(this.param.leftCol, Processor.ProcessorRole.INPUT_COLUMN);
        for (String sourceName : this.param.copyColumns) {
            final String targetName;
            if (this.param.copyPrefix == null) {
                targetName = sourceName;
            } else {
                targetName = this.param.copyPrefix + sourceName;
            }
            final Column target = this.getColumnFactory().column(targetName, Processor.ProcessorRole.OUTPUT_COLUMN);
            this.copyTargets.add(target);
        }
    }

    /**
     * Joins a single row: looks its key up in the in-memory map and, on a hit, writes the
     * matched values into the output columns.
     *
     * <p>Rows with a null key are left untouched. When the exact lookup misses and fuzzy mode is
     * on, the fuzzy engine is asked for a close key and that key is looked up instead; each
     * successful fuzzy hit increments the fuzzy counter. Rows without any match are left as is.
     *
     * @param row the row to enrich in place
     * @throws Exception declared by the signature; propagated from the calls made here, if any
     */
    public void processRow(Row row) throws Exception {
        final String leftKey = row.get(this.leftDesc);
        if (leftKey == null) {
            return;
        }

        String[] matched = this.hashJoin.get(leftKey);
        if (matched == null && this.param.fuzzy) {
            final String nearestKey = this.fuzzySearchEngine.search(leftKey);
            if (nearestKey != null) {
                matched = this.hashJoin.get(nearestKey);
                if (matched != null) {
                    ++this.countFuzzyRetrieval;
                }
            }
        }
        if (matched == null) {
            return;
        }

        // copyTargets and the stored value arrays share the copyColumns ordering.
        int slot = 0;
        for (Column target : this.copyTargets) {
            row.put(target, matched[slot]);
            ++slot;
        }
    }

    /**
     * Logs how many matches were obtained through the fuzzy engine. The line is logged
     * regardless of whether fuzzy mode is enabled.
     */
    public void postProcess() {
        logger.info((Object)("Number of matches retrieved fuzzily : " + this.countFuzzyRetrieval));
    }

    /**
     * Declares the dataset this step reads besides its main input.
     *
     * @return a one-element list: the dataset to join with, requiring every copied column
     *         followed by the join-key column, under the "scriptDeps" label
     * @throws IllegalArgumentException if the dataset name is blank or the join column of the
     *         other dataset is null or empty
     */
    @Override
    public List<DatasetDependency> listDependencies() {
        Preconditions.checkArgument(StringUtils.isNotBlank(this.param.rightInput), "Dataset to join with is not specified");
        final boolean hasRightColumn = this.param.rightCol != null && !this.param.rightCol.isEmpty();
        Preconditions.checkArgument(hasRightColumn, "Columns to use in join are not specified");

        // Copied columns first, join key last.
        final ArrayList<String> neededColumns = new ArrayList<String>(this.param.copyColumns);
        neededColumns.add(this.param.rightCol);

        final DatasetDependency dependency = new DatasetDependency(this.param.rightInput, neededColumns, "scriptDeps");
        return Lists.newArrayList(dependency);
    }

    /**
     * Parameters of the in-memory join step. Extends the text-simplification parameters, so a
     * single object carries both the join settings and the simplification settings.
     */
    public static class Parameter
    extends TextSimplifier.Parameter
    implements StepParams {
        private static final long serialVersionUID = -1L;
        // Name of the join-key column in the dataset being processed.
        public String leftCol;
        // Name of the dataset to join with.
        public String rightInput;
        // Name of the join-key column in the dataset to join with.
        public String rightCol;
        // Columns of the other dataset whose values are copied onto matched rows.
        public List<String> copyColumns = new ArrayList<String>();
        // Prefix prepended to each copied column name; a null prefix leaves the names unchanged.
        public String copyPrefix = "";
        // When true, a fuzzy engine is built and consulted for keys that have no exact match.
        public boolean fuzzy = false;
        // Distance setting: reported by selfReport and used to choose the fuzzy engine implementation.
        public int maxLevenshtein = 1;
        // When true, the raw Levenshtein engine is used whatever the value of maxLevenshtein.
        public boolean forceRawLevenshteinEngine;

        // No validation is performed here; missing values are only rejected in listDependencies().
        @Override
        public void validate() throws IllegalArgumentException {
        }
    }

    public class RightDataLoader
    implements SingleCopyAdditionalInputsLoader {
        final Map<String, String[]> localHashJoin = new HashMap<String, String[]>();
        final FuzzySearchEngine fuzzySearchEngine;

        public RightDataLoader() {
            logger.info((Object)"Creating data loader for memory-join");
            if (MemoryEquiJoiner.this.param.fuzzy) {
                logger.infoV("Fuzzy join (distance=%d), creating fuzzy search engine", new Object[]{MemoryEquiJoiner.this.param.maxLevenshtein});
                this.fuzzySearchEngine = MemoryEquiJoiner.this.param.forceRawLevenshteinEngine ? new RawLevenshteinSearchEngine(MemoryEquiJoiner.this.param) : (MemoryEquiJoiner.this.param.maxLevenshtein == 0 ? new BasicSimplifySearchEngine(MemoryEquiJoiner.this.param) : (MemoryEquiJoiner.this.param.maxLevenshtein <= 2 ? new AutomatonSearchEngine(MemoryEquiJoiner.this.param) : new BlockSearchEngine(MemoryEquiJoiner.this.param)));
            } else {
                this.fuzzySearchEngine = null;
            }
        }

        @Override
        public Callable<Void> loadAdditionalInputs(AdditionalInputAccessor srunner) throws Exception {
            AdditionalInputAccessor.AdditionalInput ai = srunner.getAdditionalInput(MemoryEquiJoiner.this.param.rightInput);
            final RowInputStream rightIS = ai.getInput();
            ColumnFactory rightCF = ai.getColumnFactory();
            final Column rightDesc = rightCF.column(MemoryEquiJoiner.this.param.rightCol, Processor.ProcessorRole.INPUT_COLUMN);
            final ArrayList<Column> rightCopyDesc = new ArrayList<Column>();
            for (String copyColumn : MemoryEquiJoiner.this.param.copyColumns) {
                rightCopyDesc.add(rightCF.column(copyColumn, Processor.ProcessorRole.INPUT_COLUMN));
            }
            return new Callable<Void>(){

                @Override
                public Void call() throws Exception {
                    Row row;
                    int rowIdx = 0;
                    long fetchedBytes = 0L;
                    int maxRows = DKUApp.getParams().getIntParam("dku.prepare.processors.join.maxRowsInJoinedDataset", Integer.valueOf(100000));
                    long maxBytes = DKUApp.getParams().getLongParam("dku.prepare.processors.join.maxSizeFromJoinedDatasetMB", 100L) * 1000000L;
                    logger.infoV("Starting to load join data maxRows=%d maxBytes=%d", new Object[]{maxRows, maxBytes});
                    while ((row = rightIS.next()) != null) {
                        RightData rd = new RightData(row, rightCopyDesc, rightDesc);
                        if (MemoryEquiJoiner.this.param.fuzzy) {
                            if (row instanceof StreamRow) {
                                fetchedBytes += ((StreamRow)row).getApproximateMemoryUsage();
                            }
                        } else {
                            fetchedBytes += rd.getEstimatedMemoryUsage();
                        }
                        if (rowIdx % 1000 == 0) {
                            Runtime runtime = Runtime.getRuntime();
                            double p = (double)runtime.totalMemory() / (double)runtime.maxMemory() * 100.0;
                            logger.infoV("Loaded rows=%d bytes=%d mem_usage=%.1f%%", new Object[]{rowIdx, fetchedBytes, p});
                        }
                        if (rowIdx > maxRows || fetchedBytes > maxBytes) {
                            throw new Exception(String.format("Joined dataset is too large, aborting. Please use a %s recipe instead (rows:%s bytes:%s)", MemoryEquiJoiner.this.param.fuzzy ? "Fuzzy-join" : "Join", rowIdx, fetchedBytes));
                        }
                        if (rd.joinValue != null) {
                            if (MemoryEquiJoiner.this.param.fuzzy) {
                                RightDataLoader.this.fuzzySearchEngine.populate(rd.joinValue);
                            }
                            RightDataLoader.this.localHashJoin.put(rd.joinValue, rd.data);
                        }
                        ++rowIdx;
                    }
                    if (MemoryEquiJoiner.this.param.fuzzy) {
                        logger.info((Object)"Load done, finalizing Fuzzy search engine");
                        RightDataLoader.this.fuzzySearchEngine.endPopulating();
                    }
                    logger.info((Object)("Load done, join size " + RightDataLoader.this.localHashJoin.size()));
                    return null;
                }
            };
        }
    }

    /**
     * Values extracted from one row of the dataset to join with: its join key and the values of
     * the columns to copy.
     */
    static class RightData {
        /** Value of the join column for this row; may be null. */
        String joinValue;
        /** Values of the copied columns, in the order of the column list given to the constructor. */
        String[] data;

        /**
         * Reads the join key and the copied values out of a row.
         *
         * @param r row to read from
         * @param copyColumns columns whose values are extracted, in order
         * @param joinColumn column holding the join key
         */
        public RightData(Row r, List<Column> copyColumns, Column joinColumn) {
            this.joinValue = r.get(joinColumn);
            this.data = new String[copyColumns.size()];
            int slot = 0;
            for (Column column : copyColumns) {
                this.data[slot] = r.get(column);
                ++slot;
            }
        }

        /**
         * Estimates the size of this entry: two bytes per character of every non-null string,
         * plus a fixed 32 bytes.
         *
         * @return the estimate in bytes
         */
        public long getEstimatedMemoryUsage() {
            long total = 32L;
            if (this.joinValue != null) {
                total += 2L * (long)this.joinValue.length();
            }
            if (this.data == null) {
                return total;
            }
            for (String value : this.data) {
                if (value != null) {
                    total += 2L * (long)value.length();
                }
            }
            return total;
        }
    }
}

