/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.eda.worksheets.suggester;

import com.dataiku.dip.eda.compute.computations.Computation;
import com.dataiku.dip.eda.compute.computations.ComputationResult;
import com.dataiku.dip.eda.compute.computations.bivariate.Chi2IndTest;
import com.dataiku.dip.eda.compute.computations.bivariate.Spearman;
import com.dataiku.dip.eda.compute.computations.common.Count;
import com.dataiku.dip.eda.compute.computations.common.GroupedComputation;
import com.dataiku.dip.eda.compute.computations.common.MultiComputation;
import com.dataiku.dip.eda.compute.computations.univariate.CountDistinct;
import com.dataiku.dip.eda.compute.computations.univariate.Entropy;
import com.dataiku.dip.eda.compute.computations.univariate.FitDistribution;
import com.dataiku.dip.eda.compute.computations.univariate.GuessTimeStep;
import com.dataiku.dip.eda.compute.computations.univariate.MatchTimeStep;
import com.dataiku.dip.eda.compute.computations.univariate.Max;
import com.dataiku.dip.eda.compute.computations.univariate.Mean;
import com.dataiku.dip.eda.compute.computations.univariate.Min;
import com.dataiku.dip.eda.compute.computations.univariate.MinTime;
import com.dataiku.dip.eda.compute.computations.univariate.OneWayAnova;
import com.dataiku.dip.eda.compute.computations.univariate.Shapiro;
import com.dataiku.dip.eda.compute.computations.univariate.StdDev;
import com.dataiku.dip.eda.compute.distributions.Beta;
import com.dataiku.dip.eda.compute.distributions.Distribution;
import com.dataiku.dip.eda.compute.distributions.Exponential;
import com.dataiku.dip.eda.compute.distributions.Laplace;
import com.dataiku.dip.eda.compute.distributions.Normal;
import com.dataiku.dip.eda.compute.distributions.NormalMixture;
import com.dataiku.dip.eda.compute.distributions.Pareto;
import com.dataiku.dip.eda.compute.distributions.Triangular;
import com.dataiku.dip.eda.compute.distributions.Weibull;
import com.dataiku.dip.eda.compute.engine.ComputationRunner;
import com.dataiku.dip.eda.compute.filtering.AllFilter;
import com.dataiku.dip.eda.compute.filtering.AnumFilter;
import com.dataiku.dip.eda.compute.filtering.Filter;
import com.dataiku.dip.eda.compute.filtering.MissingValueFilter;
import com.dataiku.dip.eda.compute.grouping.AnumGrouping;
import com.dataiku.dip.eda.compute.grouping.SubsetGrouping;
import com.dataiku.dip.eda.worksheets.suggester.ComputationSampler;
import com.dataiku.dip.eda.worksheets.suggester.SniffedVariable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.log4j.Logger;

/**
 * Fills {@link SniffedVariable} instances with statistics obtained by running batches of EDA
 * computations through a {@link ComputationRunner}.
 *
 * <p>Every private {@code sniffXxx} step follows the same pattern: it schedules one computation
 * per variable (or per pair of variables) on a fresh {@link ComputationSampler}, runs the whole
 * batch once, then reads the results back by position. The scheduling loop and the read-back
 * loop of a step must therefore visit their candidates in exactly the same order.
 */
public class VariableSniffer {
    private static final Logger logger = Logger.getLogger("dku.eda.worksheets.suggester");

    /** Second argument of the {@link AnumGrouping} whose groups feed {@code top5values}. */
    private static final int TOP_VALUES_LIMIT = 5;
    /** Second argument of the {@link AnumGrouping}s given to {@link Chi2IndTest}. */
    private static final int CHI2_GROUPING_LIMIT = 30;
    /** Second argument of the {@link AnumGrouping} given to {@link OneWayAnova}. */
    private static final int ANOVA_GROUPING_LIMIT = 10;
    /** Third argument of {@link OneWayAnova} (presumably a confidence level - TODO confirm). */
    private static final double ANOVA_CONFIDENCE = 0.95;
    /** Second argument of the {@link AnumGrouping} used when evaluating series identifiers. */
    private static final int SERIES_ID_GROUPING_LIMIT = 5;
    /** Minimum mean (distinct timestamps / rows) per group for a series identifier to be kept. */
    private static final double MIN_SERIES_DISTINCT_RATIO = 0.95;
    /** Minimum mean number of distinct timestamps per group for a series identifier to be kept. */
    private static final double MIN_MEAN_SERIES_LENGTH = 30.0;

    private final ComputationRunner fullSampleRunner;

    /**
     * @param fullSampleRunner runner handed to every {@link ComputationSampler#compute} call
     */
    public VariableSniffer(ComputationRunner fullSampleRunner) {
        this.fullSampleRunner = fullSampleRunner;
    }

    /**
     * Builds one {@link SniffedVariable} per name and runs every sniffing step on them.
     *
     * @param variableNames names of all the variables to sniff
     * @param selectedVariableNames names to flag as selected; the correlation and time series
     *     steps only start from selected variables
     * @return the sniffed variables, in the order of {@code variableNames}, or an empty list
     *     when the total row count is 0
     * @throws Exception propagated from the underlying computations
     */
    public List<SniffedVariable> sniff(List<String> variableNames, List<String> selectedVariableNames) throws Exception {
        long totalCount = this.sniffTotalCount();
        if (totalCount == 0L) {
            return Collections.emptyList();
        }
        List<SniffedVariable> variables = this.create(variableNames, selectedVariableNames, totalCount);
        // Order matters: later steps filter on flags and fields set by earlier ones
        // (e.g. cardinality is read by the time series step).
        this.sniffCardinality(variables);
        this.sniffEntropy(variables);
        this.sniffNumericalStats(variables);
        this.sniffFitDistributions(variables);
        this.sniffShapiro(variables);
        this.sniffCorrelation(variables);
        this.sniffTimeSeriesStats(variables);
        return variables;
    }

    /** Instantiates one {@link SniffedVariable} per name, flagged as selected when listed in {@code selectedVariableNames}. */
    private List<SniffedVariable> create(List<String> variableNames, List<String> selectedVariableNames, long totalCount) {
        List<SniffedVariable> sniffedVariables = new ArrayList<SniffedVariable>(variableNames.size());
        for (String name : variableNames) {
            boolean isSelected = selectedVariableNames.contains(name);
            sniffedVariables.add(new SniffedVariable(name, isSelected, totalCount));
        }
        return sniffedVariables;
    }

    /** Runs a single {@link Count} and returns its value. */
    private long sniffTotalCount() throws Exception {
        ComputationSampler sampler = new ComputationSampler();
        sampler.withRandomSampling(new Count());
        MultiComputation.MultiComputationResult results = sampler.compute(this.fullSampleRunner);
        return results.get(0).as(Count.CountResult.class).count;
    }

    /**
     * Sets {@code nbEmpty}, {@code cardinality} and {@code top5values} on every variable, and
     * flags a variable as numerical (also setting {@code nbMissing}) and/or temporal when the
     * corresponding computation is fully available.
     */
    private void sniffCardinality(List<SniffedVariable> sniffedVariables) throws Exception {
        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable sv : sniffedVariables) {
            // The positions below are the indexes used when reading the results back.
            MultiComputation computation = new MultiComputation(
                    // 0: count of rows matching MissingValueFilter
                    new GroupedComputation(new Count(), new SubsetGrouping(new MissingValueFilter(sv.name))),
                    // 1: count of rows whose value is the empty string
                    new GroupedComputation(new Count(), new SubsetGrouping(new AnumFilter(sv.name, Collections.singleton("")))),
                    // 2: number of distinct values
                    new CountDistinct(sv.name),
                    // 3: per-value counts feeding top5values
                    new GroupedComputation(new Count(), new AnumGrouping(sv.name, TOP_VALUES_LIMIT, false)),
                    // 4: only used as a probe: its availability marks the variable as temporal
                    new MinTime(sv.name));
            sampler.withRandomSampling(computation);
        }
        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        for (int i = 0; i < sniffedVariables.size(); ++i) {
            MultiComputation.MultiComputationResult variableResult = computationResults.get(i).asMulti();
            SniffedVariable sniffedVariable = sniffedVariables.get(i);

            sniffedVariable.nbEmpty = variableResult.get(1)
                    .as(GroupedComputation.GroupedComputationResult.class).results.get(0)
                    .as(Count.CountResult.class).count;
            sniffedVariable.cardinality = variableResult.get(2).as(CountDistinct.CountDistinctResult.class).count;

            GroupedComputation.GroupedComputationResult topValuesResult =
                    variableResult.get(3).as(GroupedComputation.GroupedComputationResult.class);
            for (GroupedComputation.GroupResult topValueResult : topValuesResult.getGroupedResults()) {
                // NOTE(review): assumes every group filter is an AnumFilter holding at least one
                // value; anything else fails here with a ClassCastException/NoSuchElementException.
                AnumFilter valueFilter = (AnumFilter) topValueResult.filter;
                String topValue = (String) valueFilter.values.stream().findFirst().get();
                long topValueCount = topValueResult.result.as(Count.CountResult.class).count;
                sniffedVariable.top5values.put(topValue, topValueCount);
            }

            if (variableResult.get(0).isFullyAvailable()) {
                sniffedVariable.markAsNumerical();
                sniffedVariable.nbMissing = variableResult.get(0)
                        .as(GroupedComputation.GroupedComputationResult.class).results.get(0)
                        .as(Count.CountResult.class).count;
            }
            if (variableResult.get(4).isFullyAvailable()) {
                sniffedVariable.markAsTemporal();
            }
        }
    }

    /** Sets {@code entropy} on every likely categorical variable; failures are logged and skipped. */
    private void sniffEntropy(List<SniffedVariable> sniffedVariables) throws Exception {
        List<SniffedVariable> likelyCategoricalVariables = sniffedVariables.stream()
                .filter(SniffedVariable::isLikelyCategorical)
                .collect(Collectors.toList());
        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable sv : likelyCategoricalVariables) {
            sampler.withRandomSampling(new Entropy(sv.name));
        }
        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        for (int i = 0; i < likelyCategoricalVariables.size(); ++i) {
            SniffedVariable variable = likelyCategoricalVariables.get(i);
            ComputationResult result = computationResults.get(i);
            if (result.isFullyAvailable()) {
                variable.entropy = result.as(Entropy.EntropyResult.class).value;
            } else {
                logger.error("Could not compute the entropy for column " + variable.name);
            }
        }
    }

    /**
     * Sets {@code mean}, {@code stdDev}, {@code min} and {@code max} on every likely numerical
     * variable. Each statistic is assigned independently, only when its result is fully available.
     */
    private void sniffNumericalStats(List<SniffedVariable> sniffedVariables) throws Exception {
        List<SniffedVariable> likelyNumericalVariables = selectLikelyNumerical(sniffedVariables);
        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable sv : likelyNumericalVariables) {
            sampler.withRandomSampling(new MultiComputation(
                    new Mean(sv.name, null), new StdDev(sv.name), new Min(sv.name), new Max(sv.name)));
        }
        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        for (int i = 0; i < likelyNumericalVariables.size(); ++i) {
            SniffedVariable variable = likelyNumericalVariables.get(i);
            MultiComputation.MultiComputationResult statsResults = computationResults.get(i).asMulti();

            ComputationResult meanResult = statsResults.get(0);
            if (meanResult.isFullyAvailable()) {
                variable.mean = meanResult.as(Mean.MeanResult.class).value;
            }
            ComputationResult stdDevResult = statsResults.get(1);
            if (stdDevResult.isFullyAvailable()) {
                variable.stdDev = stdDevResult.as(StdDev.StdDevResult.class).value;
            }
            ComputationResult minResult = statsResults.get(2);
            if (minResult.isFullyAvailable()) {
                variable.min = minResult.as(Min.MinResult.class).value;
            }
            ComputationResult maxResult = statsResults.get(3);
            if (maxResult.isFullyAvailable()) {
                variable.max = maxResult.as(Max.MaxResult.class).value;
            }
        }
    }

    /**
     * Fits a list of candidate distributions on every likely numerical variable and stores each
     * fully available fit in {@code fitDistributionResults}, keyed by the candidate distribution.
     */
    private void sniffFitDistributions(List<SniffedVariable> sniffedVariables) throws Exception {
        List<SniffedVariable> likelyNumericalVariables = selectLikelyNumerical(sniffedVariables);

        List<Distribution> candidateDistributions = new ArrayList<Distribution>();
        candidateDistributions.add(new Normal());
        candidateDistributions.add(new Exponential());
        candidateDistributions.add(new Beta());
        candidateDistributions.add(new Laplace());
        candidateDistributions.add(new Pareto());
        // Three more candidates are tried only when there is a single likely numerical
        // variable and that variable is selected.
        if (likelyNumericalVariables.size() == 1 && likelyNumericalVariables.get(0).isSelected()) {
            candidateDistributions.add(new NormalMixture(2));
            candidateDistributions.add(new Triangular());
            candidateDistributions.add(new Weibull());
        }

        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable sv : likelyNumericalVariables) {
            List<Computation> singleFitComputations = new ArrayList<Computation>(candidateDistributions.size());
            for (Distribution distribution : candidateDistributions) {
                singleFitComputations.add(new FitDistribution(sv.name, distribution, true));
            }
            sampler.withRandomSampling(new MultiComputation(singleFitComputations));
        }
        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        for (int i = 0; i < likelyNumericalVariables.size(); ++i) {
            SniffedVariable variable = likelyNumericalVariables.get(i);
            MultiComputation.MultiComputationResult fitResults = computationResults.get(i).asMulti();
            for (int j = 0; j < fitResults.size(); ++j) {
                ComputationResult result = fitResults.get(j);
                if (result.isFullyAvailable()) {
                    // Fit j was scheduled from candidate j.
                    variable.fitDistributionResults.put(
                            candidateDistributions.get(j),
                            result.as(FitDistribution.FitDistributionResult.class));
                }
            }
        }
    }

    /** Runs a {@link Shapiro} computation on every likely numerical variable; failures are logged and skipped. */
    private void sniffShapiro(List<SniffedVariable> sniffedVariables) throws Exception {
        List<SniffedVariable> likelyNumericalVariables = selectLikelyNumerical(sniffedVariables);
        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable sv : likelyNumericalVariables) {
            sampler.withRandomSampling(new Shapiro(sv.name));
        }
        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        for (int i = 0; i < likelyNumericalVariables.size(); ++i) {
            SniffedVariable variable = likelyNumericalVariables.get(i);
            ComputationResult result = computationResults.get(i);
            if (result.isFullyAvailable()) {
                variable.setShapiroResult(result.as(Shapiro.ShapiroResult.class));
            } else {
                logger.error("Could not perform a Shapiro-Wilk test for column " + variable.name);
            }
        }
    }

    /**
     * Runs pairwise tests between every selected variable and every other variable. Results are
     * stored on the <em>other</em> variable, keyed by the selected variable's name.
     *
     * <p>For a given pair, up to three computations are scheduled, in this order: Spearman,
     * Chi2, one-way ANOVA. The read-back loop consumes results with the same predicates in the
     * same order, which is what keeps the running result index aligned.
     */
    private void sniffCorrelation(List<SniffedVariable> sniffedVariables) throws Exception {
        List<SniffedVariable> selectedSniffedVariables = sniffedVariables.stream()
                .filter(SniffedVariable::isSelected)
                .collect(Collectors.toList());

        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable selected : selectedSniffedVariables) {
            for (SniffedVariable other : sniffedVariables) {
                if (Objects.equals(selected, other)) {
                    continue;
                }
                if (isSpearmanPair(selected, other)) {
                    sampler.withRandomSampling(new Spearman(selected.name, other.name));
                }
                if (isChi2Pair(selected, other)) {
                    sampler.withRandomSampling(new Chi2IndTest(
                            new AnumGrouping(selected.name, CHI2_GROUPING_LIMIT, false),
                            new AnumGrouping(other.name, CHI2_GROUPING_LIMIT, false)));
                }
                // At most one ANOVA per pair: when both orientations apply, the
                // "selected is categorical" one wins.
                if (selected.isLikelyCategorical() && other.isLikelyNumerical()) {
                    sampler.withRandomSampling(new OneWayAnova(
                            other.name, new AnumGrouping(selected.name, ANOVA_GROUPING_LIMIT, false), ANOVA_CONFIDENCE));
                } else if (selected.isLikelyNumerical() && other.isLikelyCategorical()) {
                    sampler.withRandomSampling(new OneWayAnova(
                            selected.name, new AnumGrouping(other.name, ANOVA_GROUPING_LIMIT, false), ANOVA_CONFIDENCE));
                }
            }
        }

        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        int resultIndex = 0;
        for (SniffedVariable selected : selectedSniffedVariables) {
            for (SniffedVariable other : sniffedVariables) {
                if (Objects.equals(selected, other)) {
                    continue;
                }
                if (isSpearmanPair(selected, other)) {
                    ComputationResult result = computationResults.get(resultIndex++);
                    if (result.isFullyAvailable()) {
                        other.spearmanTests.put(selected.name, result.as(Spearman.SpearmanResult.class));
                    } else {
                        logger.error("Could not perform a Spearman test between variables " + selected.name + " and " + other.name);
                    }
                }
                if (isChi2Pair(selected, other)) {
                    ComputationResult result = computationResults.get(resultIndex++);
                    if (result.isFullyAvailable()) {
                        other.chi2tests.put(selected.name, result.as(Chi2IndTest.Chi2IndTestResult.class));
                    } else {
                        logger.error("Could not perform a Chi2 test between variables " + selected.name + " and " + other.name);
                    }
                }
                if (isAnovaPair(selected, other)) {
                    ComputationResult result = computationResults.get(resultIndex++);
                    if (result.isFullyAvailable()) {
                        other.oneWayAnovaTests.put(selected.name, result.as(OneWayAnova.OneWayAnovaResult.class));
                    } else {
                        logger.error("Could not perform a one way ANOVA between variables " + selected.name + " and " + other.name);
                    }
                }
            }
        }
    }

    /**
     * For selected variables holding temporal values: picks a series identifier, then records
     * the guessed time step and the time step match. The three passes run in that order because
     * the last two read the series identifiers chosen by the first.
     */
    private void sniffTimeSeriesStats(List<SniffedVariable> sniffedVariables) throws Exception {
        List<SniffedVariable> temporalCandidates = sniffedVariables.stream()
                .filter(SniffedVariable::isSelected)
                .filter(SniffedVariable::holdsTemporalValues)
                .collect(Collectors.toList());
        List<SniffedVariable> seriesIdCandidates = sniffedVariables.stream()
                .filter(SniffedVariable::isSelected)
                .filter(SniffedVariable::isSeriesIdCandidate)
                .collect(Collectors.toList());

        this.sniffSeriesIdentifiers(temporalCandidates, seriesIdCandidates);
        this.sniffGuessedTimeSteps(temporalCandidates);
        this.sniffMatchedTimeSteps(temporalCandidates);
    }

    /**
     * Attaches to each temporal candidate the series identifier candidate (if any) whose groups
     * have the highest mean ratio of distinct timestamps to rows, provided that ratio is at
     * least the variable's own overall ratio and {@link #MIN_SERIES_DISTINCT_RATIO}, and the
     * mean number of distinct timestamps per group is at least {@link #MIN_MEAN_SERIES_LENGTH}.
     */
    private void sniffSeriesIdentifiers(List<SniffedVariable> temporalCandidates, List<SniffedVariable> seriesIdCandidates) throws Exception {
        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable temporal : temporalCandidates) {
            for (SniffedVariable seriesId : seriesIdCandidates) {
                if (Objects.equals(temporal, seriesId)) {
                    continue;
                }
                GroupedComputation perGroupCounts = new GroupedComputation(
                        new MultiComputation(new Count(), new CountDistinct(temporal.name)),
                        new AnumGrouping(seriesId.name, SERIES_ID_GROUPING_LIMIT, false));
                sampler.withTopNTimeSampling(perGroupCounts, temporal.name);
            }
        }

        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        int resultIndex = 0;
        for (SniffedVariable temporal : temporalCandidates) {
            SniffedVariable bestSeriesId = null;
            // Baseline to beat: distinct ratio of the time variable without any grouping.
            double bestDistinctRatio = (double) temporal.cardinality.longValue() / (double) temporal.totalCount;
            for (SniffedVariable seriesId : seriesIdCandidates) {
                if (Objects.equals(temporal, seriesId)) {
                    continue;
                }
                // The index advances even on failure so that later pairs stay aligned.
                ComputationResult sniffingResult = computationResults.get(resultIndex++);
                if (!sniffingResult.isFullyAvailable()) {
                    logger.error(String.format("Could not compute count for time column %s grouped by %s", temporal.name, seriesId.name));
                    continue;
                }
                GroupedComputation.GroupedComputationResult groupResult = sniffingResult.asGrouped();
                double meanDistinctRatio = 0.0;
                double meanSeriesLength = 0.0;
                for (int j = 0; j < groupResult.groups.size(); ++j) {
                    MultiComputation.MultiComputationResult groupStats = groupResult.results.get(j).asMulti();
                    long groupCount = groupStats.get(0).as(Count.CountResult.class).count;
                    long groupDistinctCount = groupStats.get(1).as(CountDistinct.CountDistinctResult.class).count;
                    meanSeriesLength += (double) groupDistinctCount;
                    meanDistinctRatio += (double) groupDistinctCount / (double) groupCount;
                }
                double nbGroups = (double) groupResult.groups.size();
                meanDistinctRatio /= nbGroups;
                meanSeriesLength /= nbGroups;
                // With no group (or an empty group) the means are NaN; every comparison below
                // is then false and the candidate is rejected.
                if (meanDistinctRatio >= bestDistinctRatio
                        && meanDistinctRatio >= MIN_SERIES_DISTINCT_RATIO
                        && meanSeriesLength >= MIN_MEAN_SERIES_LENGTH) {
                    bestDistinctRatio = meanDistinctRatio;
                    bestSeriesId = seriesId;
                }
            }
            if (bestSeriesId != null) {
                temporal.addSeriesIdentifier(bestSeriesId);
            }
        }
    }

    /**
     * Stores the {@link GuessTimeStep} result of each temporal candidate. A candidate whose
     * result is not fully available is logged and un-marked as temporal.
     */
    private void sniffGuessedTimeSteps(List<SniffedVariable> temporalCandidates) throws Exception {
        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable temporal : temporalCandidates) {
            sampler.withTopNTimeSampling(new GuessTimeStep(temporal.name, temporal.getSeriesIdentifierNames()), temporal.name);
        }
        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        for (int i = 0; i < temporalCandidates.size(); ++i) {
            SniffedVariable temporal = temporalCandidates.get(i);
            ComputationResult result = computationResults.get(i);
            if (result.isFullyAvailable()) {
                temporal.guessTimeStepResult = result.as(GuessTimeStep.GuessTimeStepResult.class);
            } else {
                logger.error(String.format("Error when guessing the time step for %s with identifier %s", temporal.name, String.join(",", temporal.getSeriesIdentifierNames())));
                temporal.unMarkAsTemporal();
            }
        }
    }

    /**
     * Stores the {@link MatchTimeStep} result of each temporal candidate, computed on the rows
     * where every series identifier equals the first key of its {@code top5values}. A candidate
     * whose result is not fully available is logged and un-marked as temporal.
     */
    private void sniffMatchedTimeSteps(List<SniffedVariable> temporalCandidates) throws Exception {
        ComputationSampler sampler = new ComputationSampler();
        for (SniffedVariable temporal : temporalCandidates) {
            // NOTE(review): iterator().next() throws NoSuchElementException if a series
            // identifier has no recorded top value - confirm this cannot happen.
            Filter longFormatFilter = temporal.getSeriesIdentifiers().stream()
                    .map(it -> AnumFilter.single(it.name, it.top5values.keySet().iterator().next()))
                    .reduce(new AllFilter(), Filter::and);
            sampler.withTopNTimeSampling(
                    new GroupedComputation(new MatchTimeStep(temporal.name), new SubsetGrouping(longFormatFilter)),
                    temporal.name);
        }
        MultiComputation.MultiComputationResult computationResults = sampler.compute(this.fullSampleRunner);
        // One computation was scheduled per candidate, so iterate over the candidates.
        for (int i = 0; i < temporalCandidates.size(); ++i) {
            SniffedVariable temporal = temporalCandidates.get(i);
            ComputationResult result = computationResults.get(i);
            if (result.isFullyAvailable()) {
                temporal.matchTimeStepResult = result.asGrouped().results.get(0).as(MatchTimeStep.MatchTimeStepResult.class);
            } else {
                logger.error(String.format("Error when matching the time step for %s with identifier %s", temporal.name, String.join(",", temporal.getSeriesIdentifierNames())));
                temporal.unMarkAsTemporal();
            }
        }
    }

    /** Returns the variables for which {@code isLikelyNumerical()} holds, in their original order. */
    private static List<SniffedVariable> selectLikelyNumerical(List<SniffedVariable> sniffedVariables) {
        return sniffedVariables.stream()
                .filter(SniffedVariable::isLikelyNumerical)
                .collect(Collectors.toList());
    }

    /** A Spearman test applies when the selected variable is numerical and the other is numerical or an indicator. */
    private static boolean isSpearmanPair(SniffedVariable selected, SniffedVariable other) {
        return selected.isLikelyNumerical() && (other.isLikelyNumerical() || other.isLikelyIndicator());
    }

    /** A Chi2 test applies when both variables are likely categorical. */
    private static boolean isChi2Pair(SniffedVariable selected, SniffedVariable other) {
        return selected.isLikelyCategorical() && other.isLikelyCategorical();
    }

    /** A one-way ANOVA applies when one variable is likely numerical and the other likely categorical. */
    private static boolean isAnovaPair(SniffedVariable selected, SniffedVariable other) {
        return (selected.isLikelyNumerical() && other.isLikelyCategorical())
                || (selected.isLikelyCategorical() && other.isLikelyNumerical());
    }
}

