/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.facet;

import com.dataiku.dip.datalayer.Column;
import com.dataiku.dip.datalayer.memimpl.MemRow;
import com.dataiku.dip.datalayer.memimpl.MemTable;
import com.dataiku.dip.shaker.facet.CountMap;
import com.dataiku.dip.shaker.facet.FacetUtils;
import com.dataiku.dip.shaker.facet.TextFacet;
import com.dataiku.dip.shaker.processors.expr.TextSimplifier;
import com.dataiku.dip.shaker.processors.expr.TokenizedText;
import com.dataiku.dip.shaker.processors.expr.WordType;
import com.dataiku.dip.utils.DKUMathsUtils;
import com.dataiku.dip.utils.DKUtils;
import com.dataiku.dss.shadelib.org.tartarus.snowball.SnowballStemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;

public class TextFacetBuilder {
    private TextSimplifier.Parameter simplificationParameters;
    private Set<String> stopWords;
    private SnowballStemmer stemmer;

    public TextFacetBuilder(TextSimplifier.Parameter simplificationParameters) throws Exception {
        this.simplificationParameters = simplificationParameters;
        if (simplificationParameters.clearStopWords) {
            String content = DKUtils.getResourceFileContentUTF8((String)("com/dataiku/dip/shaker/processors/expr/stopwords_" + simplificationParameters.language + ".txt"));
            String[] splitContent = content.split("\n");
            this.stopWords = new HashSet<String>(splitContent.length);
            for (String s : splitContent) {
                this.stopWords.add(s.toLowerCase());
            }
        }
        if (simplificationParameters.stem) {
            Class<?> stemClass = Class.forName("com.dataiku.dss.shadelib.org.tartarus.snowball.ext." + StringUtils.capitalize((String)simplificationParameters.getLanguage()) + "Stemmer");
            this.stemmer = (SnowballStemmer)stemClass.newInstance();
        }
    }

    /*
     * WARNING - void declaration
     */
    public TextFacet build(MemTable mt, Column col, int maxResults) {
        TextFacet ret = new TextFacet();
        CountMap<String> counts = new CountMap<String>();
        for (MemRow memRow : mt.rows) {
            if (memRow.isDeleted()) continue;
            String string = memRow.get(col);
            if (StringUtils.isBlank((String)string)) {
                ret.missing += 1.0;
                continue;
            }
            TokenizedText tt = new TokenizedText(string);
            if (this.simplificationParameters.normalize) {
                tt.normalize();
            }
            if (this.simplificationParameters.clearStopWords && this.stopWords != null) {
                tt.clearStopWords(this.stopWords);
            }
            if (this.simplificationParameters.stem && this.stemmer != null) {
                tt.stem(this.stemmer);
            }
            for (WordType token : tt) {
                counts.inc(token.toString());
            }
        }
        ArrayList<FacetUtils.FacetValue> values = new ArrayList<FacetUtils.FacetValue>();
        for (Map.Entry entry : counts) {
            FacetUtils.FacetValue fv = new FacetUtils.FacetValue();
            fv.key = (String)entry.getKey();
            fv.value = entry.getValue().intValue();
            values.add(fv);
        }
        Collections.sort(values, FacetUtils.Sort.COUNT);
        ret.totalNbValues = counts.size();
        ret.counts = new int[Math.min(counts.size(), maxResults)];
        ret.values = new String[Math.min(counts.size(), maxResults)];
        ret.percentages = new double[Math.min(counts.size(), maxResults)];
        ret.cumPercentages = new double[Math.min(counts.size(), maxResults)];
        ret.missing = mt.nrows() == 0 ? 0.0 : ret.missing / (double)mt.nrows();
        ret.totalRows = mt.nrows();
        boolean bl = false;
        for (FacetUtils.FacetValue fv : values) {
            var7_10 += fv.value;
        }
        boolean bl2 = false;
        int cur = 0;
        for (FacetUtils.FacetValue fv : values) {
            void var8_16;
            void var7_10;
            ret.values[var8_16] = fv.key;
            ret.counts[var8_16] = fv.value;
            ret.percentages[var8_16] = DKUMathsUtils.safeDivide((int)fv.value, (int)var7_10);
            ret.cumPercentages[var8_16] = DKUMathsUtils.safeDivide((int)(cur += fv.value), (int)var7_10);
            if (++var8_16 < maxResults) continue;
            break;
        }
        return ret;
    }
}

