/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.text;

import com.dataiku.dip.shaker.processors.expr.WordType;
import com.dataiku.dip.shaker.text.Clusterer;
import com.dataiku.dip.shaker.text.StringNormalizer;
import com.dataiku.dip.utils.DKULogger;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

/**
 * Clusters strings that are within a given Levenshtein distance of each other.
 *
 * <p>Strings are first grouped into "blocks" keyed by the tokens that
 * {@code WordType.blockTokenize} produces for their preprocessed form (see
 * {@link #populate(String)}). Pairwise distances are then only computed between
 * strings that share a block (see {@link #getClusters(float, int)}).
 *
 * <p>{@code populate} mutates an unsynchronized map and must not be called while
 * {@code getClusters} is running.
 */
public class NGramClusterer extends Clusterer {
    /** Total budget of cluster-map entries, shared evenly between the evaluators. */
    private static final int MAX_CLUSTER_MAP_SIZE = 1000;
    /** How long to wait for cancelled workers to stop before reading their partial results. */
    private static final int CANCEL_GRACE_SECONDS = 5;
    private static final DKULogger logger = DKULogger.getLogger("dip.shaker.analysis.clusterer");

    int blockSize;
    boolean multithreaded;
    /** Block token -> original strings whose preprocessed form produced that token. */
    Map<String, Set<String>> blocks = new HashMap<String, Set<String>>();

    /**
     * @param multithreaded when true, {@link #getClusters(float, int)} spreads the blocks over
     *                      one worker per available processor; otherwise a single worker is used
     * @param blockSize     size parameter forwarded to {@code WordType.blockTokenize}
     */
    public NGramClusterer(boolean multithreaded, int blockSize) {
        this.multithreaded = multithreaded;
        this.blockSize = blockSize;
    }

    /**
     * Registers {@code s} in every block its preprocessed form belongs to.
     *
     * <p>The string is normalized, case-folded and stripped of punctuation before being
     * tokenized; the original, unmodified string is what is stored in the blocks.
     *
     * @param s the string to index
     */
    @Override
    public void populate(String s) {
        // Punctuation removal is chained on the normalized value. Previously it was applied to
        // the raw input, which silently discarded the normalization computed just before.
        String preprocessed = StringNormalizer.normalize(s).toUpperCase().toLowerCase();
        preprocessed = StringNormalizer.removePunct(preprocessed);

        ArrayList<WordType> tokens = new ArrayList<WordType>();
        WordType.blockTokenize(preprocessed, tokens, this.blockSize);
        for (WordType token : tokens) {
            String key = token.toString();
            Set<String> members = this.blocks.get(key);
            if (members == null) {
                members = new HashSet<String>();
                this.blocks.put(key, members);
            }
            members.add(s);
        }
    }

    /**
     * Computes the clusters of populated strings whose pairwise Levenshtein distance is at most
     * {@code radius}.
     *
     * <p>If the computation does not finish within {@code timeOut} seconds, or an evaluator
     * fails, {@code timedOut} is set and the result only contains what was found so far.
     *
     * @param radius  maximum edit distance between two strings of the same cluster
     * @param timeOut maximum computation time, in seconds
     * @return the distinct clusters of at least two strings, ordered by
     *         {@code Clusterer.SizeComparator}, or {@code null} if {@code radius} is negative
     */
    @Override
    public List<Set<String>> getClusters(float radius, int timeOut) {
        if (radius < 0.0f) {
            logger.error((Object)"Error: radius < 0");
            return null;
        }

        int workers = this.multithreaded ? Runtime.getRuntime().availableProcessors() : 1;
        List<BlockEvaluator> evaluators = this.buildEvaluators(radius, workers);
        List<Map<String, Set<String>>> clusterMaps = new ArrayList<Map<String, Set<String>>>(evaluators.size());

        ExecutorService executor = this.multithreaded
                ? Executors.newFixedThreadPool(workers, new ThreadFactoryBuilder().setNameFormat("NGramCluster-%d").build())
                : Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setNameFormat("NGramCluster-%d").build());
        try {
            // invokeAll cancels every task that has not completed when the timeout elapses.
            List<Future<Map<String, Set<String>>>> futures = executor.invokeAll(evaluators, timeOut, TimeUnit.SECONDS);
            for (int i = 0; i < futures.size(); ++i) {
                Future<Map<String, Set<String>>> future = futures.get(i);
                if (future.isCancelled()) {
                    // Keep whatever the evaluator had found before being cancelled.
                    clusterMaps.add(evaluators.get(i).getResults());
                    this.timedOut = true;
                    continue;
                }
                try {
                    clusterMaps.add(future.get());
                }
                catch (Exception e) {
                    // The evaluator failed: report it and flag the result as incomplete.
                    logger.error((Object)("clusterer evaluation failed: " + e));
                    this.timedOut = true;
                }
            }
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.error((Object)"clusterer has been interrupted");
        }
        finally {
            // Must run before the partial maps collected above are read: a cancelled worker may
            // still be writing to its map until it notices the cancellation.
            this.stopExecutor(executor);
        }

        // Several keys may map to equal sets; the HashSet removes those duplicates.
        Set<Set<String>> clusters = new HashSet<Set<String>>();
        for (Map<String, Set<String>> clusterMap : clusterMaps) {
            for (Set<String> members : clusterMap.values()) {
                if (members.size() > 1) {
                    clusters.add(members);
                }
            }
        }
        List<Set<String>> sortedClusters = new ArrayList<Set<String>>(clusters);
        Collections.sort(sortedClusters, new Clusterer.SizeComparator());
        return sortedClusters;
    }

    /** Splits the current blocks into {@code workers} contiguous index ranges, one evaluator each. */
    private List<BlockEvaluator> buildEvaluators(float radius, int workers) {
        // One snapshot shared by all evaluators; they only read from it.
        List<Set<String>> blockList = new ArrayList<Set<String>>(this.blocks.values());
        int size = blockList.size();
        int range = size / workers + 1;
        List<BlockEvaluator> evaluators = new ArrayList<BlockEvaluator>(workers);
        for (int i = 0; i < workers; ++i) {
            int rangeStart = range * i;
            int rangeEnd = Math.min(range * (i + 1), size);
            // rangeStart may exceed rangeEnd for trailing workers; such an evaluator does nothing.
            evaluators.add(new BlockEvaluator(blockList, radius, rangeStart, rangeEnd, MAX_CLUSTER_MAP_SIZE / workers));
        }
        return evaluators;
    }

    /** Interrupts the workers and waits briefly for them to terminate. */
    private void stopExecutor(ExecutorService executor) {
        executor.shutdownNow();
        try {
            if (!executor.awaitTermination(CANCEL_GRACE_SECONDS, TimeUnit.SECONDS)) {
                logger.error((Object)"clusterer workers did not stop in time, partial results may be unstable");
            }
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Evaluates the blocks with index in {@code [start, stop)} and links every string to the
     * other strings of the same block that are within {@code radius} of it.
     *
     * <p>The evaluation stops early when the enclosing clusterer is flagged as timed out, when the
     * worker thread is interrupted, or when the cluster map holds more than {@code maxSize}
     * entries.
     */
    public class BlockEvaluator
    implements Callable<Map<String, Set<String>>> {
        int start;
        int stop;
        float radius;
        int maxSize;
        List<Set<String>> blocks;
        /** String -> that string plus every string found within {@code radius} of it. */
        Map<String, Set<String>> cluster_map = new HashMap<String, Set<String>>();

        /**
         * @param blocks  all blocks; only read, never modified
         * @param radius  maximum edit distance between linked strings
         * @param start   index of the first block to evaluate (inclusive)
         * @param stop    index after the last block to evaluate (exclusive)
         * @param maxSize number of cluster-map entries above which the evaluation stops
         */
        public BlockEvaluator(List<Set<String>> blocks, float radius, int start, int stop, int maxSize) {
            this.blocks = blocks;
            this.start = start;
            this.stop = stop;
            this.radius = radius;
            this.maxSize = maxSize;
        }

        /**
         * @return the live cluster map, possibly partial if the evaluation has not completed
         */
        public Map<String, Set<String>> getResults() {
            return this.cluster_map;
        }

        @Override
        public Map<String, Set<String>> call() {
            Thread.currentThread().setName("dss-clusterer");
            boolean stopped = false;
            for (int i = this.start; i < this.stop && !stopped; ++i) {
                Set<String> block = this.blocks.get(i);
                if (block.size() < 2) {
                    continue;
                }
                for (String a : block) {
                    // None of the stop conditions can become false again, so stop for good.
                    if (this.shouldStop()) {
                        stopped = true;
                        break;
                    }
                    this.linkNeighbours(a, block);
                }
            }
            if (NGramClusterer.this.timedOut || Thread.currentThread().isInterrupted()) {
                logger.debug((Object)("Ended and timed out : cluster size " + this.cluster_map.size()));
            } else {
                logger.debug((Object)("Ended naturally : cluster size " + this.cluster_map.size()));
            }
            return this.cluster_map;
        }

        /** Tells whether the evaluation was cancelled or has exhausted its size budget. */
        private boolean shouldStop() {
            return NGramClusterer.this.timedOut
                    || Thread.currentThread().isInterrupted()
                    || this.cluster_map.size() > this.maxSize;
        }

        /** Adds to the cluster of {@code a} every string of {@code block} within {@code radius}. */
        private void linkNeighbours(String a, Set<String> block) {
            for (String b : block) {
                // Checked per pair so that a cancelled worker stops writing promptly.
                if (Thread.currentThread().isInterrupted()) {
                    return;
                }
                if (a.equals(b) || this.isLinked(a, b) || this.isLinked(b, a)) {
                    continue;
                }
                // The edit distance is at least the length difference: skip the costly call.
                if ((float)Math.abs(a.length() - b.length()) > this.radius) {
                    continue;
                }
                if (!((float)WordType.levenshtein(a, b) <= this.radius)) {
                    continue;
                }
                Set<String> members = this.cluster_map.get(a);
                if (members == null) {
                    members = new HashSet<String>();
                    members.add(a);
                    this.cluster_map.put(a, members);
                }
                members.add(b);
            }
        }

        /** Tells whether {@code other} is already recorded in the cluster keyed by {@code key}. */
        private boolean isLinked(String key, String other) {
            Set<String> members = this.cluster_map.get(key);
            return members != null && members.contains(other);
        }
    }
}

