/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.scoring.pipelines;

import com.dataiku.scoring.pipelines.Processor;
import com.dataiku.scoring.pipelines.Tokenizer;
import com.dataiku.scoring.util.RawObservation;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Processor that derives per-token count features from text columns.
 *
 * <p>Each configured column is paired, by index, with one {@link Tokenizer} and one vocabulary.
 * When an observation is processed, the column's text is tokenized and every token that belongs
 * to that column's vocabulary is written back to the observation under the key
 * {@code countvec:<column>:<token>} together with the count reported by the tokenizer.
 *
 * <p>The configuration arrays are copied at construction time and the getters hand out copies,
 * so the configuration cannot be altered through array references held by callers.
 */
public class CountVectorizer
implements Processor {
    private static final long serialVersionUID = 0L;
    private final Tokenizer[] tokenizers;
    private final String[] columns;
    private final List<Set<String>> vocabularies;

    /**
     * Builds a vectorizer for the given columns.
     *
     * @param columns names of the observation fields to read text from
     * @param tokenizer tokenizers, where {@code tokenizer[i]} is applied to {@code columns[i]};
     *     must provide at least one entry per column
     * @param vocabularies accepted tokens, where {@code vocabularies[i]} belongs to
     *     {@code columns[i]}; must have the same length as {@code columns}
     * @throws IllegalArgumentException if {@code columns} and {@code vocabularies} differ in
     *     length, or if there are fewer tokenizers than columns
     */
    public CountVectorizer(String[] columns, Tokenizer[] tokenizer, String[][] vocabularies) {
        if (columns.length != vocabularies.length) {
            throw new IllegalArgumentException("Columns and vocabularies must have the same length");
        }
        // process() indexes tokenizers[i] for every column, so fail here rather than with an
        // ArrayIndexOutOfBoundsException / NullPointerException in the middle of scoring.
        int tokenizerCount = tokenizer == null ? 0 : tokenizer.length;
        if (tokenizerCount < columns.length) {
            throw new IllegalArgumentException("Expected one tokenizer per column but got "
                    + tokenizerCount + " tokenizers for " + columns.length + " columns");
        }
        // Defensive copies: later changes to the caller's arrays must not affect this processor.
        this.columns = Arrays.copyOf(columns, columns.length);
        this.tokenizers = tokenizer == null
                ? new Tokenizer[0]
                : Arrays.copyOf(tokenizer, tokenizer.length);
        this.vocabularies = new ArrayList<Set<String>>(vocabularies.length);
        for (int i = 0; i < vocabularies.length; ++i) {
            Set<String> vocabulary = new HashSet<String>();
            Collections.addAll(vocabulary, vocabularies[i]);
            this.vocabularies.add(vocabulary);
        }
    }

    /**
     * @return a copy of the configured column names
     */
    public String[] getColumns() {
        return Arrays.copyOf(this.columns, this.columns.length);
    }

    /**
     * @return a copy of the tokenizer array (the tokenizer instances themselves are shared)
     */
    public Tokenizer[] getTokenizers() {
        return Arrays.copyOf(this.tokenizers, this.tokenizers.length);
    }

    /**
     * @return a new list holding one unmodifiable view per column vocabulary, in column order
     */
    public List<Set<String>> getVocabularies() {
        List<Set<String>> result = new ArrayList<Set<String>>(this.vocabularies.size());
        for (Set<String> vocabulary : this.vocabularies) {
            result.add(Collections.unmodifiableSet(vocabulary));
        }
        return result;
    }

    /** Builds the output feature key for a token of a column. */
    private static String name(String col, String token) {
        return "countvec:" + col + ":" + token;
    }

    /**
     * Adds the count features of every configured column to the observation.
     *
     * <p>A missing ({@code null}) value is tokenized as the empty string. If a value is present
     * but is not a {@link String}, an error is set on the observation and processing stops
     * immediately; features already written for earlier columns are left in place.
     *
     * @param data observation to read the text columns from and to write the features into
     */
    @Override
    public void process(RawObservation data) {
        for (int i = 0; i < this.columns.length; ++i) {
            String col = this.columns[i];
            Object raw = data.get(col);
            String text;
            if (raw == null) {
                text = "";
            } else if (raw instanceof String) {
                text = (String)raw;
            } else {
                data.setError("Text expected but " + String.valueOf(raw.getClass()) + " found.");
                return;
            }
            // Looked up once per column instead of once per token.
            Set<String> vocabulary = this.vocabularies.get(i);
            Map<String, Double> counts = this.tokenizers[i].tokenCounts(text);
            for (Map.Entry<String, Double> e : counts.entrySet()) {
                // Tokens outside the column's vocabulary produce no feature.
                if (!vocabulary.contains(e.getKey())) continue;
                data.put(CountVectorizer.name(col, e.getKey()), e.getValue());
            }
        }
    }

    /**
     * @return {@code CountVectorizer(<col1> ; <col2> ; ...)} listing the configured columns
     */
    @Override
    public String toString() {
        StringBuilder s = new StringBuilder().append("CountVectorizer(");
        for (int i = 0; i < this.columns.length; ++i) {
            if (i > 0) {
                s.append(" ; ");
            }
            s.append(this.columns[i]);
        }
        return s.append(")").toString();
    }
}

