# Do not forget to update notebook/impact_coding.py when modifying this file (or
# categorical.py), to make sure that the notebook export class produces the same
# results as this one

import logging
import re
import pandas as pd
import numpy as np

from dataiku.core import doctor_constants
from dataiku.doctor.utils import get_rescaling_params
from dataiku.doctor.preprocessing.features_encoding.categorical import CategoricalEncoderBase
from dataiku.doctor.preprocessing.features_encoding.categorical import WrappingCategoricalEncoder

# Module-level logger registered under doctor_constants.PREPROCESSING_LOGGER_NAME.
# NOTE: its level is forced to DEBUG as a side effect of importing this module.
preproc_logger = logging.getLogger(doctor_constants.PREPROCESSING_LOGGER_NAME)
preproc_logger.setLevel(logging.DEBUG)


class TargetEncoding(CategoricalEncoderBase):
    """
    Base class for encoders that map each category of a feature to a value
    computed from the target column.

    Subclasses provide the actual computation in ``_target_encode``. ``fit``
    stores the result in ``encoding_map`` together with an extra entry, keyed
    by ``CategoricalEncoderBase.DEFAULT_VALUE``, holding the target mean, and
    then rescales the map according to ``rescaling_method``.
    """
    __slots__ = ('rescaling_method', )

    def __init__(self, rescaling_method=doctor_constants.AVGSTD):
        """
        :param rescaling_method: One of doctor_constants.NONE, doctor_constants.AVGSTD or
        doctor_constants.MINMAX. Any other value makes ``fit`` raise a ValueError.
        """
        super(TargetEncoding, self).__init__()
        self.rescaling_method = rescaling_method

    def _rescale(self, series):
        """
        Rescale ``self.encoding_map`` in place, using parameters computed on the encoded
        training feature.
        :param series: The categorical feature column used for fitting
        :type series: pd.Series
        :raises ValueError: if the rescaling method is not NONE, AVGSTD or MINMAX
        """
        if self.rescaling_method == doctor_constants.NONE:
            return
        if self.rescaling_method not in {doctor_constants.AVGSTD, doctor_constants.MINMAX}:
            # Interpolate the offending value into the message (wrapped in a tuple so that
            # a tuple-valued method cannot break the formatting itself).
            raise ValueError("Unknown rescaling method %s" % (self.rescaling_method,))

        # The rescaling parameters are derived from the encoded training data (first
        # element of the transform output), then applied to the map itself.
        transformed_series = self.transform(series)[0]
        shift, scale = get_rescaling_params(self.rescaling_method, transformed_series)
        self.encoding_map = (self.encoding_map - shift) * scale

    def _target_encode(self, series, target_series, target_mean):
        """
        Compute the target encoding of the feature
        :param series: The categorical feature column
        :type series: pd.Series
        :param target_series: The target column
        :type target_series: pd.Series
        :param target_mean: Mean of the target column
        :type target_mean: float
        :return: The target encoding (a series using the categories as the indices and their respective encodings as the
        values)
        :rtype: pd.Series
        """
        raise NotImplementedError()

    def fit(self, series, target_series):
        """
        Learn the encoding map of the feature from the target.
        :param series: The categorical feature column
        :type series: pd.Series
        :param target_series: The target column
        :type target_series: pd.Series
        """
        # Save the counts for the reportable map
        self.category_counts = series.value_counts()
        self.category_counts.name = "counts"

        target_mean = target_series.mean()
        target_encoded_values = self._target_encode(series, target_series, target_mean)

        # add default value: it is encoded with the mean of the target
        self.encoding_map = pd.DataFrame(pd.concat([
            target_encoded_values,
            pd.Series([target_mean], index=[CategoricalEncoderBase.DEFAULT_VALUE]),
        ], axis=0))

        # perform rescaling on the encoding map - if needed
        self._rescale(series)

    def fit_transform(self, X, target):
        """
        Fit the encoder on ``X`` and ``target``, then return ``self.transform(X)``.
        """
        self.fit(X, target)
        return self.transform(X)


class ImpactCoding(TargetEncoding):
    """ ImpactCoding is an alternative way to cope with
    categorical values in a regression or in a classification project.

    The base idea is to replace categorical values by their overall observed
    impact on the target value.

    For instance, let's consider a dataset with 5000 persons. We
    aim at predicting their height. Their home country is a feature
    of the dataset, but it can take as many as 300 different values.

    Impact coding consists of replacing the country information by the
    average height of the people in their home country.
    (Note that it may not be a good idea if for instance the
    ratio of men and women is different in these countries.)

    Because some countries may be underrepresented, we prefer to use
    a more robust estimate of the average. Here we simply use additive
    smoothing.
    ie, if a category is represented n times, we compute lambda = n/(n+m)
    and instead of CAT_AVG, we use lambda*CAT_AVG + (1-lambda) * TARGET_AVG
    (so when a category has very low cardinality like 2 or 3, most of its actual
    value is smoothed by the global average)
    """

    __slots__ = ('m', )

    def __init__(self, m=10, rescaling_method=doctor_constants.AVGSTD):
        """
        :param m: Smoothing term of the additive smoothing (lambda = n/(n+m))
        :param rescaling_method: Rescaling method, forwarded to TargetEncoding
        """
        super(ImpactCoding, self).__init__(rescaling_method)
        self.m = m

    def _target_encode(self, series, target_series, target_mean):
        """
        Compute the smoothed per-category mean of the target.

        Relies on ``self.category_counts``, which ``fit`` sets before calling this method.
        :param series: The categorical feature column
        :type series: pd.Series
        :param target_series: The target column
        :type target_series: pd.Series
        :param target_mean: Mean of the target column
        :type target_mean: float
        :return: A series indexed by category, holding lambda*CAT_AVG + (1-lambda)*TARGET_AVG
        :rtype: pd.Series
        """
        # Group the target directly by the feature values. Going through a temporary
        # DataFrame keyed by the feature name breaks when the feature is itself
        # named "target" (the two columns collide and only the target survives).
        category_means = target_series.groupby(series).mean()
        lambda_weights = self.category_counts.astype("float") / (self.category_counts + self.m)
        return lambda_weights * category_means + (1 - lambda_weights) * target_mean


class GLMMEncoding(TargetEncoding):

    """ GLMMEncoding is, like ImpactCoding, an alternative way to cope with
    categorical values in a regression or in a classification project.

    The idea is to train a linear (mixed) model (more specifically a generalized linear mixed model)
    to approximate either the target variable (for regressions) or the log-odds of its probability (for classifications)
    by a linear combination of the one-hot-encoded values of a categorical variable.

    For instance, let's consider the same dataset, with the goal of predicting the height of individuals.
    The home country of each individual is a feature of the dataset, which can take as many as 300 different values.
    We model the height of individuals as:
     a constant + a specific term solely based on their country + some residual noise
    For each country, this specific term will be its numerical encoding.
    """
    def __init__(self, is_regression, rescaling_method=doctor_constants.AVGSTD):
        """
        :param is_regression: When true, a MixedLM is fitted on the target; otherwise a
        BinomialBayesMixedGLM is fitted
        :param rescaling_method: Rescaling method, forwarded to TargetEncoding
        """
        super(GLMMEncoding, self).__init__(rescaling_method)
        self.is_regression = is_regression

    def _target_encode(self, series, target_series, target_mean):
        """
        Compute the GLMM encoding of the feature
        :param series: The categorical feature column
        :type series: pd.Series
        :param target_series: The target column
        :type target_series: pd.Series
        :param target_mean: Mean of the target column (not used by this encoder)
        :type target_mean: float
        :return: A series indexed by category, holding the per-category term estimated by the model
        :rtype: pd.Series
        """
        # The models are fitted on ordinal codes rather than on the raw category values;
        # the inverse map is used to translate the results back to the categories.
        uniques = series.dropna().unique()
        ordinal_map = {val: i for i, val in enumerate(uniques)}
        inverse_ordinal_map = {i: val for i, val in enumerate(uniques)}
        series = series.map(ordinal_map)
        # Rows with a missing target or a missing feature value are left out of the fit
        data = pd.DataFrame({"target": target_series, "feature": series})[~(target_series.isna()|series.isna())]

        # compute the GLMM encoding
        # based on https://contrib.scikit-learn.org/category_encoders/_modules/category_encoders/glmm.html
        if self.is_regression:
            from statsmodels.regression.mixed_linear_model import MixedLM
            model = MixedLM(data['target'], np.ones(data.shape[0]), groups=data['feature']).fit()
            # Each random effect is a label-indexed container: take its first element by
            # position explicitly. Integer keys on a label-indexed pd.Series are deprecated
            # as positional access in recent pandas versions.
            return pd.Series({
                inverse_ordinal_map[ordinal_key]: np.asarray(value)[0]
                for ordinal_key, value in model.random_effects.items()
            })
        else:
            from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM
            # This formula states that
            # - target ~ 1: the target is first modeled as a constant
            # - '0 + C(feature)': the noise part of the mixture has no bias term and is composed of the one-hot-encoded
            #   values of the categorical feature
            # In other words: target = constant + sum_c(coeff_c * delta_c) where delta_c stands for the one-hot-encoded value c of the feature
            #
            # statsmodels >= 0.14 requires numeric variables
            # see https://github.com/statsmodels/statsmodels/blob/2571862da163840bf23d106c9bfdd231bbce764f/statsmodels/base/model.py#L207-L212
            data.target = data.target.replace({True: 1, False: 0})
            model = BinomialBayesMixedGLM.from_formula('target ~ 1', {'a': '0 + C(feature)'}, data).fit_vb()
            # The names of the variables are stored as "C(feature)[i]" for each ordinal encoding i
            # (i may be rendered as a float, hence the int(float(...)) conversion)
            index_names = [
                inverse_ordinal_map[int(float(re.sub(r'C\(feature\)\[(\S+)\]', r'\1', index_name)))]
                for index_name in model.model.vc_names
            ]
            return pd.Series(model.vc_mean, index=index_names)


class ClassificationImpactEncoder(WrappingCategoricalEncoder):
    """
    Wraps a categorical target encoder for a classification task.

    For N target classes, the target series is converted into N-1 target series (one
    per class except the last one in order of first appearance). The target series for
    a given class has value 1 for the class and 0 for the others (one-vs-all).

    The encoding computation is performed on each of these series, and the resulting
    encoding map has N-1 columns each one corresponding to the encoded target class.
    """

    __slots__ = ('inverse_target_map',)

    def __init__(self, encoder, target_map=None):
        """
        :param encoder: The wrapped encoder. Its ``fit`` and ``fit_transform`` are called with
        (feature series, binary target series, column label).
        :param target_map: Mapping from the classes to the integers found in the target series.
        When omitted, the values of the target series are used directly as column labels.
        :type target_map: dict or None
        """
        super(ClassificationImpactEncoder, self).__init__(encoder)
        # The preprocessing handler creates a target_map, that maps the classes to integers.
        # The column labels that we want for target encoding are the classes, not the mapped
        # integers, hence the need for an inverse_target_map that map the integers to the classes.
        if target_map is None:
            # The declared default must not crash on `None.items()`
            target_map = {}
        self.inverse_target_map = {v: k for k, v in target_map.items()}

    def _iter_binary_targets(self, target_series):
        """
        Lazily yield one (binary target series, column label) pair per encoded class,
        i.e. for every distinct target value except the last one encountered.
        """
        target_values = target_series.unique()[:-1]
        target_df = pd.get_dummies(target_series)
        for target_val in target_values:
            if self.inverse_target_map:
                label = self.inverse_target_map[target_val]
            else:
                # No target_map was provided: label the column with the raw target value
                label = target_val
            yield target_df[target_val], label

    def fit(self, series, target_series):
        """
        Fit the wrapped encoder once per encoded class.
        :param series: The categorical feature column
        :type series: pd.Series
        :param target_series: The target column
        :type target_series: pd.Series
        """
        for binary_target, label in self._iter_binary_targets(target_series):
            self.encoder.fit(series, binary_target, label)

        # Same category_counts and encoding map as underlying encoder
        # (kept consistent with fit_transform, which exposes both as well)
        self.encoding_map = self.encoder.encoding_map
        self.category_counts = self.encoder.category_counts

    def fit_transform(self, series, target_series):
        """
        Fit the wrapped encoder once per encoded class and return the encoded feature.
        :param series: The categorical feature column
        :type series: pd.Series
        :param target_series: The target column
        :type target_series: pd.Series
        :return: The column-wise concatenation of the encodings obtained for each encoded class
        :rtype: pd.DataFrame
        """
        impact_series_list = [
            self.encoder.fit_transform(series, binary_target, label)
            for binary_target, label in self._iter_binary_targets(target_series)
        ]

        # Same category_counts and encoding map as underlying encoder
        self.encoding_map = self.encoder.encoding_map
        self.category_counts = self.encoder.category_counts

        impact_df = pd.concat(impact_series_list, axis=1)
        return impact_df
