""" Preprocessing takes a dataframe as an input,
and returns a dataframe as an output.

At the end of the pipeline, the matrix underlying the dataframe
should be ready to use for scikit-learn's ML algorithm.
"""
import json
import logging
import sys
from enum import Enum
from numbers import Number

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
from six.moves import xrange
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from dataiku.base.utils import RaiseWithTraceback
from dataiku.base.utils import encode_utf8
from dataiku.base.utils import safe_convert_to_string
from dataiku.base.utils import safe_exception
from dataiku.base.utils import safe_unicode_str
from dataiku.core import dkujson as dkujson
from dataiku.core import doctor_constants
from dataiku.core.doctor_constants import PROBA_COLUMNS
from dataiku.doctor import utils
from dataiku.doctor.causal.utils.misc import TreatmentMap
from dataiku.doctor.diagnostics.clustering_parameters import check_outliers_parameters
from dataiku.doctor.multiframe import DataFrameWrapper
from dataiku.doctor.multiframe import DropRowReason
from dataiku.doctor.multiframe import MultiFrame
from dataiku.doctor.multiframe import NamedNPArray
from dataiku.doctor.multiframe import SparseMatrixWithNames
from dataiku.doctor.preprocessing.assertions import MLAssertion
from dataiku.doctor.preprocessing.assertions import MLAssertions
from dataiku.doctor.preprocessing.features_encoding.categorical import CategoricalKFoldEncoder
from dataiku.doctor.preprocessing.features_encoding.categorical import CategoricalSimpleEncoder
from dataiku.doctor.preprocessing.features_encoding.impact_coding import ClassificationImpactEncoder
from dataiku.doctor.preprocessing.features_encoding.impact_coding import GLMMEncoding
from dataiku.doctor.preprocessing.features_encoding.impact_coding import ImpactCoding
from dataiku.doctor.preprocessing.generated_features_mapping import GeneratedFeaturesMapping
from dataiku.doctor.utils.skcompat import get_feature_names
from dataiku.doctor.utils.skcompat import get_n_stop_words_and_remove
from dataiku.doctor.utils.skcompat import get_kmeans_estimator

# Module-level logger used by the preprocessing steps defined in this file.
preproc_logger = logging.getLogger(doctor_constants.PREPROCESSING_LOGGER_NAME)
# DEBUG level is enabled because the steps below report shapes and dropped-row counts at that level.
preproc_logger.setLevel(logging.DEBUG)


def append_sparse_with_prefix(current_mf, prefix, input_columns_names, matrix, generated_features_mapping):
    """
    Append a sparse matrix to the multiframe and register the features it generates.

    The block name is the prefix followed by the input column names joined with ":".

    :param current_mf: object exposing append_sparse(block_name, matrix)
    :param prefix: string put in front of the joined input column names
    :param input_columns_names: names of the input columns the matrix was built from
    :param matrix: sparse matrix exposing a `names` attribute (names of its generated features)
    :param generated_features_mapping: mapping receiving the block's features and its input columns
    """
    joined_inputs = ":".join(input_columns_names)
    block_key = prefix + joined_inputs

    # Register the mapping first, then store the matrix itself
    generated_features_mapping.add_features_to_block(matrix.names, block_key)
    generated_features_mapping.add_whole_block_mapping(block_key, input_columns_names)
    current_mf.append_sparse(block_key, matrix)


def add_column_to_builder(builder, new_column, input_columns, series, generated_features_mapping):
    """
    Add a column to a dataframe builder and record which input columns it comes from.

    :param builder: object exposing add_column(name, series) and a `prefix` attribute
    :param new_column: name of the column added to the builder
    :param input_columns: names of the input columns the new column derives from
    :param series: values of the new column
    :param generated_features_mapping: mapping receiving (prefix, input_columns, "<prefix>:<new_column>")
    """
    builder.add_column(new_column, series)

    block_prefix = builder.prefix
    qualified_name = block_prefix + ":" + new_column
    generated_features_mapping.add_per_column_mapping(block_prefix, input_columns, qualified_name)


class Step(object):
    """
    Base class of the preprocessing steps.

    Steps are chained in a pipeline, so exposing a "fit" or "partial_fit" on them
    makes no sense. Whatever has to be "fitted" while being handled in stream is
    managed by the preprocessing collector.
    """

    def __init__(self, output_name=None):
        self.output_name = output_name

    def init_resources(self, resources_handler):
        """Hook called with the resources handler; the base implementation does nothing."""
        return None

    def report_fit(self, ret_obj, core_params):
        """Hook for reporting what was fitted; no report by default."""
        return None

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Default implementation: there is nothing to fit, so this simply delegates to process()."""
        processed = self.process(input_df, current_mf, output_ppr, generated_features_mapping)
        return processed

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """
        :param input_df: Pandas DataFrame, the main input of the current preprocessing Step
        :param current_mf: MultiFrame storing the current state of the preprocessed data, and importantly the reference index
        :param output_ppr: dict of preprocessing pipeline result outputs stored into values as either Pandas Series,
                           Pandas DataFrame or DSS MultiFrame.
                           Keys belong to the set {"target", "weight", "treatment", "assertions",  "UNPROCESSED",
                           "TRAIN", "TRAIN_PREPCA", "PROFILING"}
                           Note:
                            * output_ppr["TRAIN"] is the MultiFrame input of ML algorithm training features (X)
                            * output_ppr["target"] is the Series of ML prediction algorithm training target (y)
                            * output_ppr["weight"] is the Series of ML prediction algorithm sample weights
                            * output_ppr["treatment"] is the Series of causal ML prediction algorithm treatment
        :param generated_features_mapping: see GeneratedFeaturesMapping class
        :return: either a MultiFrame or None
        :raises NotImplementedError: always, in this base class; subclasses must override
        """
        raise NotImplementedError()

    @staticmethod
    def drop_rows(idx, current_mf, input_df, reason, column_name=None):
        """
        Drop the rows flagged by `idx` from both the MultiFrame and the input dataframe (in place).

        :param idx: row mask, forwarded to current_mf.drop_rows and to utils.series_nonzero
        :param current_mf: MultiFrame to drop the rows from
        :param input_df: Pandas DataFrame to drop the rows from, modified in place
        :param reason: reason of the drop, forwarded to current_mf.drop_rows
        :param column_name: optional column name, forwarded to current_mf.drop_rows
        """
        # Always execute both actions together
        current_mf.drop_rows(idx, reason, column_name)
        labels_to_drop = input_df.index[utils.series_nonzero(idx)]
        input_df.drop(labels_to_drop, inplace=True)

    def __str__(self):
        return "Step:%s" % self.__class__.__name__


class ExtractMLAssertionMasksNbInitialRows(Step):
    """
    Build the MLAssertions of the pipeline results from the assertion mask columns of input_df.

    Each MLAssertion is created with its params and the sum of its mask column. The result is
    stored in output_ppr["assertions"] only when at least one assertion could be built.
    """

    def __init__(self, assertions, output_name=None):
        super(ExtractMLAssertionMasksNbInitialRows, self).__init__(output_name)
        self.assertions = assertions

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        if self.assertions is None:
            return

        collected = MLAssertions()
        available_columns = input_df.columns
        for params in self.assertions:
            mask_column = MLAssertion.assertion_col_name(params)
            if mask_column in available_columns:
                mask = input_df[mask_column]
                collected.add_assertion(MLAssertion(params, np.sum(mask)))
            else:
                # Should only happen for train set, for which we do not compute assertions
                preproc_logger.debug(u"assertion column for assertion {} not found, "
                                     u"skipping computation".format(safe_unicode_str(params["name"])))

        if len(collected) > 0:
            output_ppr["assertions"] = collected


class ExtractMLAssertionMasks(Step):
    """
    Attach to each assertion stored in output_ppr["assertions"] its mask column taken from input_df.
    """

    def __init__(self, assertions, output_name=None):
        super(ExtractMLAssertionMasks, self).__init__(output_name)
        self.assertions = assertions

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        if self.assertions is None:
            return

        for position, params in enumerate(self.assertions):
            mask_column = MLAssertion.assertion_col_name(params)
            if mask_column in input_df.columns:
                # NOTE(review): `position` is the index in self.assertions; this presumes that
                # output_ppr["assertions"].assertions is ordered like self.assertions - to confirm
                output_ppr["assertions"].assertions[position].mask = input_df[mask_column]
                continue
            # Should only happen for train set, for which we do not compute assertions
            preproc_logger.debug(u"assertion column for assertion {} not found, "
                                 u"skipping computation".format(safe_unicode_str(params["name"])))


class SingleColumnDropNARows(Step):
    """ Drop the rows of input_df (and of the MultiFrame) whose value in a given column is null"""

    def __init__(self, column_name):
        self.column_name = column_name

    def __str__(self):
        return "Step:%s (%s)" % (self.__class__.__name__, self.column_name)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        null_mask = input_df[self.column_name].isnull()
        preproc_logger.debug("Deleting %s rows" % null_mask.sum())
        Step.drop_rows(null_mask, current_mf, input_df, DropRowReason.NULL_COLUMN_VALUE, self.column_name)
        preproc_logger.info("After SCDNA input_df=%s" % str(input_df.shape))

    def init_resources(self, resources_handler):
        """Register the column in the "columns" list of the "drop_rows" json resource."""
        super(SingleColumnDropNARows, self).init_resources(resources_handler)
        drop_resource = resources_handler.get_resource("drop_rows", "json")
        # First step to register: create the list
        if "columns" not in drop_resource:
            drop_resource["columns"] = []
        drop_resource["columns"].append(self.column_name)


class SpecialNumericOutputsDropRows(Step):
    """
    Drop rows for which at least one of a selection of special columns (present in output_ppr) is inf / na.
    The columns need to be of a numeric dtype, else np.isinf fails.
    Note that in the current implementation, only:
       - the current MultiFrame (and input_df, through Step.drop_rows)
       - outputs in the preprocessing pipeline results (output_ppr) explicitly a member of the output_ppr_keys list
       will have the relevant rows dropped.
    """

    def __init__(self, output_ppr_keys, output_name=None, allow_empty_mf=False):
        """
        :param list output_ppr_keys: list of potential keys of output_ppr dict (e.g. "target", "weight", "treatment", "prediction")
        :param str output_name: forwarded to Step.__init__
        :param boolean allow_empty_mf: if False, the Step fails whenever it results in an empty MultiFrame.
                                       If True, an empty MultiFrame is accepted, but only when it was already
                                       empty before dropping the rows.
        """
        super(SpecialNumericOutputsDropRows, self).__init__(output_name)
        self.allow_empty_mf = allow_empty_mf
        self.output_ppr_keys = output_ppr_keys

    @staticmethod
    def _null_or_inf_mask(series):
        """Return the boolean mask of the values of `series` that are null or infinite."""
        return series.isnull() | np.isinf(series)

    def _get_drop_idx(self, outputs):
        """
        Compute the mask of the rows to drop.

        :param dict outputs: values are either Pandas Series or Pandas DataFrame
        :return: boolean mask, True where any series / dataframe column is null or infinite
                 (None when `outputs` is empty)
        :raises Exception: if a value is neither a Series nor a DataFrame
        """
        idx = None
        for output_value in outputs.values():
            if isinstance(output_value, pd.DataFrame):
                all_series = [series for _, series in output_value.items()]
            elif isinstance(output_value, pd.Series):
                all_series = [output_value]
            else:
                raise Exception("Unexpected col type")

            for series in all_series:
                mask = self._null_or_inf_mask(series)
                idx = mask if idx is None else idx | mask
        return idx

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        for output_key in self.output_ppr_keys:
            assert(output_key in output_ppr)
        outputs = {n: output_ppr[n] for n in self.output_ppr_keys}
        idx = self._get_drop_idx(outputs)

        preproc_logger.debug("Deleting {} rows because one of {} is missing or infinity".format(idx.sum(), self.output_ppr_keys))

        preproc_logger.debug("MF before = {}".format(str(current_mf.shape())))
        for output_key, output_value in outputs.items():
            preproc_logger.debug("{} before = {}".format(output_key, str(output_value.shape)))

        # Drop rows of the preprocessing pipeline results
        for output_key in self.output_ppr_keys:
            cur_output = outputs[output_key]
            output_ppr[output_key] = cur_output.loc[cur_output.index[~idx]]

        nb_rows_before = current_mf.shape()[0]

        # Drop rows of the MultiFrame
        Step.drop_rows(idx, current_mf, input_df, DropRowReason.NULL_INF_TARGET)
        preproc_logger.debug("After DRWNT input_df=%s" % str(input_df.shape))

        nb_rows_after = current_mf.shape()[0]
        # We may want to allow empty multiframe for KERAS backend. When you only have "Special" features,
        # they are only created at process time, so for the first fit_and_process, the current_mf will be
        # empty. Also used for subpopulation computation.
        if nb_rows_after == 0 and ((not self.allow_empty_mf) or (nb_rows_before > 0)):
            error_message = "{} values all empty, infinity or with unknown classes (you may need to recompute the training set)"
            raise DkuDroppedMultiframeException(error_message.format(self.output_ppr_keys))

        preproc_logger.debug("MF after = %s" % str(current_mf.shape()))
        # Read the shapes back from output_ppr: `outputs` still references the objects as they were
        # before the drop, so logging from it would report the "before" shapes again.
        for output_key in self.output_ppr_keys:
            preproc_logger.debug("{} after = {}".format(output_key, str(output_ppr[output_key].shape)))


class DropRowsWhereNoTarget(SpecialNumericOutputsDropRows):
    """Row-dropping step watching the "target" output only: rows whose target is inf / na are dropped
    (probably because it was an unknown class)"""

    def __init__(self, output_name=None, allow_empty_mf=False):
        watched_outputs = ["target"]
        super(DropRowsWhereNoTarget, self).__init__(watched_outputs, output_name, allow_empty_mf)


class DropRowsWhereNoTargetOrNoTreatment(SpecialNumericOutputsDropRows):
    """Row-dropping step watching the "target" and "treatment" outputs: rows for which either of
    them is inf / na are dropped"""

    def __init__(self, output_name=None, allow_empty_mf=False):
        watched_outputs = ["target", "treatment"]
        super(DropRowsWhereNoTargetOrNoTreatment, self).__init__(watched_outputs, output_name, allow_empty_mf)


class DropRowsWhereNoTargetOrNoPrediction(SpecialNumericOutputsDropRows):
    """Row-dropping step watching the "target" and "prediction" outputs (plus the probability columns
    when has_probas is set): rows for which any of them is inf / na are dropped
    (probably because it was an unknown class)"""

    def __init__(self, output_name=None, allow_empty_mf=False,  has_probas=False):
        watched_outputs = ["target", "prediction"]
        if has_probas:
            watched_outputs.append(doctor_constants.PROBA_COLUMNS)
        super(DropRowsWhereNoTargetOrNoPrediction, self).__init__(watched_outputs, output_name, allow_empty_mf)


class DropRowsWhereNoTargetOrNoWeight(SpecialNumericOutputsDropRows):
    """Row-dropping step watching the "target" and "weight" outputs: rows for which either of them
    is inf / na are dropped (probably because it was an unknown class)"""

    def __init__(self, output_name=None, allow_empty_mf=False):
        watched_outputs = ["target", "weight"]
        super(DropRowsWhereNoTargetOrNoWeight, self).__init__(watched_outputs, output_name, allow_empty_mf)


class DropRowsWhereNoTargetOrNoWeightOrNoPrediction(SpecialNumericOutputsDropRows):
    """Row-dropping step watching the "target", "weight" and "prediction" outputs (plus the probability
    columns when has_probas is set): rows for which any of them is inf / na are dropped
    (probably because it was an unknown class)"""

    def __init__(self, output_name=None, allow_empty_mf=False, has_probas=False):
        watched_outputs = ["target", "weight", "prediction"]
        if has_probas:
            watched_outputs.append(doctor_constants.PROBA_COLUMNS)
        super(DropRowsWhereNoTargetOrNoWeightOrNoPrediction, self).__init__(
            watched_outputs, output_name, allow_empty_mf)


class DropRowsWhereNoTargetOrNoTreatmentOrNoPrediction(SpecialNumericOutputsDropRows):
    """Row-dropping step watching the "target", "treatment" and "prediction" outputs (plus the probability
    columns when has_probas is set): rows for which any of them is inf / na are dropped"""

    def __init__(self, output_name=None, allow_empty_mf=False, has_probas=False):
        watched_outputs = ["target", "treatment", "prediction"]
        if has_probas:
            watched_outputs.append(doctor_constants.PROBA_COLUMNS)
        super(DropRowsWhereNoTargetOrNoTreatmentOrNoPrediction, self).__init__(
            watched_outputs, output_name, allow_empty_mf)


class ExtractColumn(Step):
    """Extracts a single column from the current multiframe and puts it as a Series
    in result.

    The column is removed from the first dataframe of the multiframe that contains it.
    """

    __slots__ = ('column_name', 'output_name')

    def __init__(self, column_name, output_name):
        """
        :param column_name: name of the column to look for in the dataframes of the multiframe
        :param output_name: key of output_ppr under which the extracted column is stored
        """
        self.column_name = column_name
        self.output_name = output_name

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """
        :raises Exception: if no dataframe of the multiframe contains the column
        """
        for (name, df) in current_mf.iter_dataframes():
            if self.column_name in df:
                # Index the dataframe to get the column (it used to be *called*, `df([...])`,
                # which fails on an object supporting `in` / `del df[...]` such as a DataFrame)
                output_ppr[self.output_name] = df[self.column_name]
                del df[self.column_name]
                return
        raise Exception("Unknown column %s" % self.column_name)


class FlagMissingValue2(Step):
    """
    Add a "<feature>:not_missing" column to the output block builder, holding 1.0 where the
    feature is not null in input_df and 0.0 where it is.
    """

    def __init__(self, feature, output_block_name):
        self.feature = feature
        self.output_block_name = output_block_name

    def _output_name(self):
        """Fully qualified name of the generated column: "<block>:<feature>:not_missing"."""
        return self.output_block_name + ":" + self.feature + ":not_missing"

    def init_resources(self, resources_handler):
        """Register the feature and its output name in the "flagged" json resource."""
        flagged = resources_handler.get_resource("flagged", "json")
        # Both lists are created together the first time a step registers itself
        if "columns" not in flagged:
            flagged["columns"] = []
            flagged["output_names"] = []
        flagged["columns"].append(self.feature)
        flagged["output_names"].append(self._output_name())

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        flag_column = "%s:not_missing" % self.feature
        builder = current_mf.get_df_builder(self.output_block_name)
        not_missing = input_df[self.feature].notnull().astype(float)

        add_column_to_builder(builder, flag_column, [self.feature], not_missing, generated_features_mapping)



class FlushDFBuilder(Step):
    """Flush the dataframe builder of a given block of the MultiFrame, if the MultiFrame has one."""

    def __init__(self, block_name):
        self.block_name = block_name

    def __str__(self):
        return "Step:FlushDFBuilder(%s)" % self.block_name

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        # Nothing to flush when no builder exists for this block
        if not current_mf.has_df_builder(self.block_name):
            return
        current_mf.flush_df_builder(self.block_name)


class OutputRawColumns(Step):
    """Copy the selection `column_names` of the input df to an output key.
    Used for target.
    The selection is copied with `.copy()`, the pipeline result does not hold the input object itself"""

    def __init__(self, column_names, output_name):
        self.column_names = column_names
        self.output_name = output_name
        self.values_map = None

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        selection = input_df[self.column_names]
        output_ppr[self.output_name] = selection.copy()


class RemapValueToOutput(Step):
    """Copy a column of the input df to an output key as a series, remapping its values
    through `values_map` when a non-empty map is provided.
    Used for target.
    The column is copied with `.copy()` before being remapped"""

    __slots__ = ('values_map',)

    def __init__(self, column_name, output_name, values_map):
        self.column_name = column_name
        self.output_name = output_name
        if values_map is None:
            self.values_map = None
        elif sys.version_info > (3, 0):
            self.values_map = dict(values_map.items())
        else:
            # Python 2: keys are stored utf-8 encoded
            self.values_map = {
                key.encode("utf-8"): value
                for key, value in values_map.items()
            }

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        if self.column_name not in input_df.columns:
            raise ValueError("The input does not include the required column '{}'".format(self.column_name))
        remapped = input_df[self.column_name].copy()

        # values_map is either None or a dict: only a non-empty dict triggers the remapping
        if self.values_map:
            # Values are compared to the map keys through their string representation
            remapped = remapped.astype(str).map(self.values_map)
            nb_null = remapped.isnull().sum()
            if nb_null > 0:
                preproc_logger.warning("Found %s nulls in %s" % (nb_null, self.output_name))
        output_ppr[self.output_name] = remapped


class FetchRollingWindows(Step):
    """
    Precomputed rolling windows fetching (times series only)

    The window features are read from the "rolling_window:<feature name>" columns of input_df
    and gathered into the "rolling_window" block.
    """

    def __init__(self, windows_list, windows_partial_res_map):
        self.windows_list = windows_list
        self.windows_partial_res_map = windows_partial_res_map
        self.out_block = "rolling_window"

    def _feature_names(self, length, operation, column_name):
        """Names of the features generated by one operation on one column for one window length."""
        if operation != 'FREQUENCY':
            return ["{}:{}:{}".format(length, operation, column_name)]
        # FREQUENCY yields one feature per category of the column
        categories = self.windows_partial_res_map[column_name]["categories"]
        return ["{}:{}:{}:{}".format(length, operation, column_name, category) for category in categories]

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        if len(self.windows_list) == 0:
            return

        builder = current_mf.get_df_builder(self.out_block)
        for window in self.windows_list:
            length = window["length"]
            for column_name, operations in window["operations_map"].items():
                for operation in operations:
                    for feature_name in self._feature_names(length, operation, column_name):
                        # A feature already present in the builder is not added twice
                        if feature_name in builder.columns:
                            continue
                        precomputed = input_df["rolling_window:{}".format(feature_name)]
                        add_column_to_builder(builder, feature_name, [column_name], precomputed,
                                              generated_features_mapping)
        current_mf.flush_df_builder(self.out_block)


class RemapTreatmentToOutput(Step):
    """
    Encode the treatment column of input_df as a float series stored in output_ppr["treatment"]:
    the control value is mapped to 0 and the treated rows to strictly positive values.
    """

    def __init__(self, column_name, control_value, drop_missing_value, enable_multi_treatment=False, treatment_values=None):
        self.column_name = column_name
        self.output_name = "treatment"
        self.control_value = control_value
        self.drop_missing_value = drop_missing_value
        self.enable_multi_treatment = enable_multi_treatment
        self.treatment_values = treatment_values

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        if self.column_name not in input_df.columns:
            raise ValueError("The input does not include the required column '{}'".format(self.column_name))

        treatment = input_df[self.column_name].copy()
        if self.control_value == "" or not self.drop_missing_value:
            # Missing values in the treatment column are parsed by Pandas as NaN. When the control value or a valid
            # treatment value is the empty string (meaning missing value), we replace np.nan with "" so missing values
            # will be properly  converted to 0 (control value) or the relevant > 0 integer treatment code.
            treatment = treatment.replace(np.nan, "")

        if self.enable_multi_treatment and len(self.treatment_values) > 2:
            # Multi-valued treatment: remap missing to NaN (if drop_missing_value is True), control value to 0, all
            # valid treatments to > 0 integers
            treatment_map = TreatmentMap(self.control_value, self.treatment_values, self.drop_missing_value)
            encoder = treatment_map.mapping
        else:
            # Binary treatment: remap missing to NaN (if drop_missing_value is True), control value to 0, all else to 1 (considered as treated)
            control = self.control_value

            def is_treated(value):
                return value != control

            def is_treated_or_nan(value):
                # We leave the NaN values as NaN (no casting to 0/1), so that the rows will be dropped later (step DropRowsWhereNoTargetOrNoTreatment)
                # Note that if the control value is set to empty value (""), there is no longer np.nan in the dataset so the drop_missing_value arg
                # does not do anything
                return np.nan if pd.isnull(value) else value != control

            encoder = is_treated_or_nan if self.drop_missing_value else is_treated

        treatment = treatment.map(encoder).astype(float)
        nb_null = treatment.isnull().sum()
        if nb_null > 0:
            preproc_logger.warning("Found %s nulls in treatment" % nb_null)
        output_ppr[self.output_name] = treatment


class RealignTarget(Step):
    """Realign output_ppr["target"] on the index of the current MultiFrame."""

    @staticmethod
    def get_realigned_target(current_mf, output_ppr):
        """
        Build a realigned copy of the target series: the series stored in output_ppr is left untouched.
        :param current_mf: Current MultiFrame, whose index is the reference
        :param output_ppr: Dictionary containing the target series at the "target" key
        :return: copy of the target series, selected with `.loc` on the current_mf index
        """
        target_copy = output_ppr["target"].copy()
        realigned = target_copy.loc[current_mf.index]
        return realigned

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        preproc_logger.debug("Realign target series = %s" % str(output_ppr["target"].shape))
        realigned = self.get_realigned_target(current_mf, output_ppr)
        output_ppr["target"] = realigned
        preproc_logger.debug("After realign target: %s" % str(output_ppr["target"].shape))


class RealignWeight(Step):
    """Realign output_ppr["weight"] on the index of the current MultiFrame."""

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        weight = output_ppr["weight"]
        reference_index = current_mf.index
        preproc_logger.debug("Realign weight series = %s" % str(weight.shape))
        realigned = weight.loc[reference_index]
        preproc_logger.debug("After realign weight: %s" % str(realigned.shape))
        output_ppr["weight"] = realigned


class RealignTreatment(Step):
    """Realign output_ppr["treatment"] on the index of the current MultiFrame."""

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        treatment = output_ppr["treatment"]
        reference_index = current_mf.index
        preproc_logger.debug("Realign treatment series = %s" % str(treatment.shape))
        realigned = treatment.loc[reference_index]
        preproc_logger.debug("After realign treatment: %s" % str(realigned.shape))
        output_ppr["treatment"] = realigned


class RealignPrediction(Step):
    """Realign output_ppr["prediction"] (and the probabilities dataframe when has_probas is set)
    on the index of the current MultiFrame."""

    def __init__(self, has_probas):
        self.has_probas = has_probas

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        prediction = output_ppr["prediction"]
        reference_index = current_mf.index
        preproc_logger.debug("Realign prediction series = %s" % str(prediction.shape))
        realigned_prediction = prediction.loc[reference_index]
        preproc_logger.debug("After realign prediction: %s" % str(realigned_prediction.shape))
        output_ppr["prediction"] = realigned_prediction

        if not self.has_probas:
            return

        # Same realignment for the dataframe of probabilities
        probas = output_ppr[PROBA_COLUMNS]
        reference_index = current_mf.index
        preproc_logger.debug("Realign probas df = %s" % str(probas.shape))
        realigned_probas = probas.loc[reference_index]
        preproc_logger.debug("After realign probas: %s" % str(realigned_probas.shape))
        output_ppr[PROBA_COLUMNS] = realigned_probas


class CopyMultipleColumnsFromInput(Step):
    """Append a copy of a selection of columns of input_df to the MultiFrame, as a single block."""

    def __init__(self, columns, output_block_name):
        self.columns = columns
        self.output_block_name = output_block_name

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        selection = input_df[self.columns]
        current_mf.append_df(self.output_block_name, selection.copy())


class MultipleImputeMissingFromInput(Step):
    """Multi-column impute missing values.
    A sub-df is extracted from the input df and series are fillna-ed.

    The sub-df is added as a single output block
    """
    def __init__(self, impute_map, output_block_name, keep_output_block, as_categorical):
        """
        :param dict impute_map: column name -> value used to fill the missing values of that column
                                (None means the column is taken as is)
        :param output_block_name: name of the block appended to the MultiFrame
        :param keep_output_block: forwarded as third argument of MultiFrame.append_df
        :param as_categorical: if truthy, columns are cast to object before being filled, and
                               the imputation is registered as categorical in the resources
        """
        self.impute_map = impute_map
        self.output_block_name = output_block_name
        self.keep_output_block = keep_output_block
        self.as_categorical = as_categorical

    def init_resources(self, resources_handler):
        """Register the imputed columns and their values in the "imputed" json resource."""
        resource = resources_handler.get_resource("imputed", "json")
        # The four lists are created together the first time a step registers itself
        if "num_columns" not in resource:
            resource["num_columns"] = []
            resource["num_values"] = []
            resource["cat_columns"] = []
            resource["cat_values"] = []
        for (col, val) in self.impute_map.items():
            if self.as_categorical:
                resource["cat_columns"].append(col)
                resource["cat_values"].append(val)
            else:
                resource["num_columns"].append(col)
                resource["num_values"].append(val)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        if preproc_logger.isEnabledFor(logging.DEBUG):
            preproc_logger.debug("MIMIFI: Imputing with map %s" % self.impute_map)

        # Nothing to impute: no block is appended at all
        if not len(self.impute_map):
            return

        # TODO: Might be faster to fillna(inplace)

        out = {}
        for (col, val) in self.impute_map.items():
            series = input_df[col]
            if val is None:
                # No imputation value: the column goes to the output block unchanged
                out[col] = series
                continue
            if self.as_categorical:
                series = series.astype(object)
            out[col] = series.fillna(val)

        out_df = pd.DataFrame(out)
        current_mf.append_df(self.output_block_name, out_df, self.keep_output_block)

class NumericalNumericalInteraction(Step):
    """
    Interaction feature made of the product of two numerical columns, optionally rescaled as
    (product - shift) * inv_scale.

    The rescaling parameters are computed in fit_and_process and appended to the "num_num" json
    resource, from which init_resources reads them back.
    """

    def __init__(self, out_block, column_1, column_2, rescale):
        """
        :param out_block: name of the block whose builder receives the generated column
        :param column_1: name of the first numerical column
        :param column_2: name of the second numerical column
        :param rescale: whether the product has to be centered and scaled
        """
        super(NumericalNumericalInteraction, self).__init__()
        self.out_block = out_block
        self.column_1 = column_1
        self.column_2 = column_2
        self.rescale = rescale
        self.shift = None
        self.inv_scale = None
        # "num_num" json resource, set by init_resources
        self.json_data = None

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Compute shift / inv_scale from input_df, record them in the resource, then process."""
        if self.rescale:
            s = input_df[self.column_1] * input_df[self.column_2]
            self.shift = np.mean(s)
            std = np.std(s)
            # A constant product has a null standard deviation: dividing by it would yield an
            # infinite inv_scale, turning the whole feature into NaN ((s - shift) * inf == 0 * inf)
            # and storing a non-finite value in the resource. In that case (and when the standard
            # deviation is NaN) the product is only centered, not scaled.
            self.inv_scale = 1.0 / std if std > 0 else 1.0
        else:
            self.shift = 0.0
            self.inv_scale = 1.0

        # The lists are created together the first time an interaction is recorded
        if "column_1" not in self.json_data:
            self.json_data["column_1"] = []
            self.json_data["column_2"] = []
            self.json_data["rescale"] = []
            self.json_data["shift"] = []
            self.json_data["inv_scale"] = []
        self.json_data["column_1"].append(self.column_1)
        self.json_data["column_2"].append(self.column_2)
        self.json_data["rescale"].append(self.rescale)
        self.json_data["shift"].append(self.shift)
        self.json_data["inv_scale"].append(self.inv_scale)
        return self.process(input_df, current_mf, output_ppr, generated_features_mapping)

    def _output_name(self):
        """Name of the generated column: "<column_1>:<column_2>"."""
        return "%s:%s" % (self.column_1, self.column_2)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Add the (optionally rescaled) product of the two columns to the builder of the output block."""

        def make_series(n):
            # Prefer the column of the imputed numerical blocks of the MultiFrame, and fall back
            # on input_df when the column (or the block, signalled by a KeyError) is not available
            try:
                if n in current_mf.get_block(doctor_constants.NUM_IMPUTED_KEPT).df:
                    return current_mf.get_block(doctor_constants.NUM_IMPUTED_KEPT).df[n]
                if n in current_mf.get_block(doctor_constants.NUM_IMPUTED_NOT_KEPT).df:
                    return current_mf.get_block(doctor_constants.NUM_IMPUTED_NOT_KEPT).df[n]
            except KeyError:
                pass
            return input_df[n]

        s = make_series(self.column_1) * make_series(self.column_2)
        if self.rescale:
            s = (s - self.shift) * self.inv_scale
        builder = current_mf.get_df_builder(self.out_block)
        add_column_to_builder(builder, self._output_name(), [self.column_1, self.column_2], s, generated_features_mapping)

    def init_resources(self, resources_handler):
        """Load the "num_num" json resource and restore the parameters recorded for this pair of columns, if any."""
        super(NumericalNumericalInteraction, self).init_resources(resources_handler)
        self.json_data = resources_handler.get_resource("num_num", "json")
        if "column_1" in self.json_data:
            recorded_pairs = zip(self.json_data["column_1"], self.json_data["column_2"])
            for i, (c1, c2) in enumerate(recorded_pairs):
                # Only the first recorded entry matching both columns is used
                if c1 == self.column_1 and c2 == self.column_2:
                    self.rescale = self.json_data["rescale"][i]
                    self.shift = self.json_data["shift"][i]
                    self.inv_scale = self.json_data["inv_scale"][i]
                    break


class NumericalCategoricalInteraction(Step):
    """Interaction between a numerical and a categorical column.

    Emits a sparse block with one column per retained category value; a row
    holds the numerical value in the column of its own category.
    """

    def __init__(self, out_block, cat, num, max_features):
        super(NumericalCategoricalInteraction, self).__init__()
        self.out_block = out_block
        self.cat = cat
        self.num = num
        self.max_features = max_features
        # Both are filled by fit_and_process / init_resources
        self.values = None
        self.json_data = None

    def _make_series(self, current_mf, input_df, blocks, n):
        """Return column `n` from the first of `blocks` that contains it, else from `input_df`."""
        try:
            for block_name in blocks:
                block_df = current_mf.get_block(block_name).df
                if n in block_df:
                    return block_df[n]
        except KeyError:
            # Block lookup failed: use the raw input column instead
            pass
        return input_df[n]

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Select the category values to keep, record them in the resource, then process."""
        # The collector could provide the values, but it only computes those needed
        # for dummification, so some could be missing.
        cat_series = self._make_series(current_mf, input_df, ["CAT_IMPUTED"], self.cat).fillna("N/A")

        # np.unique yields the distinct values in sorted order; at most max_features are kept.
        # NOTE(review): an earlier comment assumed the values were ordered by decreasing
        # count - confirm which ordering is intended.
        kept_values = np.unique(cat_series)[:self.max_features]
        # JSON round-trip to normalize the string types
        self.values = dkujson.loads(dkujson.dumps(kept_values.tolist()))

        if "num" not in self.json_data:
            for key in ("num", "cat", "values"):
                self.json_data[key] = []
        self.json_data["num"].append(self.num)
        self.json_data["cat"].append(self.cat)
        self.json_data["values"].append(self.values)
        return self.process(input_df, current_mf, output_ppr, generated_features_mapping)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Append the sparse "interaction:" block built from the numerical and categorical columns."""
        cat_series = self._make_series(current_mf, input_df, ["CAT_IMPUTED"], self.cat).fillna("N/A")
        numerical_blocks = [doctor_constants.NUM_IMPUTED_KEPT, doctor_constants.NUM_IMPUTED_NOT_KEPT]
        num_series = self._make_series(current_mf, input_df, numerical_blocks, self.num)

        dummifier = FastSparseDummifyProcessor(None, self.cat, self.values, False)
        dummies = dummifier._create_matrix(cat_series)
        # Somewhat dirty: the last two columns (NA and Others) of the dummy matrix are dropped
        result = scipy.sparse.diags(num_series.values) * dummies.matrix[:, :-2]

        names = []
        for value in self.values:
            names.append("interaction:%s:%s:%s" % (self.num, self.cat, value))
        append_sparse_with_prefix(current_mf, "interaction:", [self.num, self.cat], SparseMatrixWithNames(result, names), generated_features_mapping)

    def init_resources(self, resources_handler):
        """Bind the "num_cat" JSON resource and restore the values stored for this (num, cat) pair."""
        super(NumericalCategoricalInteraction, self).init_resources(resources_handler)
        self.json_data = resources_handler.get_resource("num_cat", "json")
        if "num" not in self.json_data:
            return
        stored = zip(self.json_data["num"], self.json_data["cat"], self.json_data["values"])
        for stored_num, stored_cat, stored_values in stored:
            if stored_num == self.num and stored_cat == self.cat:
                self.values = stored_values
                break


class CategoricalCategoricalInteraction(Step):
    """Interaction between two categorical columns.

    Emits a sparse block with one indicator column per retained
    (column_1 value, column_2 value) pair.
    """

    def __init__(self, out_block, column_1, column_2, max_features):
        super(CategoricalCategoricalInteraction, self).__init__()
        self.out_block = out_block
        self.column_1 = column_1
        self.column_2 = column_2
        self.max_features = max_features
        # Both are filled by fit_and_process / init_resources
        self.values = None
        self.json_data = None

    def _make_series(self, current_mf, input_df, n):
        """Return column `n` with missing values replaced by "N/A".

        The column comes from the "CAT_IMPUTED" block when found there, else from `input_df`.
        """
        try:
            imputed_df = current_mf.get_block("CAT_IMPUTED").df
            if n in imputed_df:
                return imputed_df[n].fillna("N/A")
        except KeyError:
            pass
        return input_df[n].fillna("N/A")

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Append the sparse "interaction:" block for the retained value pairs."""
        first = self._make_series(current_mf, input_df, self.column_1)
        second = self._make_series(current_mf, input_df, self.column_2)
        # Both columns are merged into a single string column, dummified in one pass
        combined = first.str.cat(second, sep="__dku__")

        combined_levels = []
        names = []
        for (a, b) in self.values:
            combined_levels.append("%s__dku__%s" % (a, b))
            names.append("interaction:%s:%s:%s:%s" % (self.column_1, self.column_2, a, b))

        dummies = FastSparseDummifyProcessor(None, None, combined_levels, False)._create_matrix(combined)
        # Somewhat dirty: the last two columns (NA and Others) of the dummy matrix are dropped
        result = dummies.matrix[:, :-2]
        append_sparse_with_prefix(current_mf, "interaction:", [self.column_1, self.column_2], SparseMatrixWithNames(result, names), generated_features_mapping)

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Keep the `max_features` most frequent value pairs, record them, then process."""
        first = self._make_series(current_mf, input_df, self.column_1)
        second = self._make_series(current_mf, input_df, self.column_2)

        pair_counts = pd.DataFrame({"s1": first, "s2": second}).groupby(["s1", "s2"]).size()
        top_pairs = pair_counts.sort_values(ascending=False)[:self.max_features].index.values
        # JSON round-trip to normalize the string types
        self.values = dkujson.loads(dkujson.dumps(top_pairs.tolist()))

        if "column_1" not in self.json_data:
            for key in ("column_1", "column_2", "values"):
                self.json_data[key] = []
        self.json_data["column_1"].append(self.column_1)
        self.json_data["column_2"].append(self.column_2)
        self.json_data["values"].append([[a, b] for (a, b) in self.values])

        return self.process(input_df, current_mf, output_ppr, generated_features_mapping)

    def init_resources(self, resources_handler):
        """Bind the "cat_cat" JSON resource and restore the pairs stored for this column pair."""
        super(CategoricalCategoricalInteraction, self).init_resources(resources_handler)
        self.json_data = resources_handler.get_resource("cat_cat", "json")
        if "column_1" not in self.json_data:
            return
        stored = zip(self.json_data["column_1"], self.json_data["column_2"], self.json_data["values"])
        for stored_c1, stored_c2, stored_values in stored:
            if stored_c1 == self.column_1 and stored_c2 == self.column_2:
                self.values = stored_values
                break

class BlockStdRescalingProcessor(Step):
    """A avg/std rescaler that needs to be fit.

    Operates in place on every column of a whole DF block: each fitted column
    becomes (value - mean) * (1 / std).
    """

    def __init__(self, in_block):
        self.in_block = in_block

    def init_resources(self, mp):
        """Bind the per-block resource and the shared "rescalers" resource."""
        self.resource = mp.get_resource("block_std_rescaler", "json")
        if self.in_block not in self.resource:
            self.resource[self.in_block] = {"shifts": {}, "inv_scales": {}}
        self.r = self.resource[self.in_block]
        # because we currently use different systems for rescaling of normal columns and derivatives,
        # but we want a single resource for java exports, we duplicate the resource dumping here
        self.generic_resource = mp.get_resource("rescalers", "json")
        if "columns" not in self.generic_resource:
            self.generic_resource["columns"] = []
            self.generic_resource["shifts"] = []
            self.generic_resource["inv_scales"] = []

    def _fit(self, input_df, current_mf):
        """Compute and store the shift (mean) and inverse scale (1 / std) of each column of the block."""
        df = current_mf.get_block(self.in_block).df

        for col in df.columns:
            series = df[col]
            shift = series.mean()
            std = series.std()
            # No usable variance: emit a null series rather than dividing by zero.
            # A NaN std (e.g. a block with a single row) is handled the same way,
            # otherwise the NaN inverse scale would turn the whole column into NaN.
            if std == 0.0 or np.isnan(std):
                inv_scale = 0.0
            else:
                inv_scale = 1. / std

            self.r["shifts"][col] = shift
            self.r["inv_scales"][col] = inv_scale
            self.generic_resource["columns"].append(col)
            self.generic_resource["shifts"].append(shift)
            self.generic_resource["inv_scales"].append(inv_scale)

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Fit the rescaling parameters on the block, then rescale it."""
        self._fit(input_df, current_mf)
        self.process(input_df, current_mf, output_ppr, generated_features_mapping)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Rescale in place every column of the block that has fitted parameters."""
        df = current_mf.get_block(self.in_block).df

        for col in df.columns:
            # Columns without fitted parameters are left untouched
            if col in self.r["shifts"]:
                series = df[col]
                df[col] = (series - self.r["shifts"][col]) * self.r["inv_scales"][col]


class BinarizeSeries(Step):
    """Binarize a single series of a DF block.

    Adds to `out_block` a column telling, for each row, whether the value of
    `in_col` is strictly above `threshold`.
    """

    def __init__(self, in_block, in_col, out_block, threshold):
        self.in_block = in_block
        self.in_col = in_col
        self.out_block = out_block
        self.threshold = threshold

    def __str__(self,):
        description = (self.__class__.__name__, self.in_col, self.threshold)
        return "Step:%s (col=%s, thresh=%s)" % description

    def _output_name(self):
        """Return the name of the generated column."""
        return "%s:above:%s" % (self.in_col, self.threshold)

    def init_resources(self, resources_handler):
        """Register the column, its output name and its threshold in the "binarized" resource."""
        resource = resources_handler.get_resource("binarized", "json")
        if "columns" not in resource:
            for key in ("columns", "output_name", "thresholds"):
                resource[key] = []
        full_output_name = "num_binarized:" + self._output_name()
        resource["columns"].append(self.in_col)
        resource["output_name"].append(full_output_name)
        resource["thresholds"].append(self.threshold)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Add the "value > threshold" column to the `out_block` builder."""
        source_series = current_mf.get_block(self.in_block).df[self.in_col]
        builder = current_mf.get_df_builder(self.out_block)
        is_above = source_series > self.threshold
        add_column_to_builder(builder, self._output_name(), [self.in_col], is_above, generated_features_mapping)


class QuantileBinSeries(Step):
    """Replace a numerical column by the index of the quantile bin it falls in.

    Bin bounds are computed at fit time with `pd.qcut` and re-applied at
    process time with `pd.cut`. Rows that fall in no bin get -1.
    """

    def __init__(self, in_block, in_col, out_block, nb_bins):
        self.in_block = in_block
        self.in_col = in_col
        self.out_block = out_block
        self.nb_bins = nb_bins

    def __str__(self,):
        return "Step:%s (col=%s, nb=%s)" % (self.__class__.__name__, self.in_col, self.nb_bins)

    def _output_name(self):
        """Return the name of the generated column."""
        return "%s:quantile:%s" % (self.in_col, self.nb_bins)

    def init_resources(self, mp):
        """Bind the "quantile_binner" resource entry of this column, creating it if needed."""
        self.resource = mp.get_resource("quantile_binner", "json")
        if self.in_col not in self.resource:
            self.resource[self.in_col] = {"bounds": []}
        self.r = self.resource[self.in_col]

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Compute the quantile bounds, add the binned column and store the bounds.

        Raises:
            ValueError: when `pd.qcut` cannot cut the column in `nb_bins` quantiles.
        """
        builder = current_mf.get_df_builder(self.out_block)
        df = current_mf.get_block(self.in_block).df
        series = df[self.in_col]

        try:
            (categorical, bounds) = pd.qcut(series, self.nb_bins, retbins=True, labels=xrange(0, self.nb_bins))
        except ValueError:
            raise ValueError("Could not cut feature %s in %s quantiles. It might be too skewed or not have enough values." % (self.in_col, self.nb_bins))
        binned = pd.Series(categorical).astype(float).fillna(-1)
        add_column_to_builder(builder, self._output_name(), [self.in_col], binned, generated_features_mapping)

        self.r["bounds"] = bounds

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Bin the column with the fitted bounds and add it to the `out_block` builder."""
        builder = current_mf.get_df_builder(self.out_block)
        df = current_mf.get_block(self.in_block).df
        series = df[self.in_col]

        # qcut closes the first bin on its left edge; cut only does so with include_lowest=True.
        # Without it, rows equal to the lowest bound are binned 0 at fit time but -1 here.
        categorical = pd.cut(series, self.r["bounds"], labels=xrange(0, self.nb_bins), include_lowest=True)
        binned = pd.Series(categorical).astype(float).fillna(-1)
        add_column_to_builder(builder, self._output_name(), [self.in_col], binned, generated_features_mapping)


class DatetimeCyclicalEncodingStep(Step):
    """Encode a datetime column as sin / cos pairs, one pair per selected period."""

    # Keep in sync with both DatetimeCyclicalEncoder.Period and NumericalVariableAnalyzer.Period
    class Period(Enum):
        # (duration in seconds used to normalize the angle, pandas period string)
        MINUTE = (60, 'min')
        HOUR = (3600, 'h')
        DAY = (3600*24, 'd')
        WEEK = (3600*24*7, 'w')
        MONTH = (3600*24*31, 'm')
        QUARTER = (3600*24*92, 'q')
        YEAR = (3600*24*366, 'y')

        def __init__(self, duration, period_str):
            self.duration = duration
            self.period_str = period_str

    def __init__(self, column_name, selected_periods, out_block):
        self.column_name = column_name
        self.out_block = out_block

        # Reject any period name that is not a member of Period
        known_period_names = set(p.name for p in self.Period)
        unknown_periods = set(selected_periods) - known_period_names
        if unknown_periods:
            raise ValueError("Unknown period(s) for Datetime cyclical encoding: " + str(unknown_periods))
        self.selected_periods = selected_periods

    def init_resources(self, resources_handler):
        """Record the selected periods of this column in the "datetime_cyclical" resource."""
        resource = resources_handler.get_resource("datetime_cyclical", "json")
        if "mapping" not in resource:
            resource["mapping"] = dict()
        resource["mapping"][self.column_name] = self.selected_periods

    @staticmethod
    def compute_duration_in_seconds(period, datetime_series):
        """
        Compute the number of seconds between the datetime series and the datetime series truncated at the given period.
        NB:
        - 2021-09-02T11:15:35 truncated at Period.MINUTE gives 2021-09-02T11:15:00
        - 2021-09-02T11:15:35 truncated at Period.HOUR gives 2021-09-02T11:00:00
        - 2021-09-02T11:15:35 truncated at Period.DAY gives 2021-09-02T00:00:00
        - 2021-09-02T11:15:35 truncated at Period.WEEK gives 2021-08-30T00:00:00
        - 2021-09-02T11:15:35 truncated at Period.MONTH gives 2021-09-01T00:00:00
        - 2021-09-02T11:15:35 truncated at Period.QUARTER gives 2021-07-01T00:00:00
        - 2021-09-02T11:15:35 truncated at Period.YEAR gives 2021-01-01T00:00:00
        """
        if datetime_series.empty:
            return pd.Series(name=datetime_series.name, dtype=np.float64)

        # to_period returns values of dtype period[T]; they are converted back to datetimes
        # via .dt.start_time so that the subtraction can be performed
        period_start = datetime_series.dt.to_period(period.period_str).dt.start_time
        elapsed = datetime_series - period_start
        # Rounding at the ms level and then switching back to seconds circumvents a numerical bug
        # in the Pandas dt.total_seconds method
        return np.floor(elapsed.dt.total_seconds()*1000)/1000

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Add the <column>:<period>:sin and <column>:<period>:cos columns to the `out_block` builder.

        Raises:
            ValueError: when a non-numeric column contains the value 'Invalid date'.
        """
        builder = current_mf.get_df_builder(self.out_block)
        raw_column = input_df[self.column_name]

        to_datetime_kwargs = {}
        if pd.api.types.is_numeric_dtype(raw_column):
            # Numeric values are read as a number of seconds counted from 1900-01-01
            to_datetime_kwargs["unit"] = 's'
            to_datetime_kwargs["origin"] = pd.Timestamp('1900-01-01')
        elif "Invalid date" in set(raw_column):
            # Edge case for interactive scoring
            # If the user manually enters a wrongly formatted date, the date picker component yields the value
            # 'Invalid date'. In this case, the column has only one value so the check is pretty cheap.
            raise ValueError("Invalid date format")
        datetime_series = pd.to_datetime(raw_column, **to_datetime_kwargs)

        selected = [period for period in self.Period if period.name in self.selected_periods]
        for period in selected:
            elapsed_seconds = self.compute_duration_in_seconds(period, datetime_series)
            cos_sin_arg = elapsed_seconds*2*np.pi/period.duration
            prefix = u"{}:{}".format(safe_unicode_str(self.column_name), period.name.lower())
            add_column_to_builder(builder, "{}:sin".format(prefix), [self.column_name], np.sin(cos_sin_arg),
                                  generated_features_mapping)
            add_column_to_builder(builder, "{}:cos".format(prefix), [self.column_name], np.cos(cos_sin_arg),
                                  generated_features_mapping)

class RescalingProcessor2(Step):
    """Rescale a single series in-place in a DF block.

    The series is replaced by (series - shift) * inv_scale, where inv_scale is
    1 / scale, or 0 when the scale is zero or NaN.
    """

    def __str__(self,):
        return "Step:%s (%s)" % (self.__class__.__name__, self.in_col)

    def __init__(self, in_block, in_col, shift=None, scale=None):
        # NOTE(review): np.isnan raises a TypeError on None, so despite the default
        # value a numeric scale seems to be expected - confirm against callers.
        self.in_block = in_block
        self.in_col = in_col
        self.shift = shift
        self.set_scale(scale)

    def init_resources(self, resources_handler):
        """Register the column, its shift and its inverse scale in the "rescalers" resource (once per column)."""
        resource = resources_handler.get_resource("rescalers", "json")
        if "columns" not in resource:
            resource["columns"] = []
            resource["shifts"] = []
            resource["inv_scales"] = []
        if self.in_col not in resource["columns"]:
            resource["columns"].append(self.in_col)
            resource["shifts"].append(self.shift)
            resource["inv_scales"].append(self.inv_scale)

    def set_scale(self, scale):
        """Set `inv_scale` from `scale`; a zero or NaN scale gives an inverse scale of 0."""
        scale_is_nan = np.isnan(scale)
        if scale_is_nan:
            preproc_logger.warning("The scale used for rescaling is not a number, using zero instead. " +
                                   "An issue might have happened earlier during preprocessing.")
        if scale_is_nan or scale == 0.:
            # if there is no variance, just return a null-series
            self.inv_scale = 0.
        else:
            self.inv_scale = 1. / scale

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Rescale the column in place in its block."""
        dfw = current_mf.get_block(self.in_block)
        series = dfw.df[self.in_col]
        # The statistics are only computed when they are going to be logged
        if preproc_logger.isEnabledFor(logging.DEBUG):
            preproc_logger.debug("  Rescale %s (avg=%s std=%s shift=%s inv_scale=%s nulls=%s)" % (self.in_col, series.mean(), series.std(), self.shift, self.inv_scale, series.isnull().sum()))
        dfw.df[self.in_col] = (series - self.shift) * self.inv_scale



class AllInteractionFeaturesGenerator(Step):
    """Generates all polynomial interaction features from the imputed input numericals"""

    def __init__(self, in_block, out_block, features):
        self.in_block = in_block
        self.out_block = out_block
        self.features = features
        # Number of generated columns, reported by report_fit
        self.built = 0

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Build the degree-2, interaction-only polynomial features and append them to `out_block`."""
        from sklearn.preprocessing import PolynomialFeatures
        poly = PolynomialFeatures(degree=2, interaction_only=True)

        columns = {}
        for feature_name in self.features:
            columns[feature_name] = current_mf.col_as_series(self.in_block, feature_name)
        source_df = pd.DataFrame(columns)

        # The fit is stateless: it only computes the
        # n_features_out x n_features_in matrix of powers
        poly.fit(source_df)

        names = []
        for out_powers in poly.powers_:
            factors = []
            for position, power in enumerate(out_powers):
                if power == 2:
                    factors.append("%s^2" % (self.features[position]))
                elif power == 1:
                    factors.append("%s" % (self.features[position]))
            output_name = "poly_int:%s" % " * ".join(factors)
            names.append(output_name)
            generated_features_mapping.add_per_column_mapping("poly_int", factors, output_name)
            self.built += 1

        transformed = poly.transform(source_df)
        current_mf.append_df(self.out_block, pd.DataFrame(transformed, columns=names))

    def report_fit(self, ret_obj, core_params):
        """Report the number of input features and of generated features."""
        report = {
            "input_features": len(self.features),
            "built_features" : self.built
        }
        ret_obj["polynomial_interactions"] = report


class PairwiseLinearCombinationsGenerator(Step):
    """Generate the sum and the difference of every pair of distinct features of a block."""

    def __init__(self, in_block, out_block, features):
        self.in_block = in_block
        self.out_block = out_block
        self.features = features
        # Number of generated columns, reported by report_fit
        self.built = 0

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Add the "f1+f2" and "f1-f2" columns of each feature pair to `out_block`, then flush its builder."""
        assert len(self.features) >= 2
        out = current_mf.get_df_builder(self.out_block)
        for i1, f1 in enumerate(self.features):
            s1 = current_mf.col_as_series(self.in_block, f1)
            # Only pair f1 with the features that come after it
            for f2 in self.features[i1 + 1:]:
                if f1 == f2:
                    continue

                assert(s1.isnull().sum() == 0)
                s2 = current_mf.col_as_series(self.in_block, f2)
                pair = [f1, f2]
                sum_name = u"{}+{}".format(safe_unicode_str(f1), safe_unicode_str(f2))
                add_column_to_builder(out, sum_name, pair, s1+s2, generated_features_mapping)
                diff_name = u"{}-{}".format(safe_unicode_str(f1), safe_unicode_str(f2))
                add_column_to_builder(out, diff_name, pair, s1-s2, generated_features_mapping)
                self.built += 2

        current_mf.flush_df_builder(self.out_block)

    def report_fit(self, ret_obj, core_params):
        """Report the number of input features and of generated features."""
        report = {
            "input_features": len(self.features),
            "built_features": self.built
        }
        ret_obj["pairwise_linear"] = report

class NumericalDerivativesGenerator(Step):
    """Generate derivative features from selected numerical features
    in a block.
    Generates, for each feature: its square, its square root and its log"""

    def __init__(self, in_block, out_block, features):
        self.in_block = in_block
        self.out_block = out_block
        self.features = features

    def init_resources(self, resources_handler):
        """Append the derived features to the "columns" list of the "derivatives" resource."""
        res = resources_handler.get_resource("derivatives", "json")
        if "columns" not in res:
            res["columns"] = []
        res["columns"].extend(self.features)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Add the three derived columns of each feature to `out_block`, then flush its builder."""
        out = current_mf.get_df_builder(self.out_block)
        for feature in self.features:
            series = current_mf.get_block(self.in_block).df[feature]
            label = safe_unicode_str(feature)
            sources = [feature]

            squared = np.power(series, 2)
            add_column_to_builder(out, u"{}^2".format(label), sources, squared, generated_features_mapping)

            # TODO:  We should probably make possibly-NA generators optional
            # NA values produced by sqrt and log are replaced by 0
            square_root = np.sqrt(series).fillna(0)
            add_column_to_builder(out, u"sqrt({})".format(label), sources, square_root, generated_features_mapping)

            # A small epsilon is added before taking the log
            logarithm = np.log(series + 0.00000001).fillna(0)
            add_column_to_builder(out, u"log({})".format(label), sources, logarithm, generated_features_mapping)

        current_mf.flush_df_builder(self.out_block)

class DumpPipelineState(Step):
    """Debugging step: log the shapes of the input, of the current multiframe and of the outputs."""

    def __init__(self, name):
        self.name = name

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Emit one debug line per pipeline object; nothing is modified."""
        log = preproc_logger.debug
        log("********* Pipeline state (%s)" % self.name)
        log("   input_df= %s " % str(input_df.shape))
        log("   current_mf=%s " % str(current_mf.shape()))
        log("   PPR: ")
        for (key, value) in output_ppr.items():
            if isinstance(value, MultiFrame):
                # MultiFrame exposes its shape through a method
                line = "      %s = %s (%s)" % (key, value.__class__, str(value.shape()))
            elif isinstance(value, MLAssertions):
                line = "       %s = %s (%s assertions)" % (key, value.__class__, len(value))
            else:
                line = "      %s = %s (%s)" % (key, value.__class__, str(value.shape))
            log(line)

class DumpInputDF(Step):
    """Debugging step: log the whole input dataframe."""

    def __init__(self, name):
        self.name = name

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Emit the input dataframe at debug level; nothing is modified."""
        header = "********* DUMP InputDF (%s)" % self.name
        preproc_logger.debug(header)
        content = "%s" % input_df
        preproc_logger.debug(content)

class DumpMFDetails(Step):
    """Debugging step: log the class, shape and kept flag of each block of the multiframe."""

    def __init__(self, name):
        self.name = name

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Emit one debug line per block; nothing is modified."""
        preproc_logger.debug("********* DUMP Multiframe Details (%s)" % self.name)
        for (block_name, block, kept) in current_mf.iter_blocks(True):
            # The payload attribute depends on the kind of block
            if isinstance(block, SparseMatrixWithNames):
                payload = block.matrix
            elif isinstance(block, NamedNPArray):
                payload = block.array
            else:
                payload = block.df
            shape = payload.shape

            preproc_logger.debug("  Block: %s clazz=%s shape=%s kept=%s" % (block_name, block.__class__, shape, kept))

class DumpFullMF(Step):
    """Debugging step: log the full content of the multiframe as a list of records."""

    def __init__(self, name):
        self.name = name

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Emit the multiframe content at debug level; nothing is modified."""
        preproc_logger.debug("********* DUMP Multiframe (%s)" % self.name)
        records = current_mf.as_dataframe().to_dict(orient='records')
        preproc_logger.debug("%s" % records)


class EmitCurrentMFAsResult(Step):
    """Emits the current multi frame in the result object and
    returns a *brand new* multiframe for the rest of the pipeline"""

    def __init__(self, output_name):
        self.output_name = output_name

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Store `current_mf` and `input_df` in `output_ppr` and return a fresh MultiFrame."""
        output_ppr[self.output_name] = current_mf
        output_ppr["UNPROCESSED"] = input_df

        fresh_mf = MultiFrame()
        # In this final step, the result MultiFrame must know how to translate between:
        # - unrecorded entries in sparse matrices
        # - NaN or 0 values in dense arrays
        # so the setting of the emitted multiframe is carried over.
        fresh_mf.set_unrecorded_value(current_mf.unrecorded_value)
        fresh_mf.set_index_from_df(input_df)
        return fresh_mf

class AddReferenceInOutput(Step):
    """Add an alias in output: expose an existing output entry under a second name"""

    def __init__(self, output_name_from, output_name_to):
        self.output_name_from = output_name_from
        self.output_name_to = output_name_to

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Make `output_name_to` reference the same object as `output_name_from` in `output_ppr`."""
        aliased = output_ppr[self.output_name_from]
        output_ppr[self.output_name_to] = aliased


class FastSparseDummifyProcessor(Step):
    """Dummify (one-hot encode) a categorical column into a sparse CSR block.

    The generated columns are, in order: one per entry of `values`, one "N/A"
    column for missing values and, unless `should_drop` is set, one
    "__Others__" column for the values that are not in `values`.
    With `should_drop`, rows holding such other values get no entry at all.
    """

    def __init__(self, input_block, input_column_name, values, should_drop):
        self.values = list(values)
        self.input_block = input_block
        self.input_column_name = input_column_name
        self.should_drop = should_drop
        self.mapping_table = self._create_mapping_table()

    def __str__(self,):
        return "Step:%s (%s)" % (self.__class__.__name__, self.input_column_name)

    # Construct a mapping table mapping each value to its integer position
    def _create_mapping_table(self):
        """Return a dict mapping each value (utf-8 encoded on Python 2) to its position in `values`."""
        is_python3 = sys.version_info > (3, 0)
        mapping_table = {}
        for position, value in enumerate(self.values):
            key = value if is_python3 else encode_utf8(value)
            mapping_table[key] = position
        return mapping_table

    def init_resources(self, resources_handler):
        """Record the levels of this column in the "dummies" resource."""
        resources = resources_handler.get_resource("dummies", "json")
        if "details" not in resources:
            resources["details"] = {}
        resources["details"][self.input_column_name] = {
            "levels": self.values,
            "with_others": not self.should_drop #todo : check this
        }

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Dummify the input column and append the result to `current_mf` as a "dummy:" sparse block."""
        if self.input_block is None:
            series = input_df[self.input_column_name]
        else:
            series = current_mf.get_block(self.input_block).df[self.input_column_name]

        append_sparse_with_prefix(current_mf, "dummy:", [self.input_column_name], self._create_matrix(series), generated_features_mapping)

    def _create_matrix(self, series):
        """Return the SparseMatrixWithNames holding the dummies of `series`."""
        nb_vals = len(self.values) + 1
        # Missing values get their own level, right after the known values
        series = series.fillna("_DKU_NA_")
        self.mapping_table["_DKU_NA_"] = nb_vals - 1

        # Create a series containing the value index for each row
        # If we have 50 values: [0-49] indicates the real value, 50 the missing values, and NaN
        # indicates "other", which is filled with 51, or dropped if we are dropping
        mapped = series.map(self.mapping_table)

        if not self.should_drop:
            # int32 and not int16: with more than 32767 levels, the column indices
            # would silently overflow a 16 bits integer
            labels_series = mapped.fillna(nb_vals).astype(np.int32)
        else:
            labels_series = mapped

        # We construct the data/indices/indptr structure which is the native
        # format for CSR matrixes. This allows for an extremely fast creation

        # We won't map NaN rows, which have their dummy dropped
        nb_rows = len(labels_series)
        data = np.ones(nb_rows - labels_series.isnull().sum(), dtype='u1')
        # create indices, skipping one if it is NaN, as we won't be creating a 1 on the row, then clean the series
        indptr = [0] + [y for y in labels_series.notnull().cumsum()]
        labels_series = labels_series.dropna()

        if sys.version_info > (3,0):
            names = ["dummy:%s:%s" % (self.input_column_name, value) for value in self.values]
            names.append("dummy:%s:N/A" % self.input_column_name)
        else:
            names = [u"dummy:%s:%s" % (self.input_column_name, unicode(value)) for value in self.values]
            names.append("dummy:%s:N/A" % self.input_column_name)

        if not self.should_drop:
            names.append("dummy:%s:%s" % (self.input_column_name, "__Others__"))

        #create dummy matrix
        matrix = scipy.sparse.csr_matrix((data, labels_series.values, indptr), shape=(nb_rows, len(names)))
        preproc_logger.debug("Dummifier: Append a sparse block shape=%s nnz=%s" % (str(matrix.shape), matrix.nnz))

        return SparseMatrixWithNames(matrix, names)

class CategoricalFeatureHashingProcessor(Step):
    """
    Hashing trick for category features.

    This creates an extremely huge sparse matrix and should only be used with algorithms that support
    them.

    It takes values from an input block, or from the input dataframe when no block is given.

    @param bool hash_whole_categories (default True): Indicate whether the processor should hash the whole categories or
    each of the characters that compose them. It is kept for legacy reasons.
    """

    def __init__(self, input_block, column_name, hash_whole_categories=True, n_features=2**20):
        self.input_block = input_block
        self.column_name = column_name
        self.hash_whole_categories = hash_whole_categories
        self.n_features = n_features

    def process(self,  input_df, current_mf, output_ppr, generated_features_mapping):
        """Hash the column and append the result to `current_mf` as an unnamed "hashing:" sparse block."""
        from sklearn.feature_extraction import FeatureHasher
        hasher = FeatureHasher(n_features=self.n_features, input_type="string")

        source_df = input_df if self.input_block is None else current_mf.get_block(self.input_block).df
        series = source_df[self.column_name]

        if series.empty:
            # Simulate an empty matrix result to avoid the hasher erroring out due to no samples
            matrix = scipy.sparse.csr_matrix((0, self.n_features), dtype=np.float64)
        elif self.hash_whole_categories:
            # The input data must be reshaped for the FeatureHasher to hash whole categories
            matrix = hasher.transform(series.values.reshape(-1, 1))
        else:
            matrix = hasher.transform(series)

        # No name on the generated features
        hashed_block = SparseMatrixWithNames(matrix, None)
        append_sparse_with_prefix(current_mf,"hashing:", [self.column_name], hashed_block, generated_features_mapping)


class TextHashingVectorizerProcessor(Step):
    """
    Hashing trick for text features using Bag of words.
    http://scikit-learn.org/stable/modules/feature_extraction.html#vectorizing-a-large-text-corpus-with-the-hashing-trick

    This creates an extremely huge sparse matrix and should only be used with algorithms that support
    them.

    It takes values directly from the input df since we don't do other preprocessing for
    these features
    """
    __slots__ = ('column_name','n_features')

    def __init__(self, column_name, n_features=200000):
        self.column_name = column_name
        self.n_features = n_features

    def __str__(self,):
        return "%s (%s)" % (self.__class__, self.column_name)

    def process(self,  input_df, current_mf, output_ppr, generated_features_mapping):
        """Vectorize the text column and append the result to `current_mf` as an unnamed "hashvect:" sparse block."""
        from sklearn.feature_extraction.text import HashingVectorizer
        series = input_df[self.column_name]
        vectorizer = HashingVectorizer(n_features=self.n_features, dtype=np.float32)

        if not series.empty:
            # Missing texts are vectorized as empty strings
            matrix = vectorizer.transform(series.fillna(""))
        else:
            # Simulate an empty matrix result to avoid the vectorizer erroring out due to no samples
            matrix = scipy.sparse.csr_matrix((0, self.n_features), dtype=np.float64)

        # No name on the generated features
        hashed_block = SparseMatrixWithNames(matrix, None)
        append_sparse_with_prefix(current_mf, "hashvect:", [self.column_name], hashed_block, generated_features_mapping)


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

class BaseCountVectorizerProcessor(Step):
    """
    Common base of the vocabulary-based text vectorizer steps.

    Stores the vectorizer settings and provides the resource lookup and the fit report.
    ``self.prefix`` is read here but not set here: it has to be defined by the subclass.
    """
    def __init__(self, column_name, min_df, max_df, max_features, min_gram, max_gram, stop_words=None):
        self.column_name = column_name
        self.min_df = min_df
        self.max_df = max_df
        # A max_features of 0 is stored as None
        self.max_features = None if max_features == 0 else max_features
        self.min_gram = min_gram
        self.max_gram = max_gram
        self.stop_words = stop_words
        self.dropped_words = 0

    def gen_voc(self, vec):
        """Return the terms of ``vec.vocabulary_`` as a list, each term placed at its vocabulary index."""
        vocabulary = vec.vocabulary_
        ordered_terms = [None] * len(vocabulary)
        for term, position in vocabulary.items():
            ordered_terms[position] = term
        return ordered_terms

    def __str__(self):
        return "%s (%s)" % (self.__class__, self.column_name)

    def init_resources(self, mp):
        """Fetch the "pkl" resource named ``<prefix>_<column_name>`` and keep it in ``self.resource``."""
        resource_name = "%s_%s" % (self.prefix, self.column_name)
        self.resource = mp.get_resource(resource_name, "pkl")

    def report_fit(self, ret_obj, core_params):
        """Store the numbers of used and dropped words of this column under ``ret_obj[prefix][column_name]``."""
        fitted_vectorizer = self.resource["vectorizer"]
        if self.prefix not in ret_obj:
            ret_obj[self.prefix] = {}
        column_report = {
            "used_words": len(fitted_vectorizer.vocabulary_),
            "dropped_words": self.dropped_words
        }
        ret_obj[self.prefix][self.column_name] = column_report


class TextCountVectorizerProcessor(BaseCountVectorizerProcessor):
    """
    Text vectorization step based on scikit-learn's CountVectorizer, or on a custom transformer.

    When ``custom_code`` is provided, it is executed and the object it binds to the name
    ``transformer`` is used instead of a CountVectorizer.
    """
    def __init__(self, column_name, min_df, max_df, max_features, min_gram=1, max_gram=2, stop_words=None,
        custom_code=None):
        BaseCountVectorizerProcessor.__init__(self, column_name, min_df, max_df, max_features, min_gram, max_gram, stop_words)
        self.prefix = "countvec"
        self.custom_code = custom_code

    def init_resources(self, mp):
        """Fetch the vectorizer resource (see base class) and the "word_counts" JSON resource."""
        super(TextCountVectorizerProcessor, self).init_resources(mp)
        self.json_data = mp.get_resource("word_counts", "json")

    def _new_vectorizer(self):
        """Build the not-yet-fitted vectorizer, from the custom code when there is one."""
        if self.custom_code is None:
            return CountVectorizer(min_df=self.min_df, max_df=self.max_df,
                                   max_features=self.max_features,
                                   stop_words=self.stop_words,
                                   ngram_range=(self.min_gram, self.max_gram))
        # The custom code runs in its own namespace and must define `transformer`
        namespace = {}
        exec(self.custom_code, namespace, namespace)
        return namespace["transformer"]

    def fit_and_process(self,  input_df, current_mf, output_ppr, generated_features_mapping):
        """Fit the vectorizer on the column, append the resulting matrix, then store the vectorizer and its report."""
        series = input_df[self.column_name]
        vectorizer = self._new_vectorizer()
        preproc_logger.debug("Using vectorizer: %s" % vectorizer)
        counts = vectorizer.fit_transform(series.fillna(""))
        preproc_logger.debug("Produced a matrix of size %s" % str(counts.shape))

        feature_names = []
        for word in self.gen_voc(vectorizer):
            feature_names.append("countvec:%s:%s" % (self.column_name, word))
        append_sparse_with_prefix(current_mf, "countvec:", [self.column_name],
                                  SparseMatrixWithNames(counts, feature_names), generated_features_mapping)
        self.dropped_words = get_n_stop_words_and_remove(vectorizer)
        self.resource["vectorizer"] = vectorizer
        self._report_json_data()

    def _report_json_data(self):
        """Append this column's vocabulary, stop words and n-gram range to the JSON resource."""
        report = self.json_data
        if "column" not in report:
            # First column reported: create the per-column lists
            for list_key in ("column", "vocabulary", "stop_words", "min_n_grams", "max_n_grams"):
                report[list_key] = []
            report["origin"] = "SCIKIT"
        vectorizer = self.resource["vectorizer"]
        report["column"].append(self.column_name)
        if vectorizer.stop_words == 'english':
            # maybe would take too much space in json if many text features ?
            stop_words = list(ENGLISH_STOP_WORDS)
        elif isinstance(vectorizer.stop_words, list):
            stop_words = vectorizer.stop_words
        else:
            stop_words = []
        report["stop_words"].append(stop_words)
        report["vocabulary"].append(get_feature_names(vectorizer))
        report["min_n_grams"].append(self.min_gram)
        report["max_n_grams"].append(self.max_gram)

    def process(self,  input_df, current_mf, output_ppr, generated_features_mapping):
        """Transform the column with the stored vectorizer and append the resulting matrix."""
        series = input_df[self.column_name]
        vectorizer = self.resource["vectorizer"]
        counts = vectorizer.transform(series.fillna(""))
        feature_names = []
        for word in get_feature_names(vectorizer):
            feature_names.append("countvec:%s:%s" % (self.column_name, word))
        append_sparse_with_prefix(current_mf, "countvec:", [self.column_name],
                                  SparseMatrixWithNames(counts, feature_names), generated_features_mapping)

class TextTFIDFVectorizerProcessor(BaseCountVectorizerProcessor):
    """
    Text vectorization step based on scikit-learn's TfidfVectorizer, or on a custom transformer.

    When ``custom_code`` is provided, it is executed and the object it binds to the name
    ``transformer`` is used instead of a TfidfVectorizer.
    """
    def __init__(self, column_name, min_df, max_df, max_features, min_gram=1, max_gram=2, stop_words=None, custom_code=None):
        BaseCountVectorizerProcessor.__init__(self, column_name, min_df, max_df, max_features, min_gram, max_gram, stop_words)
        self.prefix = "tfidfvec"
        self.custom_code = custom_code

    def init_resources(self, mp):
        """Fetch the vectorizer resource (see base class) and the "tfidf" JSON resource."""
        super(TextTFIDFVectorizerProcessor, self).init_resources(mp)
        self.json_data = mp.get_resource("tfidf", "json")

    def _new_vectorizer(self):
        """Build the not-yet-fitted vectorizer, from the custom code when there is one."""
        if self.custom_code is None:
            return TfidfVectorizer(min_df=self.min_df, max_df=self.max_df,
                                   max_features=self.max_features,
                                   stop_words=self.stop_words,
                                   ngram_range=(self.min_gram, self.max_gram))
        # The custom code runs in its own namespace and must define `transformer`
        namespace = {}
        exec(self.custom_code, namespace, namespace)
        return namespace["transformer"]

    def _feature_names(self, words, idfs):
        """Pair each word with its idf and build the "tfidfvec:<column>:<idf>:<word>" names."""
        feature_names = []
        for (word, idf) in zip(words, idfs):
            feature_names.append("tfidfvec:%s:%.3f:%s" % (self.column_name, idf, word))
        return feature_names

    def fit_and_process(self,  input_df, current_mf, output_ppr, generated_features_mapping):
        """Fit the vectorizer on the column, append the resulting matrix, then store the vectorizer and its report."""
        series = input_df[self.column_name]
        vectorizer = self._new_vectorizer()
        preproc_logger.debug("Using vectorizer: %s" % vectorizer)
        weights = vectorizer.fit_transform(series.fillna(""))
        preproc_logger.debug("Produced a matrix of size %s" % str(weights.shape))

        feature_names = self._feature_names(self.gen_voc(vectorizer), vectorizer.idf_)
        append_sparse_with_prefix(current_mf, "tfidfvec:", [self.column_name],
                                  SparseMatrixWithNames(weights, feature_names), generated_features_mapping)
        self.dropped_words = get_n_stop_words_and_remove(vectorizer)
        self.resource["vectorizer"] = vectorizer
        self._report_json_data()

    def _report_json_data(self):
        """Append this column's vocabulary, stop words, n-gram range, idf, norm and output names to the JSON resource."""
        report = self.json_data
        if "column" not in report:
            # First column reported: create the per-column lists
            for list_key in ("column", "vocabulary", "stop_words", "min_n_grams", "max_n_grams",
                             "idf", "norm", "output_names"):
                report[list_key] = []
            report["origin"] = "SCIKIT"
        vectorizer = self.resource["vectorizer"]
        report["column"].append(self.column_name)
        if vectorizer.stop_words == 'english':
            # maybe would take too much space in json if many text features ?
            stop_words = list(ENGLISH_STOP_WORDS)
        elif isinstance(vectorizer.stop_words, list):
            stop_words = vectorizer.stop_words
        else:
            stop_words = []
        report["stop_words"].append(stop_words)
        report["vocabulary"].append(get_feature_names(vectorizer))
        report["min_n_grams"].append(self.min_gram)
        report["max_n_grams"].append(self.max_gram)
        report["output_names"].append(self._feature_names(get_feature_names(vectorizer), vectorizer.idf_))
        report["idf"].append(vectorizer.idf_)
        if vectorizer.norm is None:
            norm_name = "NONE"
        else:
            norm_name = vectorizer.norm.upper()
        report["norm"].append(norm_name)

    def process(self,  input_df, current_mf, output_ppr, generated_features_mapping):
        """Transform the column with the stored vectorizer and append the resulting matrix."""
        series = input_df[self.column_name]
        vectorizer = self.resource["vectorizer"]

        if not series.empty:
            weights = vectorizer.transform(series.fillna(""))
        else:
            # Simulate an empty matrix result to avoid vec.transform erroring out due to no samples
            weights = scipy.sparse.csr_matrix((0, len(vectorizer.idf_)), dtype=np.float64)

        feature_names = self._feature_names(get_feature_names(vectorizer), vectorizer.idf_)
        append_sparse_with_prefix(current_mf, "tfidfvec:", [self.column_name],
                                  SparseMatrixWithNames(weights, feature_names), generated_features_mapping)


class TextHashingVectorizerWithSVDProcessor(Step):
    """
    Use a restricted version of the hashing trick.
    http://scikit-learn.org/stable/modules/feature_extraction.html#vectorizing-a-large-text-corpus-with-the-hashing-trick

    This is designed to be used with dense matrixes. Instead of creating a huge sparse matrix,
    it first creates the huge sparse matrix then applies a SVD on it to only keep a small (10-50) number
    of features
    It takes values directly from the input df since we don't do other preprocessing for
    these features
    """

    def __init__(self, column_name, n_features=100, n_hash=200000, svd_limit=50000, random_state=1337):
        self.column_name = column_name
        self.prefix = "hashsvd"
        self.n_hash = n_hash
        self.n_features = n_features
        self.svd_limit = svd_limit
        self.random_state = random_state

    def __str__(self):
        description = (self.__class__.__name__, self.column_name, self.n_hash, self.svd_limit, self.n_features)
        return "Step:%s (%s hs=%s sl=%s sc=%s)" % description

    def _hash_text(self, series):
        """Hash the text series (missing values replaced by "") into a sparse matrix with ``n_hash`` columns."""
        from sklearn.feature_extraction.text import HashingVectorizer

        hasher = HashingVectorizer(n_features=self.n_hash, dtype=np.float32)
        return hasher.transform(series.fillna(""))

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Fit the SVD on the first ``svd_limit`` rows of the hashed column, then process the whole column."""
        from sklearn.decomposition import TruncatedSVD

        hashed = self._hash_text(input_df[self.column_name])
        preproc_logger.debug("Produced matrix: %s" % str(hashed.shape))
        self.resource["svd"] = TruncatedSVD(n_components=self.n_features, random_state=self.random_state)
        self.resource["svd"].fit(hashed[:self.svd_limit])
        # Hand over the hashed matrix so that it is not computed a second time
        self.process(input_df, current_mf, output_ppr, generated_features_mapping, hashed)

    def init_resources(self, resources_handler):
        """Bind ``self.resource``, preferring the legacy resource when it exists."""
        legacy_name = "texthash_svd"
        current_name = "%s_%s" % (self.prefix, self.column_name)
        kind = "pkl"
        if not resources_handler.exist_resource(legacy_name, kind):
            self.resource = resources_handler.get_resource(current_name, kind)
            return
        # Backwards compatible fallback to outdated storage format, see sc74451
        preproc_logger.info("Compatibility mode for legacy 'Term Hashing + SVD' preprocessing.")
        self.resource = resources_handler.get_resource(legacy_name, kind)["column_name"]

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping, matrix=None):
        """
        Reduce the hashed column with the stored SVD and add one output column per component.

        :param matrix: already hashed column; when None, the column is hashed here
        """
        series = input_df[self.column_name]
        if not series.empty:
            hashed = self._hash_text(series) if matrix is None else matrix
            transformed = self.resource["svd"].transform(hashed)
        else:
            # Simulate an empty array result to avoid hv / svd transform erroring out due to no samples
            transformed = np.empty((0, self.n_features), dtype=np.float32)

        block_name = u"thsvd:{}".format(safe_unicode_str(self.column_name))
        builder = current_mf.get_df_builder(block_name)
        n_components = transformed.shape[1]
        for component in xrange(0, n_components):
            add_column_to_builder(builder, str(component), [self.column_name], transformed[:, component],
                                  generated_features_mapping)

        current_mf.flush_df_builder(block_name)


class UnfoldVectorProcessor(Step):
    """
    Unfold a column of JSON-encoded numeric vectors into a numeric block with one column per element.

    :param input_column_name: Name of the column holding the JSON-encoded vectors
    :param int vector_length: Number of elements every vector must have
    :param in_block: Name of the block of the current MultiFrame to read the column from.
                     When None, the column is read from the input dataframe.
    """

    def __init__(self, input_column_name, vector_length, in_block=None):
        self.input_column_name = input_column_name
        self.vector_length = vector_length
        self.in_block = in_block

    def init_resources(self, resources_handler):
        """Record the vector length of this column in the "vectors-unfold" JSON resource, unless one is already there."""
        resource = resources_handler.get_resource("vectors-unfold", "json")
        if "vector_lengths" not in resource:
            resource["vector_lengths"] = {}
        vec_lengths = resource["vector_lengths"]
        if self.input_column_name not in vec_lengths:
            vec_lengths[self.input_column_name] = self.vector_length

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """
        Parse every row of the column and append the vectors to ``current_mf`` as a numpy block.

        :raises ValueError: (built with ``safe_exception``) when a row is not valid JSON, is not a JSON array,
                            does not have ``vector_length`` elements, or holds a non-numeric element
        """
        if self.in_block is None:
            series = input_df[self.input_column_name]
        else:
            series = current_mf.get_block(self.in_block).df[self.input_column_name]
        block_name = u"unfold:{}".format(safe_unicode_str(self.input_column_name))

        def parse_vector(row):
            try:
                vec = json.loads(row)
            except ValueError as e:
                raise safe_exception(ValueError, u"Invalid vector data in column '{}': {}".format(
                    safe_unicode_str(self.input_column_name), safe_unicode_str(e)))
            except TypeError as e:
                raise safe_exception(ValueError, u"Invalid vector data in column '{}' - maybe empty? ({})" .format(
                    safe_unicode_str(self.input_column_name), safe_unicode_str(e)))
            # Valid JSON is not necessarily an array: a scalar or null would otherwise make len() below
            # fail with a bare TypeError
            if not isinstance(vec, list):
                raise safe_exception(ValueError, u"Invalid vector data in column '{}': expected a JSON array, found {}".format(
                    safe_unicode_str(self.input_column_name), type(vec).__name__))
            current_len = len(vec)
            if current_len != self.vector_length:
                raise safe_exception(ValueError, u"Size mismatch between different rows when unfolding vector column '{}'."
                                      u" Expected: {}, found: {}".format(safe_unicode_str(self.input_column_name),
                                                                         self.vector_length, current_len))
            if any(not isinstance(x, Number) for x in vec):
                raise safe_exception(ValueError, u"Some elements of vector column '{}' are not numbers".format(
                    safe_unicode_str(self.input_column_name)))
            return vec

        if series.empty:
            # Simulate an empty vector result to keep a consistent shape
            blk = np.empty((0, self.vector_length), dtype=np.float64)
        else:
            series_parsed = series.apply(parse_vector)
            blk = np.asarray(series_parsed.tolist())

        names = [u"unfold:{}:{}".format(safe_unicode_str(self.input_column_name), i) for i in xrange(self.vector_length)]
        generated_features_mapping.add_whole_block_mapping(block_name, [self.input_column_name])
        current_mf.append_np_block(block_name, blk, names)


class CategoricalEncodingMixin(object):
    """
    Helpers shared by the categorical encoding steps: input column lookup, reading and updating the
    JSON resource that stores the encodings, and building the preprocessing report.

    ``self.json_data`` starts as None; the methods reading it expect it to have been set beforehand.
    """

    def __init__(self, impute_block, column_name, encoding_name):
        self.impute_block = impute_block
        self.column_name = column_name
        self.encoding_name = encoding_name
        self.json_data = None

    def _get_input_series(self, input_df, current_mf):
        """Return the column from the impute block of ``current_mf`` when one is set, else from ``input_df``."""
        # That's fairly dirty ...
        if self.impute_block is not None:
            return current_mf.get_block(self.impute_block).df[self.column_name]
        return input_df[self.column_name]

    def _stored_position(self):
        """Return the index of this column in the JSON resource, or None when the column is not stored."""
        stored = self.json_data
        if "columns" in stored and self.column_name in stored["columns"]:
            return stored["columns"].index(self.column_name)
        return None

    def _init_encoding_dict(self):
        """Return the stored level -> encoding mapping of this column, or an empty dict when there is none."""
        position = self._stored_position()
        if position is None:
            return {}
        values = self.json_data["encodings"][position]
        levels = safe_convert_to_string(pd.Series(self.json_data["levels"][position]))
        return dict(zip(levels, values))

    def _init_default_value(self):
        """Return the first stored default value of this column, or None when the column is not stored."""
        position = self._stored_position()
        if position is None:
            return None
        return self.json_data["defaults"][position][0]

    def _update_json_resource(self, df_encoding_map, default_values):
        """
        Append this column to the JSON resource.

        :param df_encoding_map: dataframe whose index holds the levels and whose rows hold the encodings
        :param default_values: iterable of default values, stored as a list
        """
        stored = self.json_data
        if "columns" not in stored:
            # First column stored: create the per-column lists
            for list_key in ("columns", "levels", "encodings", "defaults", "outputNames"):
                stored[list_key] = []
        stored["columns"].append(self.column_name)
        rows = list(df_encoding_map.itertuples())
        # Each row is (index, value of 1st column, value of 2nd column, ...)
        stored["levels"].append([row[0] for row in rows])
        stored["encodings"].append([list(row[1:]) for row in rows])
        stored["defaults"].append(list(default_values))
        name_prefix = self.encoding_name + ":" + self.column_name + ":"
        stored["outputNames"].append([name_prefix + name for name in df_encoding_map.columns.values])

    def _build_preprocessing_report(self, encoding_map, ret_obj):
        """
        Store the report of this column under ``ret_obj["categoricalEncodings"][column_name]``.

        ``encoding_map`` has its missing values replaced by 0 in place. Its index gives the values,
        its first column the counts and its other columns the encodings.
        """
        encoding_map.fillna(0, inplace=True)
        rows = list(encoding_map.itertuples())

        report = {
            "values": [row[0] for row in rows],
            "targetValues": encoding_map.columns[1:].tolist(),
            "counts": [row[1] for row in rows],
            "encodings": [list(row[2:]) for row in rows],
        }

        if "categoricalEncodings" not in ret_obj:
            ret_obj["categoricalEncodings"] = {}
        ret_obj["categoricalEncodings"][self.column_name] = report


class TargetEncodingStep(CategoricalEncodingMixin, Step):
    """
    Encode a categorical column from the target, with impact coding ("M_ESTIMATOR") or GLMM ("GLMM").

    :param impute_block: Name of the Multiframe block the column is read from (None: read from the input dataframe)
    :param column_name: Name of the feature column
    :param dict impact_coder_params: Encoder settings. Keys read by this class: "impact_method",
                                     "categorical_rescaling", "impact_m", "impact_kfold", "impact_kfold_k",
                                     "impact_kfold_seed"
    :param target_variable: Stored on the step, not read by this class
    :param output_block: Name of the Multiframe block receiving the encoded columns
    """

    def __init__(self, impute_block, column_name, impact_coder_params, target_variable, output_block):
        encoding_name = "glmm" if impact_coder_params["impact_method"] == "GLMM" else "impact"
        CategoricalEncodingMixin.__init__(self, impute_block, column_name, encoding_name)
        self.impact_coder_params = impact_coder_params
        self.target_variable = target_variable
        self.output_block = output_block
        self.impact_coder = None

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Fit the encoder on the column and the target, add the encoded columns and store the encoding map."""
        series = self._get_input_series(input_df, current_mf)

        # Align target with rows in input series
        # Typical example: when some rows have been dropped during preprocessing, we need to
        # only keep the non-dropped rows from output_ppr["target"]
        target = RealignTarget.get_realigned_target(current_mf, output_ppr)
        target.index = series.index

        df = self.impact_coder.fit_transform(series, target)
        builder = current_mf.get_df_builder(self.output_block)
        for (encoded_name, encoded_series) in df.items():
            add_column_to_builder(builder, encoded_name, [self.column_name], encoded_series, generated_features_mapping)
        current_mf.flush_df_builder(self.output_block)
        self.impact_coder.encoding_map.fillna(0, inplace=True)
        self._update_json_resource(self.impact_coder.encoding_map, self.impact_coder.default_value)

    def init_resources(self, resources_handler):
        """
        Build the encoder matching the parameters and the prediction type, and restore its encoding map
        from the "impact_coded" JSON resource when that resource is not empty.

        :raises ValueError: (built with ``safe_exception``) when "impact_method" is neither "GLMM" nor "M_ESTIMATOR"
        :raises TypeError: (built with ``safe_exception``) when the prediction type is neither a classification nor a regression
        """
        self.json_data = resources_handler.get_resource("impact_coded", "json")

        impact_method = self.impact_coder_params["impact_method"]
        if impact_method == "GLMM":
            is_regression = resources_handler.prediction_type == doctor_constants.REGRESSION
            base_impact_coder = GLMMEncoding(is_regression,
                                             rescaling_method=self.impact_coder_params["categorical_rescaling"])
        elif impact_method == "M_ESTIMATOR":
            base_impact_coder = ImpactCoding(m=self.impact_coder_params["impact_m"],
                                             rescaling_method=self.impact_coder_params["categorical_rescaling"])
        else:
            # The exception must be raised here: otherwise `base_impact_coder` is left unbound and the
            # failure only shows up further down, as an unrelated error
            raise safe_exception(ValueError,
                                 "Unknown impact computation method: %s" % impact_method)

        if self.impact_coder_params["impact_kfold"]:
            preproc_logger.debug(
                "KFold impact coding | m = %s | k = %s | scaling method = %s | seed = %s",
                self.impact_coder_params["impact_m"],
                self.impact_coder_params["impact_kfold_k"],
                self.impact_coder_params["categorical_rescaling"],
                self.impact_coder_params["impact_kfold_seed"]
            )
            categorical_encoder = CategoricalKFoldEncoder(
                base_impact_coder,
                prediction_type=resources_handler.prediction_type,
                k=self.impact_coder_params["impact_kfold_k"],
                seed=self.impact_coder_params["impact_kfold_seed"],
            )
        else:
            preproc_logger.debug(
                "Simple impact coding | m = %s | scaling method = %s",
                self.impact_coder_params["impact_m"], self.impact_coder_params["categorical_rescaling"],
            )
            categorical_encoder = CategoricalSimpleEncoder(base_impact_coder)

        if resources_handler.prediction_type in {doctor_constants.BINARY_CLASSIFICATION, doctor_constants.MULTICLASS}:
            preproc_logger.debug("Classification impact coding")
            impact_coder = ClassificationImpactEncoder(categorical_encoder, target_map=resources_handler.target_map)
        elif resources_handler.prediction_type == doctor_constants.REGRESSION:
            preproc_logger.debug("Regression impact coding")
            impact_coder = categorical_encoder
        else:
            raise safe_exception(TypeError, u"Impact coding is not available for clustering")

        if self.json_data:
            column_idx = self.json_data["columns"].index(self.column_name)
            # Output names are the encoding map column names prefixed by 'ENCODING_NAME:COLUMN_NAME:', so we remove the
            # prefix to retrieve the proper names.
            prefix = self.encoding_name + ":" + self.column_name + ":"
            column_names = [
                name[len(prefix):] if name.startswith(prefix) else name
                for name in self.json_data["outputNames"][column_idx]
            ]
            impact_coder.encoding_map = pd.DataFrame(
                data=self.json_data["encodings"][column_idx],
                index=self.json_data["levels"][column_idx],
                columns=column_names
            )

        self.impact_coder = impact_coder

    def report_fit(self, ret_obj, core_params):
        """Add the reportable encoding map of this column to ``ret_obj``."""
        encoding_map = self.impact_coder.get_reportable_map()
        self._build_preprocessing_report(encoding_map, ret_obj)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Encode the column with the already fitted encoder and add the encoded columns to the output block."""
        series = self._get_input_series(input_df, current_mf)

        if series.empty:
            # Nothing to encode: the empty column itself is passed on, as a one-column dataframe
            df = series.to_frame()
        else:
            df = self.impact_coder.transform(series)

        for (encoded_name, encoded_series) in df.items():
            builder = current_mf.get_df_builder(self.output_block)
            add_column_to_builder(builder, encoded_name, [self.column_name], encoded_series, generated_features_mapping)

        current_mf.flush_df_builder(self.output_block)


class BaseSimpleCategoricalEncodingStep(CategoricalEncodingMixin, Step):
    """
    Base of the encoding steps that replace each category by a single value.

    Constructor parameters:

    :param str impute_block: Name of the Multiframe block the column belongs to
    :param str column_name: Name of the feature column
    :param str output_block: Name of the Multiframe block the encoded columns belong to
    :param str encoding_name: Name of the encoding, used as a prefix for the names of the encoded columns

    Attributes, all None until set by a subclass, ``init_resources`` or ``fit_and_process``:

    - ``encoding_dict``: mapping from the name of the category (str) to its encoding (list of int or float)
    - ``category_counts``: pandas Series (dtype: int) of the number of rows per category
    - ``default_value``: the default value (int or float) used for encoding
    - ``suffix``: suffix (str) for the names of the encoded columns
    """
    def __init__(self, impute_block, column_name, output_block, encoding_name):
        CategoricalEncodingMixin.__init__(self, impute_block, column_name, encoding_name)
        self.output_block = output_block
        self.suffix = None
        self.default_value = None
        self.encoding_dict = None
        self.category_counts = None

    def _get_default_value(self):
        """To be implemented by subclasses: return the encoding of the categories missing from ``encoding_dict``."""
        raise NotImplementedError()

    def _get_fitted_encoding_dict(self, series):
        """To be implemented by subclasses: return the encoding dict computed from ``series``."""
        raise NotImplementedError()

    def _rescale_encoding(self, series):
        """To be implemented by subclasses: return the encoding dict to use after rescaling."""
        raise NotImplementedError()

    def get_encoding_df(self):
        """Return a dataframe indexed by category, whose single column (named ``suffix``) holds the encoded value."""
        categories = list(self.encoding_dict.keys())
        first_values = [encoding[0] for encoding in self.encoding_dict.values()]
        return pd.DataFrame(index=categories, data={self.suffix: first_values})

    def init_resources(self, resources_handler):
        """Load the JSON resource, then the encoding dict and default value it stores for this column, if any."""
        # Legacy naming of JSON file
        self.json_data = resources_handler.get_resource("impact_coded", "json")
        self.encoding_dict = self._init_encoding_dict()
        self.default_value = self._init_default_value()

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Compute counts, encoding dict and default value, store them in the JSON resource, then process."""
        series = self._get_input_series(input_df, current_mf)
        counts = series.value_counts()
        counts.name = "counts"
        self.category_counts = counts
        # The default value is derived from the encoding dict as it is before rescaling
        self.encoding_dict = self._get_fitted_encoding_dict(series)
        self.default_value = self._get_default_value()
        self.encoding_dict = self._rescale_encoding(series)
        self._update_json_resource(self.get_encoding_df(), [self.default_value])
        self.process(input_df, current_mf, output_ppr, generated_features_mapping)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Encode the column and add it to the output block. An empty column is added untouched."""
        series = self._get_input_series(input_df, current_mf)
        if not series.empty:
            def encode(category):
                # Categories missing from the dict get the default value
                return self.encoding_dict.get(category, [self.default_value])[0]

            encoded = safe_convert_to_string(series).map(encode)
            series = encoded.fillna(value=self.default_value)
        builder = current_mf.get_df_builder(self.output_block)
        add_column_to_builder(builder, self.suffix, [self.column_name], series, generated_features_mapping)
        current_mf.flush_df_builder(self.output_block)

    def report_fit(self, ret_obj, core_params):
        """Report the counts and encodings of the categories, sorted by decreasing count."""
        encoding_map = pd.concat([self.category_counts, self.get_encoding_df()], axis=1)
        encoding_map = encoding_map.sort_values(by="counts", ascending=False)

        self._build_preprocessing_report(encoding_map, ret_obj)


class OrdinalEncodingStep(BaseSimpleCategoricalEncodingStep):
    """
    Encode each category by its rank, starting at 1.

    The ranking follows ``ordinal_params["order"]``: "COUNT" (number of rows per category, ties broken by
    category name) or "LEXICOGRAPHIC" (category name), in the direction given by ``ordinal_params["ascending"]``.
    """

    def __init__(self, impute_block, column_name, output_block, ordinal_params):
        super(OrdinalEncodingStep, self).__init__(impute_block, column_name, output_block, "ordinal")
        self.ordinal_params = ordinal_params
        self.order = ordinal_params["order"]
        self.suffix = self.order.lower()

    def _ranks(self):
        """Return the ranks currently held by the encoding dict."""
        return [encoding[0] for encoding in self.encoding_dict.values()]

    def _get_default_value(self):
        """
        Return the default encoding for the "default_mode" parameter ("HIGHEST" when absent).

        :raises ValueError: when the default mode is not one of HIGHEST, LOWEST, MEDIAN, EXPLICIT
        """
        default_mode = self.ordinal_params.get("default_mode", "HIGHEST")
        if default_mode == "EXPLICIT":
            return self.ordinal_params.get("default_value", -1)
        if default_mode == "HIGHEST":
            return max(self._ranks()) + 1
        if default_mode == "LOWEST":
            return min(self._ranks()) - 1
        if default_mode == "MEDIAN":
            return np.median(self._ranks())
        raise ValueError("Invalid default mode: " + str(default_mode))

    def _get_fitted_encoding_dict(self, series):
        """
        Return the mapping from each category to ``[rank]``.

        :raises ValueError: when the order is neither "COUNT" nor "LEXICOGRAPHIC"
        """
        series = safe_convert_to_string(series)
        ascending = self.ordinal_params["ascending"]
        if self.order == "COUNT":
            counts = series.value_counts(dropna=False)
            if "_NA_" not in counts.index:
                # Make sure "_NA_" gets a rank, with a count of 0
                na_entry = pd.Series([0], index=["_NA_"])
                counts = pd.concat([counts, na_entry], axis=0)

            # Rightmost column is the primary key with np.lexsort
            # We use a -1 multiplier to sort count_series with a descending order
            direction = 1 if ascending else -1
            stable_index = np.lexsort((counts.index.astype(str), direction * counts.values))
            ordered_categories = counts.index[stable_index].astype(str)
        elif self.order == "LEXICOGRAPHIC":
            ordered_categories = np.sort(series.unique())
            if not ascending:
                ordered_categories = ordered_categories[::-1]
        else:
            raise ValueError("Unknown order for ordinal encoding: " + self.order)
        return {category: [rank] for rank, category in enumerate(ordered_categories, 1)}

    def _rescale_encoding(self, series):
        """Ranks are not rescaled: return the encoding dict as it is."""
        return self.encoding_dict


class FrequencyEncodingStep(BaseSimpleCategoricalEncodingStep):
    """
    Encode each category by its number of rows, or by its share of the rows when
    ``frequency_params["normalized"]`` is set.

    :param impute_block: Name of the Multiframe block the column is read from (None: read from the input dataframe)
    :param column_name: Name of the feature column
    :param output_block: Name of the Multiframe block receiving the encoded column
    :param dict frequency_params: Encoding settings. Keys read by this class: "normalized", "default_mode",
                                  "default_value", "categorical_rescaling"
    """

    def __init__(self, impute_block, column_name, output_block, frequency_params):
        super(FrequencyEncodingStep, self).__init__(impute_block, column_name, output_block, "frequency")
        self.frequency_params = frequency_params
        self.normalized = frequency_params["normalized"]
        self.suffix = "frequency" if self.normalized else "count"

    def _get_default_value(self):
        """
        Return the default encoding for the "default_mode" parameter ("EXPLICIT" when absent).

        :raises ValueError: when the default mode is not one of EXPLICIT, MIN, MAX, MEDIAN
        """
        default_mode = self.frequency_params.get("default_mode", "EXPLICIT")
        if default_mode == "EXPLICIT":
            return self.frequency_params.get("default_value", 0.)
        elif default_mode == "MIN":
            return min([x[0] for x in self.encoding_dict.values()])
        elif default_mode == "MAX":
            return max([x[0] for x in self.encoding_dict.values()])
        elif default_mode == "MEDIAN":
            return np.median([x[0] for x in self.encoding_dict.values()])
        else:
            raise ValueError("Invalid default mode: " + str(default_mode))

    def _get_fitted_encoding_dict(self, series):
        """Return the mapping from each category to ``[count or frequency]``, with "_NA_" always present."""
        freq_series = safe_convert_to_string(series).value_counts(dropna=False, normalize=self.normalized)
        encoding_dict = {category: [value] for category, value in freq_series.items()}
        if "_NA_" not in encoding_dict:
            encoding_dict["_NA_"] = [0.]
        return encoding_dict

    def _rescale_encoding(self, series):
        """
        Return the encoding dict rescaled with the "categorical_rescaling" method (AVGSTD when absent).

        The dict is returned unchanged when the method is NONE or when the encoding is normalized.

        :raises ValueError: when the rescaling method is unknown
        """
        rescaling_method = self.frequency_params.get("categorical_rescaling", doctor_constants.AVGSTD)
        if rescaling_method == doctor_constants.NONE or self.normalized:
            return self.encoding_dict
        if rescaling_method not in {doctor_constants.AVGSTD, doctor_constants.MINMAX}:
            raise ValueError("Unknown rescaling method %s" % rescaling_method)
        # The keys of the encoding dict come from the string-converted series, and `process` looks up the
        # string-converted series too: the rescaling parameters must be computed on the same values
        transformed_series = safe_convert_to_string(series).map(
            lambda x: self.encoding_dict.get(x, [self.default_value])[0]
        ).fillna(value=self.default_value)
        shift, scale = utils.get_rescaling_params(rescaling_method, transformed_series)
        return {category: ((np.array(value) - shift) * scale).tolist() for category, value in self.encoding_dict.items()}


class FeatureSelectorOutputExecStep(Step):
    """Used if feature selection was already trained"""

    def __init__(self, selector):
        self.selector = selector

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Let the selector prune ``current_mf``."""
        selector = self.selector
        selector.prune_output(current_mf)


class FeatureSelectorOutputTrainStep(Step):
    """Used if feature selection was not already trained"""

    def __init__(self, selector):
        self.selector = selector

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Fit the selector on ``current_mf`` and the target, then let it prune ``current_mf``."""
        selector = self.selector
        preproc_logger.debug("Fit and process with selector %s" % selector.__class__)
        target = output_ppr["target"]
        selector.fit_output(current_mf, target)
        selector.prune_output(current_mf)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Let the selector prune ``current_mf``."""
        selector = self.selector
        selector.prune_output(current_mf)


from dataiku.doctor.preprocessing.pca import PCA, PCA2
import copy

class PCAStep(Step):
    """Fit/apply a PCA on one multiframe of the results and publish the projection as another one.

    The source multiframe is read from ``output_ppr[input_name]`` and the projected
    multiframe is written to ``output_ppr[output_name]``.
    """

    def __init__(self, pca, input_name, output_name):
        self.pca = pca
        self.input_name = input_name
        self.output_name = output_name

    def normalize(self, df,):
        """Do nothing."""
        return None

    def fit_and_process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Fit the PCA on the source multiframe, then delegate to :meth:`process`."""
        source_mf = output_ppr[self.input_name]
        preproc_logger.debug("PCA fitting on %s" % source_mf)
        frame = source_mf.as_dataframe()
        preproc_logger.debug("Starting PCA fit on DF of shape %s" % str(frame.shape))
        self.pca.fit(frame)
        preproc_logger.debug("PCA fit done")
        return self.process(input_df, cur_mf, output_ppr, generated_features_mapping)

    def process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Project the source multiframe and store the result in ``output_ppr[self.output_name]``."""
        source_mf = output_ppr[self.input_name]
        preproc_logger.debug("PCA processing on %s. Transforming to DF" % source_mf)
        frame = source_mf.as_dataframe()

        preproc_logger.debug("Starting PCA process on DF of shape %s" % str(frame.shape))
        projected = self.pca.transform(frame)

        # The projection lives in a fresh multiframe that carries its own copy of the source index
        result_mf = MultiFrame()
        result_mf.index = copy.deepcopy(source_mf.index)
        result_mf.append_df("pca_out", projected)
        preproc_logger.debug("PCA process done")
        output_ppr[self.output_name] = result_mf


class CustomPreprocessingStep(Step):
    """Step running a user-defined processor on a single input column.

    The user code is executed at fit time and must define a ``processor`` variable exposing ``fit`` and
    ``transform``. The fitted processor is kept on the step and stored in the "custom_prep" resource under
    the input column name, so that it can be reloaded by :meth:`init_resources`.

    :param input_block: name of the multiframe block holding the column, or None to read it from the input DataFrame
    :param input_col: name of the column given to the processor
    :param code: source code defining the ``processor`` variable
    :param wants_matrix: if True, the processor receives a one-column DataFrame (column "_") instead of a Series
    :param fit_and_process_only_fits: if True, ``fit_and_process`` only fits and does not transform
    :param accepts_tensor: if True, outputs with more than 2 dimensions are not rejected
    """

    def __init__(self, input_block, input_col, code, wants_matrix, fit_and_process_only_fits=False, accepts_tensor=False):
        super(CustomPreprocessingStep, self).__init__()
        self.input_block = input_block
        self.input_col = input_col
        self.code = code
        self.processor = None
        self.res = None
        self.wants_matrix = wants_matrix
        self.fit_and_process_only_fits = fit_and_process_only_fits
        self.accepts_tensor = accepts_tensor

    def __str__(self,):
        return "Step:%s (%s)" % (self.__class__.__name__, self.input_col)

    def _get_input_series(self, input_df, current_mf):
        """Return the processor input: the column as a Series, or wrapped in a DataFrame if ``wants_matrix``."""
        # If input_block is not defined, we take the column directly from the input DataFrame; otherwise we need to
        # first retrieve the DataFrame block from the Multiframe
        df = input_df if self.input_block is None else current_mf.get_block(self.input_block).df
        input_series = df[self.input_col]
        return pd.DataFrame({"_": input_series}) if self.wants_matrix else input_series

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Transform the input column with the fitted processor and append the result to ``current_mf``.

        :raises ValueError: if the processor output is not a pandas DataFrame, numpy array or
            scipy.sparse.csr_matrix, if it is not 2-dimensional (unless ``accepts_tensor``), or if the number of
            feature names does not match the number of output columns
        """
        inp = self._get_input_series(input_df, current_mf)
        blk = self.processor.transform(inp)
        # Validate the type before touching ``blk.shape``: an unsupported output without a ``shape`` attribute
        # (e.g. a plain list) would otherwise fail with an AttributeError instead of the explicit error below
        if not isinstance(blk, (pd.DataFrame, scipy.sparse.csr_matrix, np.ndarray)):
            raise ValueError("Custom preprocessing output should be a pandas DataFrame, numpy array "
                             "or scipy.sparse.csr_matrix, found %s" % type(blk))
        preproc_logger.debug("Returned blk of shape %s" % (blk.shape,))
        block_name = u"custom_prep:{}".format(safe_unicode_str(self.input_col))
        if isinstance(blk, pd.DataFrame):
            # Prefix the columns of the returned DataFrame (in place) with the block name
            blk.rename(mapper=lambda column_name: u"{}:{}".format(block_name, safe_unicode_str(column_name)), axis=1, inplace=True)
            names = blk.columns
            current_mf.append_df(block_name, blk)
        else:
            if len(blk.shape) == 1:
                # reshape the output from a (N,) array into a (N, 1) array as 2D arrays are expected for next steps
                blk = blk.reshape(blk.shape[0], 1)

            if not self.accepts_tensor and len(blk.shape) != 2:
                raise ValueError("Output of custom processor should be a 2d matrix")
            if hasattr(self.processor, "names"):
                names = [u"{}:{}".format(block_name, safe_unicode_str(name)) for name in self.processor.names]
            else:
                names = [u"{}:unnamed_{}".format(block_name, idx) for idx in xrange(blk.shape[1])]
            if len(names) != blk.shape[1]:
                raise ValueError("Size mismatch between feature names (%s) and preprocessed array (%s)" % (len(names), blk.shape[1]))
            if isinstance(blk, scipy.sparse.csr_matrix):
                # The sparse helper receives the features mapping directly; presumably it registers the generated
                # features itself, hence the early return -- TODO confirm
                append_sparse_with_prefix(current_mf, "custom_prep:", [self.input_col], SparseMatrixWithNames(blk, names), generated_features_mapping)
                return

            current_mf.append_np_block(block_name, blk, names)
        generated_features_mapping.add_features_to_block(names, block_name)
        generated_features_mapping.add_whole_block_mapping(block_name, [self.input_col])

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Execute the user code, fit its ``processor`` on the input column and, unless fit-only, transform.

        :raises Exception: if the user code does not define a (non-None) ``processor`` variable
        """
        ctx = {}
        # NOTE(review): this executes arbitrary user-provided code in the current process; it must only ever
        # receive code from a trusted source
        exec(self.code, ctx)
        inp = self._get_input_series(input_df, current_mf)
        processor = ctx.get("processor", None)

        if processor is None:
            raise safe_exception(Exception, u"No 'processor' variable defined for Custom preprocessing of feature '{}'".format(safe_unicode_str(self.input_col)))

        processor.fit(inp)
        self.processor = processor
        self.res[self.input_col] = processor

        if self.fit_and_process_only_fits:
            return None
        return self.process(input_df, current_mf, output_ppr, generated_features_mapping)

    def init_resources(self, resources_handler):
        """Bind the "custom_prep" resource and reload the processor of this column when it is already stored."""
        self.res = resources_handler.get_resource("custom_prep", "pkl")
        if self.input_col in self.res:
            self.processor = self.res[self.input_col]


class FileFunctionPreprocessor(Step):
    """Step applying a user-defined function to the file referenced by each value of a column.

    The user code is executed on every call to :meth:`process`; the function named ``func_name`` found in it
    is called with ``file_reader.read(value)`` for each value of the input column.
    """

    def __init__(self, input_col, code, file_reader, func_name, fit_and_process_only_fits=True):
        super(FileFunctionPreprocessor, self).__init__()
        self.input_col = input_col
        self.code = code
        self.file_reader = file_reader
        self.func_name = func_name
        self.fit_and_process_only_fits = fit_and_process_only_fits

    def fit_and_process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Run :meth:`process` unless the step is configured as fit-only (there is nothing to fit)."""
        if self.fit_and_process_only_fits:
            return
        self.process(input_df, current_mf, output_ppr, generated_features_mapping)

    def process(self, input_df, current_mf, output_ppr, generated_features_mapping):
        """Apply the user function to every value of the input column and append the stacked outputs."""
        namespace = {}
        exec(self.code, namespace, namespace)
        column = input_df[self.input_col]
        block_name = "custom_file_prep_%s" % self.input_col
        per_row_outputs = column.apply(self.__apply_user_defined_func(namespace))
        blk = np.array(per_row_outputs.tolist())

        if not isinstance(blk, np.ndarray):
            message = u"Output of '{}' for feature '{}' should be a ndarray".format(
                safe_unicode_str(self.func_name), safe_unicode_str(self.input_col))
            raise safe_exception(ValueError, message)

        current_mf.append_np_block(block_name, blk, None)
        generated_features_mapping.add_whole_block_mapping(block_name, [self.input_col])

    def __apply_user_defined_func(self, dic):
        """Build the per-value callable: read the file, then call the user function looked up in ``dic``."""
        def read_then_transform(file_ref):
            with RaiseWithTraceback(u"Failed to preprocess the following file: '{}'".format(safe_unicode_str(file_ref))):
                user_func = dic[self.func_name]
                content = self.file_reader.read(file_ref)
                return user_func(content)
        return read_then_transform


def cubic_root(x):
    """Return ``x`` raised to the power one third."""
    one_third = 1. / 3.
    return x ** one_third


# Special version for the notebook. Works on a dataframe, not
# on a MultiFrame
def detect_outliers(df,
                    pca_kept_variance=0.9,
                    min_n=0,
                    min_cum_ratio=0.01,
                    random_state=1337):
    """Flag the rows of a dataframe that fall into outlier mini-clusters.

    The dataframe is reduced with a normalized PCA, then clustered with a KMeans using
    ``max(3, int(cubic_root(number of rows)))`` clusters. Clusters are sorted by ascending size; a cluster is an
    outlier when the cumulated share of rows up to and including it is below ``min_cum_ratio``, or when it
    holds fewer than ``min_n`` rows.

    :param df: dataframe to analyze
    :param pca_kept_variance: ``kept_variance`` given to the PCA
    :param min_n: clusters with strictly fewer rows than this are outliers
    :param min_cum_ratio: threshold on the cumulated ratio of rows of the smallest clusters
    :param random_state: random state given to the KMeans estimator
    :return: Series with one value per row of the reduced data, True when the row's cluster is an outlier
    """
    pca = PCA(kept_variance=pca_kept_variance, normalize=True)
    preproc_logger.debug("Outliers detection: fitting PCA")
    pca.fit(df)
    preproc_logger.debug("Outliers detection: performing PCA")
    df_reduced = pca.transform(df)
    n_lines = df_reduced.shape[0]
    n_clusters = max(3, int(cubic_root(n_lines)))
    preproc_logger.debug("Outliers detection: performing cubic-root kmeans on df %s" % str(df_reduced.shape))
    model = get_kmeans_estimator(n_clusters=n_clusters, random_state=random_state, n_init=10)
    labels = pd.Series(model.fit_predict(df_reduced.values))
    preproc_logger.debug("Outliers detection: selecting mini-clusters")
    # Ascending order, so that the cumulated ratio accumulates the smallest clusters first
    label_counts = pd.DataFrame(labels.value_counts(ascending=True))
    label_counts.columns = ["count"]
    label_counts["ratio"] = label_counts["count"] / label_counts["count"].sum()
    label_counts["cum_ratio"] = label_counts["ratio"].cumsum()
    # The threshold applies to the cumulated ratio, as in OutlierDetection._find_outliers (the per-cluster
    # ratio was compared here before, leaving "cum_ratio" unused)
    label_counts["outlier"] = (label_counts["cum_ratio"] < min_cum_ratio) | (label_counts["count"] < min_n)
    preproc_logger.debug("Outliers detection: done")
    # label_counts is indexed by cluster label: mapping turns each row's label into its outlier flag
    return labels.map(label_counts["outlier"])

class RandomColumnsGenerator(Step):
    """Step appending ``n_columns`` columns of standard-normal random values to the current multiframe.

    :param n_columns: number of random columns to generate
    """

    def __init__(self, n_columns):
        self.n_columns = n_columns

    def process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Append a "random_data" block of shape (rows of ``cur_mf``, ``n_columns``) to ``cur_mf``."""
        nrows = cur_mf.shape()[0]
        rnd = np.random.randn(nrows * self.n_columns).reshape(nrows, self.n_columns)
        # One name per generated column (formatting the range object itself would yield a single bogus name)
        names = ["rnd_%s" % idx for idx in xrange(self.n_columns)]
        cur_mf.append_np_block("random_data", rnd, names)

class NumericalFeaturesClusteringGenerator(Step):
    """Step generating features from a KMeans clustering of numerical input features.

    The clustering is fitted on columns of the "NUM_IMPUTED" block and stored in the
    "gen_numericals_clustering" resource under the "kmeans" key. Depending on
    ``settings["transformation_mode"]``, the step appends either the distance to the closest centroid
    ("REPLACE_BY_DISTANCE") or a dummified cluster id ("DUMMIFY_CLUSTERID").

    :param preprocessing_settings: preprocessing settings, read for their "per_feature" entry
    :param settings: settings of this generator ("behavior", "k", "all_features", "input_features",
        "transformation_mode")
    """

    def __init__(self, preprocessing_settings, settings):
        self.preprocessing_settings = preprocessing_settings
        self.settings = settings

    def init_resources(self, mp):
        self.res = mp.get_resource("gen_numericals_clustering", "pkl")

    def get_evolution_def(self):
        pass

    def set_evolution_state(self, es):
        pass

    def get_numerical_features(self):
        """Return the names of the features whose type is "NUMERIC" and role is "INPUT"."""
        return [name for (name, params) in self.preprocessing_settings["per_feature"].items()
                if params["type"] == "NUMERIC" and params["role"] == "INPUT"]

    def _get_clustering_input(self, cur_mf):
        """Return the columns of the "NUM_IMPUTED" block to cluster on, as selected by the settings.

        :raises Exception: if ``settings["behavior"]`` is not "ENABLED_MANUAL"
        """
        if self.settings["behavior"] != "ENABLED_MANUAL":
            raise Exception("Unimplemented")
        if self.settings["all_features"]:
            features = self.get_numerical_features()
        else:
            features = self.settings["input_features"]
        return cur_mf.get_block("NUM_IMPUTED").df[features]

    def perform_replacement(self, cur_mf, df, kmeans):
        """Append the features derived from the fitted ``kmeans`` applied on ``df`` to ``cur_mf``.

        Nothing is appended when the transformation mode is neither of the two supported ones.
        """
        k = self.settings["k"]
        if self.settings["transformation_mode"] == "REPLACE_BY_DISTANCE":
            distances = kmeans.transform(df)
            # Keep only the smallest distance of each row, as a single-column 2D array
            closest_distances = distances.min(axis=1).reshape(distances.shape[0], 1)
            # Lazy logging arguments: the arrays are only rendered when debug logging is enabled
            preproc_logger.debug("Distances: %s", distances)
            preproc_logger.debug("Closesst: %s", closest_distances)
            cur_mf.append_np_block("numericals_clustering", closest_distances, ["distance_to_centroid"])

        elif self.settings["transformation_mode"] == "DUMMIFY_CLUSTERID":
            labels = kmeans.predict(df)
            dumm = FastSparseDummifyProcessor(None, "numericals_clustering", xrange(k), False)._create_matrix(pd.Series(labels))
            preproc_logger.debug("Labels: %s", labels)
            preproc_logger.debug("Dummies: %s", dumm)
            cur_mf.append_sparse("numericals_clustering", dumm)

    def fit_and_process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Fit a KMeans with ``settings["k"]`` clusters, store it in the resource and apply the transformation."""
        df = self._get_clustering_input(cur_mf)
        k = self.settings["k"]

        preproc_logger.debug("Fitting clustering on %s" % (df.shape,))

        kmeans = get_kmeans_estimator(n_clusters=k, random_state=1337, n_init=10)
        kmeans.fit(df)
        self.res["kmeans"] = kmeans

        self.perform_replacement(cur_mf, df, kmeans)

    def process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Apply the transformation with the KMeans stored in the resource."""
        df = self._get_clustering_input(cur_mf)
        self.perform_replacement(cur_mf, df, self.res["kmeans"])


class CategoricalsImpactCodingTransformerGenerator(Step):
    """Placeholder step: it defines nothing beyond what it inherits from ``Step``."""



class CategoricalsCountTransformerGenerator(Step):
    """Step generating, for each selected categorical feature, a column holding the count of its value.

    The counts are computed at fit time on the input dataframe and stored in the "gen_categorical_counts"
    resource under "counts_<feature>" keys.

    :param preprocessing_settings: preprocessing settings, read for their "per_feature" entry
    :param settings: settings of this generator ("behavior", "all_features", "input_features")
    """

    def __init__(self, preprocessing_settings, settings):
        self.preprocessing_settings = preprocessing_settings
        self.settings = settings

    def init_resources(self, mp):
        self.res = mp.get_resource("gen_categorical_counts", "pkl")

    def get_evolution_def(self):
        pass

    def set_evolution_state(self, es):
        pass

    def get_input_features(self):
        """Return the names of the features to transform.

        With ``settings["all_features"]``, these are the features whose type is "CATEGORY" and role is
        "INPUT"; otherwise they are ``settings["input_features"]``.

        :raises Exception: if ``settings["behavior"]`` is not "ENABLED_MANUAL"
        """
        if self.settings["behavior"] != "ENABLED_MANUAL":
            raise Exception("Unimplemented")
        if self.settings["all_features"]:
            return [name for (name, params) in self.preprocessing_settings["per_feature"].items()
                    if params["type"] == "CATEGORY" and params["role"] == "INPUT"]
        # This branch used to assign the list without returning it, so callers received None
        return self.settings["input_features"]

    def fit_and_process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Record the value counts of each selected feature in the resource, then run :meth:`process`."""
        features = self.get_input_features()
        to_take = 200

        for feat in features:
            series = input_df[feat]
            counts = series.value_counts(dropna=False)
            # Only the first to_take + 1 entries of the value counts are recorded
            candidates = list(counts.iloc[0:(to_take + 1)].items())

            self.res["counts_%s" % feat] = candidates

        self.process(input_df, cur_mf, output_ppr, generated_features_mapping)

    def process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Append the "categoricals_count_transformer" block built from the recorded counts to ``cur_mf``."""
        features = self.get_input_features()

        new_df = pd.DataFrame()

        preproc_logger.debug("CCTF on %s" % features)
        preproc_logger.debug("Mapping: %s" % self.res)

        for feat in features:
            mapping = dict(self.res["counts_%s" % feat])
            # Values without a recorded count get a count of 1
            new_df["categoricals_count_transformer:%s" % feat] = input_df[feat].map(mapping).fillna(1)

        cur_mf.append_df("categoricals_count_transformer", new_df)


class OutlierDetection(Step):
    """Performs outliers detection.

    The rows are reduced with a PCA, clustered into mini-clusters with a KMeans, and the smallest
    mini-clusters are flagged as outliers. The outlier mask is stored as a new multiframe in the results
    (under ``outlier_name``), and the rows flagged as outliers are dropped from both the current multiframe
    and the input dataframe.

    The fitted PCA, the fitted KMeans and the list of outlier mini-cluster labels are kept in the
    "clustering_outliers" resource.
    """

    def __init__(self,
                 pca_kept_variance,
                 min_n,
                 min_cum_ratio,
                 outlier_name='OUTLIERS',
                 random_state=1337):
        # min_n: mini-clusters with strictly fewer rows than this are outliers
        self.min_n = min_n
        # min_cum_ratio: threshold on the cumulated ratio of rows of the smallest mini-clusters
        self.min_cum_ratio = min_cum_ratio
        self.pca_kept_variance = pca_kept_variance
        # Key under which the outlier mask multiframe is stored in the results
        self.outlier_name = outlier_name
        self.random_state = random_state

    def init_resources(self, mp):
        self.res = mp.get_resource("clustering_outliers", "pkl")

    def _find_outliers(self, mini_labels):
        """Return the labels of the outlier mini-clusters and the per-row outlier mask.

        :param mini_labels: Series holding the mini-cluster label of each row
        :return: tuple (list of outlier labels, ``mini_labels`` mapped to the outlier flag of its label)
        """
        preproc_logger.debug("Outliers detection: selecting mini-clusters")
        # Ascending order, so that the cumulated ratio accumulates the smallest mini-clusters first
        label_counts = pd.DataFrame(mini_labels.value_counts(ascending=True))
        label_counts.columns = ["count"]
        dataset_size = label_counts["count"].sum()
        label_counts["ratio"] = label_counts["count"] / dataset_size
        label_counts["cum_ratio"] = label_counts["ratio"].cumsum()
        label_counts["outlier"] = (label_counts["cum_ratio"] < self.min_cum_ratio) | (label_counts["count"] < self.min_n)
        # NOTE(review): presumably reports a diagnostic about min_n vs the dataset size -- confirm in
        # dataiku.doctor.diagnostics.clustering_parameters
        check_outliers_parameters(dataset_size, self.min_n)
        preproc_logger.debug("Outliers detection: done (%s mini-clusters are outliers)", label_counts["outlier"].sum())
        outliers_labels = label_counts[label_counts["outlier"]].index.tolist()
        return outliers_labels, mini_labels.map(label_counts["outlier"])

    def _apply_results(self, outliers_mask, cur_mf, input_df, output_ppr):
        """Publish the outlier mask in the results and drop the outlier rows, in place."""
        # Save outliers detection
        outliers_mf = MultiFrame()
        outliers_mf.append_df("outliers_block", pd.DataFrame({"data": outliers_mask}))
        output_ppr[self.outlier_name] = outliers_mf

        # Apply suppression
        if outliers_mask.sum() > 0:
            preproc_logger.debug("Remove some rows. Shape before:\n%s" % cur_mf.stats())
            cur_mf.drop_rows(outliers_mask, DropRowReason.CLUSTERING_OUTLIERS)
            preproc_logger.debug("Removed some rows. Shape after:\n%s" % cur_mf.stats())

            # The input dataframe is modified in place as well, by dropping the same rows
            input_df.drop(input_df.index[utils.series_nonzero(outliers_mask)], inplace=True)
            preproc_logger.debug("After outliers input_df=%s" % str(input_df.shape))

    def fit_and_process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Fit the PCA and the mini-clusters KMeans, detect the outliers and drop them.

        :raises DkuDroppedMultiframeException: if no row remains in ``cur_mf`` after dropping the outliers
        """
        inp = cur_mf.as_np_array()
        names = cur_mf.columns()

        if inp.shape[0] == 0:
            preproc_logger.warning("No remaining row after preprocessing, skipping OutlierDetection Step")
            return

        preproc_logger.debug("Outliers detection: fitting PCA")
        self.res["pca"] = PCA2(kept_variance=self.pca_kept_variance, random_state=self.random_state)
        self.res["pca"].fit(inp, names)
        preproc_logger.debug("Outliers detection: done fitting PCA")

        df_reduced = self.res["pca"].transform(inp, names)
        n_lines = df_reduced.shape[0]
        # Number of mini-clusters: cubic root of the number of rows, with a minimum of 3
        n_clusters = max(3, int(cubic_root(n_lines)))

        preproc_logger.debug("Outliers detection: performing cubic-root kmeans on df %s" % str(df_reduced.shape))
        self.res["mini_kmeans"] = get_kmeans_estimator(n_clusters=n_clusters, random_state=self.random_state, n_init=10)
        mini_labels = pd.Series(self.res["mini_kmeans"].fit_predict(df_reduced.values))
        preproc_logger.debug("Outliers detection: done kmeans")

        outliers_labels, outliers_mask = self._find_outliers(mini_labels)
        # Note: the logged number is the count of outlier mini-cluster labels, not of outlier rows
        preproc_logger.debug("Detected %d outliers" % len(outliers_labels))
        self._apply_results(outliers_mask, cur_mf, input_df, output_ppr)
        # Stored so that process() flags rows from the labels found at fit time
        self.res["outliers_labels"] = outliers_labels

        if cur_mf.shape()[0] == 0:
            raise DkuDroppedMultiframeException("Outliers detection: all rows have been dropped. Check mini-cluster size threshold")

    def process(self, input_df, cur_mf, output_ppr, generated_features_mapping):
        """Apply the fitted PCA and KMeans, and drop the rows assigned to an outlier mini-cluster."""
        inp = cur_mf.as_np_array()
        names = cur_mf.columns()

        if inp.shape[0] == 0:
            preproc_logger.warning("No remaining row after preprocessing, skipping OutlierDetection Step")
            return

        preproc_logger.debug("Outliers detection (apply): applying PCA")
        df_reduced = self.res["pca"].transform(inp, names)
        preproc_logger.debug("Outliers detection (apply): applying KMeans")
        mini_labels = pd.Series(self.res["mini_kmeans"].predict(df_reduced.values))
        preproc_logger.debug("Outliers detection (apply): using")
        if self.res.get("outliers_labels") is not None:
            outlier_labels = self.res["outliers_labels"]
        else:
            # Backward compatibility: only for clustering models trained on older DSS version
            # (the outlier labels are then recomputed from the data being processed)
            outlier_labels, _ = self._find_outliers(mini_labels)
        outliers_mask = mini_labels.isin(outlier_labels)
        self._apply_results(outliers_mask, cur_mf, input_df, output_ppr)

class PreprocessingResult(dict):
    """Dictionary that silently ignores item assignments for keys outside of ``retain``.

    When ``retain`` is None, every key is accepted. Only ``__setitem__`` is filtered.
    """

    def __init__(self, retain=None):
        self.retain = retain

    def __setitem__(self, k, v):
        allowed = self.retain
        if allowed is not None and k not in allowed:
            # Key is not retained: drop the assignment
            return
        dict.__setitem__(self, k, v)


class PreprocessingPipeline(object):
    """Ordered list of preprocessing steps, run one after the other on a fresh multiframe.

    Each step receives the input dataframe, the current multiframe, the results dictionary and the
    generated features mapping. When a step returns a non-None value, that value becomes the current
    multiframe for the following steps.
    """
    __slots__ = ('steps', 'results', 'generated_features_mapping', 'unrecorded_value')

    def __init__(self, steps, unrecorded_value=0.):
        self.steps = steps
        self.generated_features_mapping = GeneratedFeaturesMapping()
        self.unrecorded_value = unrecorded_value

    def __setstate__(self, state):
        # because of __slots__ the default __getstate__ and __setstate__ methods expect a tuple
        _unused, slot_values = state
        self.steps = slot_values["steps"]
        self.generated_features_mapping = slot_values["generated_features_mapping"]
        # Attribute added in 13.4.0 and must be set to 0 for compatibility with older ensemble models
        self.unrecorded_value = slot_values.get("unrecorded_value", 0.)

    def _new_multiframe(self, input_df):
        """Create the multiframe a run starts from: unrecorded value set, index taken from ``input_df``."""
        multiframe = MultiFrame()
        multiframe.set_unrecorded_value(self.unrecorded_value)
        multiframe.set_index_from_df(input_df)
        return multiframe

    def init_resources(self, resource_handler):
        """Give every step the opportunity to bind its resources."""
        for step in self.steps:
            step.init_resources(resource_handler)

    def fit_and_process(self, input_df, *args, **kwargs):
        """Fit and apply every step in order; return the results dictionary (extra arguments are ignored)."""
        outputs = {}
        active_mf = self._new_multiframe(input_df)
        for step in self.steps:
            preproc_logger.debug("FIT/PROCESS WITH %s" % step)
            returned_mf = step.fit_and_process(input_df, active_mf, outputs, self.generated_features_mapping)
            if returned_mf is None:
                continue
            active_mf = returned_mf
        return outputs

    def report_fit(self, ret_obj, core_params):
        """Forward the fit report request to every step."""
        for step in self.steps:
            step.report_fit(ret_obj, core_params)

    def process(self, input_df, retain=None):
        """Apply every step in order; return a ``PreprocessingResult`` restricted to the ``retain`` keys."""
        outputs = PreprocessingResult(retain=retain)

        active_mf = self._new_multiframe(input_df)
        for step in self.steps:
            preproc_logger.debug("PROCESS WITH %s" % step)
            returned_mf = step.process(input_df, active_mf, outputs, self.generated_features_mapping)
            if returned_mf is None:
                continue
            active_mf = returned_mf
        return outputs

class DkuDroppedMultiframeException(Exception):
    """Exception carrying a single, mandatory message."""

    def __init__(self, message):
        Exception.__init__(self, message)
