import logging
from math import sqrt

import numpy as np
import pandas as pd
import scipy.stats
from six.moves import xrange
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from dataiku.core import doctor_constants
from dataiku.doctor.prediction.common import prepare_multiframe
from dataiku.doctor.prediction.common import weighted_quantile
from dataiku.doctor.prediction.custom_scoring import aggregate_custom_metrics_for_cross_val_model
from dataiku.doctor.prediction.custom_scoring import calculate_regression_custom_metrics
from dataiku.doctor.prediction.custom_scoring import get_custom_score_from_custom_metrics_results
from dataiku.doctor.prediction.linear_coefficients_computation import Denormalizer
from dataiku.doctor.prediction.linear_coefficients_computation import compute_coefs_if_available
from dataiku.doctor.prediction.overrides.ml_overrides_params import OVERRIDE_INFO_COL
from dataiku.doctor.prediction.overrides.ml_overrides_results import OverriddenPredictionResults
from dataiku.doctor.prediction.overrides.ml_overrides_results import OverridesResultsMixin
from dataiku.doctor.prediction.scorable_model import ScorableModel
from dataiku.doctor.prediction.scoring_base import BaseCVModelScorer
from dataiku.doctor.prediction.scoring_base import ClassicalPredictionModelScorer
from dataiku.doctor.prediction.scoring_base import PredictionModelIntrinsicScorer
from dataiku.doctor.prediction.scoring_base import PredictionModelScorer
from dataiku.doctor.prediction.scoring_base import build_partial_dependence_plot
from dataiku.doctor.preprocessing.assertions import MLAssertionMetrics
from dataiku.doctor.preprocessing.assertions import MLAssertionsMetrics
from dataiku.doctor.preprocessing.dataframe_preprocessing import RescalingProcessor2
from dataiku.doctor.utils import dku_nonaninf
from dataiku.doctor.utils.metrics import handle_failure
from dataiku.doctor.utils.metrics import mean_absolute_percentage_error
from dataiku.doctor.utils.metrics import rmsle_score
from dataiku.doctor.utils.stats import jarque_bera
from dataikuscoring.utils.prediction_result import PREDICTION_INTERVAL_LOWER
from dataikuscoring.utils.prediction_result import PREDICTION_INTERVAL_UPPER
from dataikuscoring.utils.prediction_result import PredictionResult
from dataikuscoring.utils.scoring_data import ScoringData

# Module-level logger used by the scorers and helper functions below.
logger = logging.getLogger(__name__)
# `stats.norm.ppf` is used below to build the theoretical quantiles returned with the residuals.
from scipy import stats


class RegressionModelIntrinsicScorer(PredictionModelIntrinsicScorer):
    """
    Intrinsic scorer for regression models.

    Starting from ``initial_intrinsic_perf_data``, ``score()`` optionally writes tree summaries and
    partial dependencies for tree-based algorithms, writes the LARS coefficient path, adds raw feature
    importance and coefficients when available, and finally dumps everything to "iperf.json".
    """

    def __init__(self, modeling_params, clf, train_X, train_y, pipeline, out_folder_context, prepared_X, initial_intrinsic_perf_data, with_sample_weight):
        PredictionModelIntrinsicScorer.__init__(self, modeling_params, clf, train_X, train_y, out_folder_context, prepared_X, with_sample_weight)
        self.pipeline = pipeline
        self.initial_intrinsic_perf_data = initial_intrinsic_perf_data
        # Populated by the first call to _extract_rescalers()
        self._rescalers = None

    def _extract_rescalers(self):
        """
        Return the pipeline steps that are instances of RescalingProcessor2.

        The list is computed on the first call and cached in ``self._rescalers`` afterwards.
        """
        if self._rescalers is not None:
            return self._rescalers
        self._rescalers = [step for step in self.pipeline.steps if isinstance(step, RescalingProcessor2)]
        return self._rescalers

    def _write_tree_reports(self, perf):
        """
        Write the tree(s) summary file and store the partial dependencies in ``perf``
        for decision tree, gradient boosting and random forest regressions.

        Does nothing besides extracting the rescalers for any other algorithm.
        """
        logger.info("Extracting rescalers")
        rescalers = self._extract_rescalers()
        algorithm = self.modeling_params['algorithm']

        if algorithm == 'DECISION_TREE_REGRESSION':
            logger.info("Creating decision tree summary")
            builder = TreeSummaryBuilder(self.clf, self.train_X.columns(), rescalers, True,
                                         self.with_sample_weight)
            tree_summary = builder.build()
            self.out_folder_context.write_json("tree.json", tree_summary)
            logger.info("Computing DT PDP")
        elif algorithm == 'GBT_REGRESSION':
            logger.info("Creating gradient boosting trees summary")
            builder = GradientBoostingSummaryBuilder(self.clf, self.train_X.columns(), rescalers, True,
                                                     self.modeling_params["max_ensemble_nodes_serialized"],
                                                     self.with_sample_weight)
            trees_summary = builder.build()
            self.out_folder_context.write_json("trees.json", trees_summary)
            logger.info("Computing GBT PDP")
        elif algorithm == 'RANDOM_FOREST_REGRESSION':
            logger.info("Creating random forest trees summary")
            builder = RandomForestSummaryBuilder(self.clf, self.train_X.columns(), rescalers, True,
                                                 self.modeling_params["max_ensemble_nodes_serialized"],
                                                 self.with_sample_weight)
            trees_summary = builder.build()
            self.out_folder_context.write_json("trees.json", trees_summary)
            logger.info("Computing RF PDP")
        else:
            # No tree report for the other algorithms
            return

        perf["partialDependencies"] = build_partial_dependence_plot(self.clf, self.train_X, self.train_y, rescalers)

    def score(self):
        """
        Compute the intrinsic performance data and write it to "iperf.json" in the output folder.

        The tree summaries / partial dependencies are skipped when the "skipExpensiveReports"
        modeling param is set.
        """
        perf = self.initial_intrinsic_perf_data

        logger.info("Intrinsic scoring")

        if not self.modeling_params.get("skipExpensiveReports"):
            self._write_tree_reports(perf)
        else:
            logger.info("Skipping potentially expensive reports")  # tree(s) summary, PDP

        if self.modeling_params['algorithm'] == 'LARS':
            # Each coefficient of the path is wrapped in its own single-element list
            lars_path = {
                "path": [[[value] for value in row] for row in self.clf.coef_path_],
                "features": self.train_X.columns(),
                "currentIndex": self.clf.current_index
            }
            self.out_folder_context.write_json("coef_path.json", lars_path)

        self.add_raw_feature_importance_if_exists(self.clf, perf)

        compute_coefs_if_available(self.clf, self.train_X, self.prepared_X, self.train_y, self._extract_rescalers(), perf, False)

        self.out_folder_context.write_json("iperf.json", perf)


def pearson_correlation(valid_y, preds, sample_weight=None):
    """
    Compute the Pearson correlation coefficient between the target and the predictions.

    :param valid_y: 1-dimensional array-like of ground truth values
    :param preds: 1-dimensional array-like of predicted values
    :param sample_weight: optional 1-dimensional array-like of weights. When provided, the weighted
                          correlation coefficient is computed instead.
    :return: the correlation coefficient (NaN when the weighted computation raises)
    """
    if sample_weight is None:
        results = pd.DataFrame({
            "__target__": valid_y,
            "predicted": preds
        })
        correlation = results[['predicted', '__target__']].corr()
        # Look the coefficient up by label: integer keys on a label-indexed Series are a deprecated
        # positional fallback in pandas.
        return correlation.loc['__target__', 'predicted']
    else:
        # https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Weighted_correlation_coefficient
        sum_w = np.sum(sample_weight)
        w_avg_y = np.sum(sample_weight * valid_y) / sum_w
        w_avg_yp = np.sum(sample_weight * preds) / sum_w
        w_sigma_y = np.sum(sample_weight * valid_y * valid_y) / sum_w - w_avg_y * w_avg_y
        w_sigma_yp = np.sum(sample_weight * preds * preds) / sum_w - w_avg_yp * w_avg_yp
        w_cov = np.sum(sample_weight * valid_y * preds) / sum_w - w_avg_y * w_avg_yp
        try:
            corr = w_cov / np.sqrt(w_sigma_y * w_sigma_yp)
        except Exception:
            # Best effort: a failing computation yields NaN, but KeyboardInterrupt / SystemExit
            # are no longer swallowed (they are not subclasses of Exception).
            corr = np.nan
        return corr


def compute_assertions_for_regression(preds, assertions):
    """
    Evaluate each assertion against the predictions.

    For every assertion, the ratio of rows of its mask whose prediction lies within
    [expectedMinValue, expectedMaxValue] is compared to "expectedValidRatio". When the mask
    selects no row, both the ratio and the result are None.

    :param preds: predictions, aligned with the assertion masks
    :param assertions: iterable of assertions exposing `mask`, `params`, `nb_initial_rows`
                       and a `printable_names()` method
    :rtype: MLAssertionsMetrics
    """
    all_metrics = MLAssertionsMetrics()
    logger.info("Computing assertions metrics for assertions {}".format(assertions.printable_names()))
    for assertion in assertions:
        # mask and decision are aligned, we can work with np arrays
        row_mask = assertion.mask.values
        expected = assertion.params["assertionCondition"]
        rows_in_mask = np.sum(row_mask)
        dropped_rows = assertion.nb_initial_rows - rows_in_mask

        above_min = preds >= expected["expectedMinValue"]
        below_max = preds <= expected["expectedMaxValue"]
        valid_rows_in_mask = np.sum(row_mask & above_min & below_max)

        valid_ratio = None
        result = None
        if rows_in_mask > 0:
            valid_ratio = 1.0 * valid_rows_in_mask / rows_in_mask
            result = bool(valid_ratio >= expected["expectedValidRatio"])

        all_metrics.add_assertion_metrics(
            MLAssertionMetrics(result, assertion.nb_initial_rows, dropped_rows, valid_ratio,
                               assertion.params["name"]))
    logger.info("Finished computing assertions metrics")
    return all_metrics


def compute_assertions_and_overrides_for_regression_from_clf(clf, model_type, modeling_params,
                                                             transformed, overrides_params):
    """
    Build a scorable model from `clf`, predict on `transformed` and derive assertions and overrides metrics.

    :return: tuple (assertions metrics, overrides metrics). The first item is None when `transformed`
             holds no "assertions" entry, the second one is None when the prediction result is not
             an OverridesResultsMixin.
    """
    logger.info("Computing assertions from model")
    scorable_model = ScorableModel.build(clf, model_type, doctor_constants.REGRESSION, modeling_params['algorithm'],
                                         overrides_params=overrides_params)
    prediction_result = get_predictions_for_regression(scorable_model, modeling_params, transformed)

    assertions_metrics = None
    if "assertions" in transformed:
        assertions_metrics = compute_assertions_for_regression(prediction_result.preds, transformed["assertions"])

    overrides_metrics = None
    if isinstance(prediction_result, OverridesResultsMixin):
        overrides_metrics = prediction_result.compute_and_return_overrides_metrics()

    return assertions_metrics, overrides_metrics


def compute_metrics(valid_y, preds, sample_weight=None, treat_failure_as_error=True):
    """
    Compute the regression metrics of the predictions against the target.

    Each metric is evaluated through `handle_failure` (which receives `treat_failure_as_error`)
    and its result is passed to `dku_nonaninf`.

    :param valid_y: ground truth values
    :param preds: predicted values
    :param sample_weight: optional sample weights, forwarded to every metric
    :param treat_failure_as_error: forwarded to `handle_failure`
    :return: dict with keys "evs", "mape", "mae", "mse", "rmse", "rmsle", "r2" and "pearson"
    """
    def guarded(metric_function):
        return dku_nonaninf(handle_failure(
            lambda: metric_function(valid_y, preds, sample_weight=sample_weight), treat_failure_as_error))

    # MSE is evaluated first since RMSE is derived from it
    mse = guarded(mean_squared_error)

    metrics = {}
    metrics["evs"] = guarded(explained_variance_score)
    metrics["mape"] = guarded(mean_absolute_percentage_error)
    metrics["mae"] = guarded(mean_absolute_error)
    metrics["mse"] = mse
    metrics["rmse"] = dku_nonaninf(None if mse is None else sqrt(mse))
    metrics["rmsle"] = guarded(rmsle_score)
    metrics["r2"] = guarded(r2_score)
    metrics["pearson"] = guarded(pearson_correlation)
    return metrics


def regression_predict_ensemble(model, data, has_target=False):
    """
    Predict with an ensemble model.

    :param model: model whose `clf` exposes `predict_as_dataframe` (and `set_with_target_pipelines_mode`)
    :param data: input data, handed as is to `model.clf.predict_as_dataframe`
    :param has_target: when truthy, `set_with_target_pipelines_mode(True)` is called on the ensemble first
    :rtype: ScoringData
    """
    if has_target:
        model.clf.set_with_target_pipelines_mode(True)
    predictions_df = model.clf.predict_as_dataframe(data)
    result = PredictionResult(predictions_df["prediction"].values)
    return ScoringData(prediction_result=result, preds_df=predictions_df)


def regression_predict_single(model, pipeline, modeling_params, data):
    """
    Run `data` through the preprocessing pipeline, then predict with a single (non ensemble) model.

    The returned predictions are indexed like the "TRAIN" output of the pipeline.

    :rtype: ScoringData
    """
    logger.info("Prepare to predict ...")
    processed = pipeline.process(data)
    train_multiframe = processed["TRAIN"]
    unprocessed = processed["UNPROCESSED"]
    # Second returned item is not needed here
    prepared_X, _ = prepare_multiframe(train_multiframe, modeling_params)

    return regression_predict_single_from_prepared(model, prepared_X, unprocessed, train_multiframe.index)


def regression_predict_single_from_prepared(model, X, unprocessed_df, orig_index):
    """
    Predict from already prepared features and index the resulting dataframe with `orig_index`.

    :param model: object exposing `compute_predictions(X, unprocessed_df)`
    :param X: prepared features
    :param unprocessed_df: second argument handed to `model.compute_predictions`
    :param orig_index: index to set on the predictions dataframe
    :rtype: ScoringData
    """
    logger.info("Start actual predict")
    result = model.compute_predictions(X, unprocessed_df)
    logger.info("Done actual predict, formatting output")
    formatted = result.as_dataframe()
    formatted.index = orig_index
    return ScoringData(prediction_result=result, preds_df=formatted)


def regression_predict(model, pipeline, modeling_params, data, ensemble_has_target=False):
    """
    Predict with either an ensemble ("PYTHON_ENSEMBLE" algorithm) or a single model.

    `pipeline` is only used for single models, `ensemble_has_target` only for ensembles.

    :rtype: ScoringData
    """
    logger.info("Start predict block")
    is_ensemble = modeling_params["algorithm"] == "PYTHON_ENSEMBLE"
    logger.info("Start actual predict")
    if not is_ensemble:
        scoring_data = regression_predict_single(model, pipeline, modeling_params, data)
    else:
        scoring_data = regression_predict_ensemble(model, data, has_target=ensemble_has_target)
    logger.info("Done actual predict")
    return scoring_data


def regression_scorer_with_valid(modeling_params, model, valid, model_folder_context, input_df_index, with_sample_weight=False):
    """
    Predict on the processed validation data and wrap the result in a RegressionModelScorer.

    :param valid: mapping holding at least "target", "TRAIN" and "UNPROCESSED"
                  (plus "weight" when `with_sample_weight` is set, and optionally "assertions")
    :param input_df_index: handed to the scorer as `test_df_index`
    :rtype: RegressionModelScorer
    """
    target = valid["target"]
    weights = valid["weight"] if with_sample_weight else None

    prediction_result = get_predictions_for_regression(model, modeling_params, valid)
    prediction_result.assert_not_all_declined()

    return RegressionModelScorer(modeling_params, prediction_result, target, model_folder_context,
                                 test_unprocessed=valid['UNPROCESSED'], test_X=valid["TRAIN"],
                                 test_df_index=input_df_index, test_sample_weight=weights,
                                 assertions=valid.get("assertions", None))


def compute_preds_pdf(preds, sample_weights=None):
    """
    Estimate the density of the predictions with a Gaussian KDE.

    :param preds: 1-dimensional array of predictions
    :param sample_weights: optional weights handed to the KDE
    :return: dict with "x" (100 evenly spaced points between the min and the max prediction)
             and "pdf" (the KDE evaluated on those points)
    """
    density = scipy.stats.gaussian_kde(preds, weights=sample_weights)
    grid = np.linspace(np.min(preds), np.max(preds), num=100)
    return {"x": grid, "pdf": density(grid)}


def get_predictions_for_regression(model, modeling_params, valid):
    """
    Prepare the "TRAIN" entry of `valid` and compute the model predictions on it.

    :param model: object exposing `compute_predictions`
    :param valid: mapping holding at least "TRAIN" and "UNPROCESSED"
    :rtype: PredictionResult
    """
    prepared_X, _ = prepare_multiframe(valid["TRAIN"], modeling_params)
    logger.info("Creating predictions on test set")
    return model.compute_predictions(prepared_X, valid["UNPROCESSED"])


class RegressionModelScorer(ClassicalPredictionModelScorer):
    """
    Scorer computing the performance data of a regression model on a test set: error distribution,
    residuals, scatter plot sample, metrics, global metrics, per-row predicted data and prediction PDF.

    When the prediction result is an OverriddenPredictionResults, a second scorer working on the raw
    (not overridden) predictions is kept in `scorer_without_overrides`.
    """

    def __init__(self, modeling_params, test_prediction_result, test_y, out_folder_context, test_unprocessed=None, test_X=None, test_df_index=None,
                 test_sample_weight=None, assertions=None):
        """
        :param dict modeling_params: modeling choices of the current ML task (see PredictionModelingParams.java in backend)
        :param PredictionResult test_prediction_result: output of model
        :param Series test_y: 1-dimensional array representing the ground truth target on the test set
        :param dataiku.base.folder_context.FolderContext | None out_folder_context: directory where predicted data and perf.json will be written
        :param pandas.DataFrame | None test_unprocessed: The "UNPROCESSED" value returned from processing the test dataset via pipeline.process().
        Required for the custom metric x_valid parameter.
        :param dataiku.doctor.multiframe.MultiFrame | None test_X: The "TRAIN" value returned from processing the test dataset via pipeline.process().
        If None, no data will be written on disk (e.g. training recipes).
        :param test_df_index: Pandas index of the input dataframe of the original test set, prior to any processing
        :param Series test_sample_weight: 1-dimensional array representing sample weights on the test set
        :param MLAssertions assertions: collection of assertions based on ML performance metrics
        """
        super(RegressionModelScorer, self).__init__(modeling_params, out_folder_context,
                                                    test_prediction_result.align_with_not_declined, test_y,
                                                    test_unprocessed, test_X, test_df_index, test_sample_weight,
                                                    assertions=assertions)
        self.test_prediction_result = test_prediction_result
        self.test_predictions = test_prediction_result.preds_not_declined
        self.scorer_without_overrides = self._instantiate_scorer_without_overrides(
            modeling_params, test_prediction_result, test_y, out_folder_context, test_unprocessed, test_X,
            test_df_index, test_sample_weight, assertions)

    @staticmethod
    def _instantiate_scorer_without_overrides(modeling_params, test_prediction_result, test_y, out_folder_context,
                                              test_unprocessed, test_X, test_df_index, test_sample_weight, assertions):
        """
        Build a scorer on the raw predictions when `test_prediction_result` is an OverriddenPredictionResults.

        :return: a RegressionModelScorer on `test_prediction_result.raw_prediction_result`, or None when the
                 prediction result is not an OverriddenPredictionResults
        """
        if not isinstance(test_prediction_result, OverriddenPredictionResults):
            return None
        # Since test_prediction_result.raw_prediction_result will never be of class
        # OverriddenPredictionResults we won't be entering into an instantiation loop for the Scorer
        raw_test_prediction_result = test_prediction_result.raw_prediction_result
        return RegressionModelScorer(modeling_params, raw_test_prediction_result, test_y, out_folder_context,
                                     test_unprocessed, test_X, test_df_index, test_sample_weight, assertions)

    def save(self, dump_predicted=True):
        """Save through PredictionModelScorer.save, then dump the regression statistics of the predictions."""
        PredictionModelScorer.save(self, dump_predicted)
        # Dump the prediction pdf
        save_regression_statistics(self.test_predictions, self.out_folder_context)

    def _do_score(self, with_assertions, treat_metrics_failure_as_error=True):
        """
        Fill `self.ret` with all the performance data and return it.

        :param with_assertions: when truthy (and assertions were provided), assertions metrics are computed
        :param treat_metrics_failure_as_error: forwarded to `compute_metrics`
        :return: `self.ret`, also stored in `self.perf_data`
        """

        self.ret["regression_performance"] = self.get_regression_performance(self.test_y, self.test_predictions, self.test_sample_weight)

        self.ret["residuals"] = self.compute_residuals(self.test_y, self.test_predictions, self.test_sample_weight)

        self.ret["scatterPlotData"] = self.compute_scatter_plot(self.test_predictions, self.test_y)

        self.ret["metrics"] = compute_metrics(self.test_y, self.test_predictions, self.test_sample_weight, treat_metrics_failure_as_error)

        if with_assertions and self.assertions:
            assertions_metrics = compute_assertions_for_regression(self.test_predictions, self.assertions)
            self.ret["metrics"]["assertionsMetrics"] = assertions_metrics.to_dict()

        if isinstance(self.test_prediction_result, OverridesResultsMixin):
            self.ret["metrics"]["overridesMetrics"] = self.test_prediction_result.compute_and_return_overrides_metrics().to_dict()

        if "customMetrics" in self.modeling_params["metrics"]:
            self.ret["metrics"]["customMetricsResults"] = calculate_regression_custom_metrics(self.modeling_params["metrics"],
                                                                                              self.test_unprocessed,
                                                                                              self.test_y,
                                                                                              self.test_predictions,
                                                                                              self.test_sample_weight)
            # "customScore" is deprecated in favour of custom metrics. We set it for backwards compatibility reasons
            if self.modeling_params["metrics"]["evaluationMetric"] == "CUSTOM":
                self.ret["metrics"]["customScore"] = get_custom_score_from_custom_metrics_results(
                    self.ret["metrics"]["customMetricsResults"],
                    self.modeling_params["metrics"]["customEvaluationMetricName"]
                )

        # Global metrics
        global_metrics = {}
        if self.test_sample_weight is not None:
            test_weight = self.test_sample_weight.sum()
            target_avg = np.dot(self.test_y, self.test_sample_weight) / test_weight
            pred_avg = np.dot(self.test_predictions, self.test_sample_weight) / test_weight
            global_metrics["testWeight"] = test_weight
            global_metrics["targetAvg"] = [ target_avg ]
            # max(..., 0.) guards against a slightly negative variance before taking the square root
            global_metrics["targetStd"] = [np.sqrt(max(np.dot(self.test_y ** 2, self.test_sample_weight) / test_weight - target_avg ** 2, 0.))]
            global_metrics["predictionAvg"] = [ pred_avg ]
            global_metrics["predictionStd"] = [np.sqrt(max(np.dot(self.test_predictions ** 2, self.test_sample_weight) / test_weight - pred_avg ** 2, 0.))]
        else:
            global_metrics["testWeight"] = self.test_y.shape[0]
            global_metrics["targetAvg"] = [self.test_y.mean()]
            global_metrics["targetStd"] = [self.test_y.std() if self.test_y.shape[0] > 1 else 0]
            global_metrics["predictionAvg"] = [self.test_predictions.mean()]
            global_metrics["predictionStd"] = [self.test_predictions.std() if self.test_predictions.shape[0] > 1 else 0]
        self.ret["globalMetrics"] = global_metrics

        if self.test_X_index is not None:
            self.compute_predicted_data()

        # compute pdf
        try:
            self.ret["predictionPDF"] = RegressionModelScorer.compute_preds_pdf(self.test_predictions, self.test_sample_weight)
        except Exception as err: #NOSONAR (catching all because several different exceptions can be raised by pdf computation)
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning("Could not compute prediction PDF: {}. Can be normal when performing subpopulation analysis.".format(str(err)))

        self.perf_data = self.ret
        return self.ret

    def compute_scatter_plot(self, preds, valid_Y, random_state=42, max_sample=1000):
        """
        Build the (actual, predicted) points of the scatter plot.

        :param preds: predicted values
        :param valid_Y: ground truth values
        :param random_state: seed of the sampling performed when there are more than `max_sample` rows
        :param max_sample: maximum number of points kept
        :return: dict with "x" (actual values) and "y" (predicted values rounded to 4 decimals)
        """
        logger.info("Computing scatter plot")
        both = pd.DataFrame({
            "predicted": preds,
            "actual": valid_Y
        })
        if both.shape[0] > max_sample:
            both = both.sample(max_sample, random_state=random_state)
        return {
            "x": both.actual.tolist(),
            "y": both.predicted.round(4).tolist()
        }

    @staticmethod
    def compute_preds_pdf(preds, sample_weights=None):
        """
        Estimate the density of the predictions with a Gaussian KDE.

        :return: dict with "x" (100 evenly spaced points between the min and the max prediction)
                 and "pdf" (the KDE evaluated on those points)
        """
        kde = scipy.stats.gaussian_kde(preds, weights=sample_weights)
        xmin = np.min(preds)
        xmax = np.max(preds)
        x = np.linspace(xmin, xmax, num=100)
        pdf = kde(x)
        return {"x": x, "pdf": pdf}

    def compute_predicted_data(self):
        """
        Build `self.predicted_df`: one row per entry of `self.test_df_index`, holding the prediction,
        the error columns and, when available, the prediction intervals and the override info.

        Relies on the series stored by `get_regression_performance`, which must have run before.
        """
        df = pd.DataFrame({"prediction": self.test_predictions,
                           "error": self.error_series,
                           "relative_error": self.relative_error_series,
                           "error_decile": self.error_bin_series,
                           "abs_error_decile": self.abs_error_bin_series},
                          columns=["prediction", "error", "relative_error", "error_decile", "abs_error_decile"])
        # Realign
        df.index = self.test_X_index
        if self.test_prediction_result.has_prediction_intervals():
            prediction_intervals = self.test_prediction_result.prediction_intervals_not_declined
            df[PREDICTION_INTERVAL_LOWER] = prediction_intervals[:, 0]
            df[PREDICTION_INTERVAL_UPPER] = prediction_intervals[:, 1]

        full = pd.DataFrame(index=self.test_df_index)
        df = full.join(df, how="left")
        # Add override info after realigning and joining since they are including the declined rows,
        # which are not included on the test_X_index
        if isinstance(self.test_prediction_result, OverridesResultsMixin):
            df[OVERRIDE_INFO_COL] = self.test_prediction_result.compute_and_return_info_column()
        self.predicted_df = df

    @staticmethod
    def _cut_in_ten_bins(values):
        """Cut `values` in 10 bins labelled 0 to 9 and return the tuple (bin id of each value, bin edges)."""
        try:
            return pd.cut(values, 10, labels=xrange(0, 10), retbins=True)
        except Exception as e:  # NOSONAR (same broad catch as the fallback has always relied on)
            logger.error(e)
            # ugly hack: when all errors are almost the same, the pd.cut fails, but if you slightly modify them, then it's fine again
            return pd.cut(0.999 * values, 10, labels=xrange(0, 10), retbins=True)

    @staticmethod
    def _build_error_distribution(counts_per_bin, bin_edges):
        """Turn a dataframe with columns "error_bin_id" and "count" into the list of bins of the error distribution."""
        distrib = []
        for row in counts_per_bin.itertuples():
            bin_id = row.error_bin_id
            distrib.append({
                "bin_id": bin_id,
                "bin_min": bin_edges[bin_id],
                "bin_max": bin_edges[bin_id + 1],
                "count": dku_nonaninf(row.count)
            })
        return distrib

    def get_regression_performance(self, valid_y, preds, sample_weight=None):
        """
        Compute the error (target - predicted) distribution and its summary statistics.

        As a side effect, stores `error_series`, `relative_error_series`, `error_bin_series` and
        `abs_error_bin_series` on the scorer (they are consumed by `compute_predicted_data`).

        :param valid_y: ground truth values
        :param preds: predicted values
        :param sample_weight: optional sample weights; when provided, bin counts, averages, standard
                              deviations and quantiles are weighted
        :return: dict with the "error_distribution" bins and the min / max / average / std / quantiles
                 of the errors ("raw_*" keys are computed on non winsorized errors)
        """
        logger.info("Computing regression performance")

        # Base data
        results = pd.DataFrame({
            "target": valid_y,
            "predicted": preds
        })
        # Error
        results['error'] = results['target'] - results['predicted']
        results['relative_error'] = results['error'] / results['target']

        self.error_series = results["error"]
        self.relative_error_series = results["relative_error"]

        if sample_weight is not None:
            results["sample_weight"] = sample_weight
            # Errors and weights must be sorted by error for the weighted quantiles computed below
            results.sort_values(by=["error"], ascending=True, inplace=True)

        # Winsorize
        error_clipped = self._winsorize(results["error"], 0.02, 0.98)

        # Cut
        (cut_categorical, cut_mins) = self._cut_in_ten_bins(error_clipped)

        results['error_bin_id'] = cut_categorical
        self.error_bin_series = results["error_bin_id"]

        self.abs_error_bin_series = self._cut_in_ten_bins(error_clipped.abs())[0]

        if sample_weight is None:
            ags = results.groupby('error_bin_id')['target'].count().reset_index()
            ags.columns = ["error_bin_id", "count"]
            return {
                'error_distribution': self._build_error_distribution(ags, cut_mins),
                'raw_min_error': results['error'].min(),
                'min_error': error_clipped.min(),
                'p25_error': results['error'].quantile(.25),
                'median_error': results['error'].median(),
                'average_error':  error_clipped.mean(),
                'raw_average_error': results['error'].mean(),
                'std_error': dku_nonaninf( error_clipped.std()),
                'raw_std_error': dku_nonaninf( results['error'].std()),
                'p75_error': dku_nonaninf( results['error'].quantile(.75)),
                'p90_error': dku_nonaninf( results['error'].quantile(.90)),
                'max_error': error_clipped.max(),
                'raw_max_error': results['error'].max(),
            }
        else:
            errors = results['error'].values
            weights = results['sample_weight'].values
            cumsum_weights = np.cumsum(weights)
            sum_weights = cumsum_weights[-1]
            raw_w_avg_e = np.dot(errors, weights) / sum_weights
            w_avg_e = np.dot(error_clipped, weights) / sum_weights
            # Weighted count of a bin: sum of the weights of its rows
            ags = results.groupby('error_bin_id')['sample_weight'].sum().reset_index()
            ags.columns = ["error_bin_id", "count"]
            return {
                'error_distribution': self._build_error_distribution(ags, cut_mins),
                'raw_min_error': results['error'].min(),
                'min_error': error_clipped.min(),
                'p25_error': weighted_quantile(errors, weights, .25, cumsum_weights=cumsum_weights),
                'median_error': weighted_quantile(errors, weights, .5, cumsum_weights=cumsum_weights),
                'average_error': w_avg_e,
                'raw_average_error': raw_w_avg_e,
                'std_error': np.sqrt(max(np.dot(np.square(error_clipped), weights) / sum_weights - w_avg_e * w_avg_e, 0.)),
                'raw_std_error': np.sqrt(max(np.dot(np.square(errors), weights) / sum_weights - raw_w_avg_e * raw_w_avg_e, 0.)),
                'p75_error': weighted_quantile(errors, weights, .75, cumsum_weights=cumsum_weights),
                'p90_error': weighted_quantile(errors, weights, .90, cumsum_weights=cumsum_weights),
                'max_error': error_clipped.max(),
                'raw_max_error': results['error'].max(),
            }

    def compute_residuals(self, valid_y, preds, sample_weight=None):
        """
        Compute the data of the residuals (valid_y - preds) report.

        :param valid_y: ground truth values
        :param preds: predicted values
        :param sample_weight: optional sample weights, used for the mean / std of the winsorized
                              residuals and handed to `jarque_bera`
        :return: dict with a sample of at most 10000 winsorized residuals, their mean and std, 1000
                 quantiles of the standardized residuals with the matching theoretical normal
                 quantiles, and the Jarque-Bera statistics
        """
        logger.info("Computing residuals")

        # Base data
        residuals = valid_y - preds

        # Clipping
        residuals_clipped = self._winsorize(residuals, 0.02, 0.98)

        # Compute statistics on all the clipped values - the non clipped mean/std are computed on the fly where needed reusing valid_y and preds
        residuals_clipped_mean = residuals_clipped.mean() if sample_weight is None else np.average(residuals_clipped, axis=0, weights=sample_weight)
        residuals_clipped_std = residuals_clipped.std() if sample_weight is None else np.sqrt(np.cov(residuals_clipped, aweights=sample_weight))
        jb, jb_pvalue, skew, kurtosis = jarque_bera(residuals, weights=sample_weight)

        # Keep some sample residual values in order to display an histogram and a Q-Q plot of those in the Residuals model report section
        number_of_residual_values_for_graphs = 10000
        if len(residuals) > number_of_residual_values_for_graphs:
            residuals_clipped = residuals_clipped.sample(n=number_of_residual_values_for_graphs, random_state=1337).values
        else:
            residuals_clipped = residuals_clipped.values

        residuals_std = np.std(residuals) # Not weighted, should it be ?
        if residuals_std == 0:
            # All data is identical
            residuals_quantiles = np.zeros(1000)
        else:
            std_residuals = (residuals - np.average(residuals)) / residuals_std
            residuals_quantiles = np.quantile(std_residuals, np.linspace(0.0, 1.0, 1000), axis=0)
        theoretical_quantiles = stats.norm.ppf(np.arange(1.0, len(residuals_quantiles) + 1) / (len(residuals_quantiles) + 1))

        return {
            "residuals": residuals_clipped.tolist(),
            "residualsMean": residuals_clipped_mean,
            "residualsStd": residuals_clipped_std, ## residuals_std: single value for the sample standard deviation of the residuals
            "stdResiduals": residuals_quantiles.tolist(), ## std_residuals: series for the standardized values of the residuals
            "theoreticalQuantiles": theoretical_quantiles.tolist(),
            "stats": {
                "jarqueBera": jb,
                "jarqueBeraPValue": jb_pvalue,
                "skew": skew,
                "kurtosis": kurtosis,
            }
        }

    @staticmethod
    def _winsorize(data, lower_quantile=0.02, upper_quantile=0.98):
        """
        Perform winsorization on a pandas Series or DataFrame: extreme values are replaced
        by the value of the nearest bound of a specified quantile range.

        For example winsorization of a series [1, 2, 3, ..., 98, 99, 100] for lower_quantile=0.02 and upper_quantile=0.98
        will return a series [2.98, 2.98, 3, ..., 98, 98.02, 98.02]
        Note that in this example the function is returning 2.98 instead of 3 for the 0.02 quantile
        because we are using pandas.DataFrame.quantile with a 'linear' interpolation.

        Note that NaN values compare False against both bounds, so they end up replaced by the
        lower quantile value.

        Parameters
        ----------
        data : pandas.Series or pandas.DataFrame
            The data to be winsorized.
        lower_quantile : float, default=0.02
            The lower threshold quantile, between 0 and 1.
            Values below this quantile will be set to the quantile value.
        upper_quantile : float, default=0.98
            The upper threshold quantile, between 0 and 1.
            Values above this quantile will be set to the quantile value.

        Returns
        -------
        pandas.Series or pandas.DataFrame
            The winsorized data with the same shape as the input.

        Example
        -------
        winsorized_series = RegressionModelScorer._winsorize(pd.Series(range(100)), lower_quantile=0.02, upper_quantile=0.98)
        assert_array_equal(winsorized_series, [1.98, 1.98] + list(range(2, 98)) + [97.02, 97.02])
        """
        min_value = data.quantile(lower_quantile)
        max_value = data.quantile(upper_quantile)
        return data.where(data < max_value, other=max_value).where(data > min_value, other=min_value)


class CVRegressionModelScorer(BaseCVModelScorer):
    """
    Aggregates the per-fold performance data of a cross-validated regression model.

    Each aggregated metric is reported as its mean over the folds, and its standard deviation is stored
    under the "<metric>std" key. Scatter plot and regression performance data are taken from the first fold only.
    """

    def __init__(self, scorers):
        super(CVRegressionModelScorer, self).__init__(scorers)

    def score(self):
        """
        Fill `self.ret` with the metrics aggregated over all folds and return it.

        NOTE(review): `self.perfdatas`, `self.ret` and `DISCARDED_METRICS_FOR_AGG` are not defined in this
        class; presumably they are provided by BaseCVModelScorer - confirm.
        """
        super(CVRegressionModelScorer, self).score()

        first_fold = self.perfdatas[0]
        self.r1 = first_fold

        self.ret["metrics"] = {}
        # The metrics of the first fold define which metrics get aggregated
        for metric in first_fold["metrics"]:
            if metric in self.DISCARDED_METRICS_FOR_AGG:
                logger.info("Not aggregating metric '%s'" % metric)
                continue

            if metric == "customMetricsResults":
                # Custom metrics have their own aggregation logic
                per_fold_custom = [fold["metrics"]["customMetricsResults"] for fold in self.perfdatas]
                self.ret["metrics"]["customMetricsResults"] = aggregate_custom_metrics_for_cross_val_model(per_fold_custom)
                continue

            # A fold where the metric is None contributes NaN, which nanmean / nanstd then ignore
            fold_values = []
            for fold in self.perfdatas:
                value = fold["metrics"][metric]
                fold_values.append(np.nan if value is None else value)
            data = np.array(fold_values)
            self.ret["metrics"][metric] = dku_nonaninf(np.nanmean(data))
            self.ret["metrics"][metric + "std"] = dku_nonaninf(np.nanstd(data))

        for first_fold_only_key in ("scatterPlotData", "regression_performance"):
            self.ret[first_fold_only_key] = first_fold[first_fold_only_key]

        return self.ret


def make_tree_data(extract, feature_names, rescalers, is_regression, with_sample_weight, class_weight=None):
    """
    Convert the arrays describing a fitted tree into a dict of plain lists.

    :param extract: tree structure (callers in this module pass the `tree_` attribute of a fitted estimator)
                    exposing `feature`, `threshold`, `impurity`, `children_left`, `children_right`,
                    `n_node_samples`, `weighted_n_node_samples` and `value`
    :param feature_names: feature names, indexed by the feature indices stored in `extract.feature`
    :param rescalers: rescalers handed to the Denormalizer used to denormalize the split thresholds
    :param is_regression: if True, export one predicted value per node ("predict"); otherwise export the
                          per-node class probabilities ("probas") and "targetClassesProportions"
    :param with_sample_weight: if True and no class_weight is given, also export "nSamplesWeighted"
    :param class_weight: optional mapping class -> weight, used to undo the class re-weighting when computing
                         "targetClassesProportions"
                         (NOTE(review): assumes the mapping iterates in the same order as the class columns
                         of `extract.value` - confirm)
    :return: dict with the keys "leftChild", "rightChild", "impurity", "threshold", "nSamples", "feature",
             plus the optional keys described above
    """
    denorm = Denormalizer(rescalers)

    def denormalize_sanitize(feat, threshold):
        if feat >= 0:
            # Nodes with actual split i.e. non-leaves
            # Leaves encode a threshold with -2 value (never to be used)
            threshold = denorm.denormalize_feature_value(feature_names[feat], threshold)
        if np.isinf(threshold):
            # Infinite thresholds are replaced by a signed finite constant
            threshold = np.sign(threshold) * doctor_constants.DKU_JSON_INFINITY
        return threshold

    features = extract.feature.tolist()
    thresholds = [denormalize_sanitize(ft, thresh) for (ft, thresh) in zip(features, extract.threshold.tolist())]
    # NaN impurities are exported as -1
    impurity = [x if not np.isnan(x) else -1 for x in extract.impurity.tolist()]
    tree = {
        "leftChild": extract.children_left.tolist(),
        "rightChild": extract.children_right.tolist(),
        "impurity": impurity,
        "threshold": thresholds,
        "nSamples": extract.n_node_samples.tolist(),
        "feature": features
    }

    if with_sample_weight and class_weight is None:
        tree["nSamplesWeighted"] = extract.weighted_n_node_samples.tolist()

    if is_regression:
        tree["predict"] = [x[0][0] for x in extract.value]
        return tree

    # Normalize the per-class values of each node so that they sum to 1
    probas = []
    for node_value in extract.value:
        class_values = node_value[0]
        total = sum(class_values)
        probas.append([class_value / total for class_value in class_values])
    tree["probas"] = probas

    if class_weight is None:
        tree["targetClassesProportions"] = tree["probas"]
        return tree

    try:
        classes_proportions = []
        for node_value in extract.value:
            # Divide each class value by its weight to get back to un-weighted proportions
            rebalanced_probas = [node_value[0][i] / class_weight[k] for i, k in enumerate(class_weight.keys())]
            norm = sum(rebalanced_probas)
            classes_proportions.append([val / norm for val in rebalanced_probas])
        tree["targetClassesProportions"] = classes_proportions
    except Exception:
        # Best effort: on any failure (zero division, class_weight that is not a mapping, ...) the
        # "targetClassesProportions" key is simply left out. `Exception` (rather than a bare `except`)
        # lets KeyboardInterrupt / SystemExit propagate.
        logging.warning("Could not compute target classes ratio (division by zero)")
    return tree


class TreeSummaryBuilder(object):
    """Builds the summary dict of a model made of a single fitted tree (read from `model.tree_`)."""

    def __init__(self, model, feature_names, rescalers, is_regression, with_sample_weight, classes=None):
        self.model = model
        self.featureNames = feature_names
        self.rescalers = rescalers
        self.is_regression = is_regression
        self.with_sample_weight = with_sample_weight
        self.classes = classes

    def build(self):
        """
        :return: dict with the exported tree ("tree") and the feature names ("featureNames"),
                 plus the classes ("classes") when the model is not a regression
        """
        # class_weight is read from the model parameters (None when the model has no such parameter)
        model_params = self.model.get_params()
        tree_data = make_tree_data(
            self.model.tree_,
            self.featureNames,
            self.rescalers,
            self.is_regression,
            self.with_sample_weight,
            class_weight=model_params.get("class_weight", None)
        )
        result = {"tree": tree_data, "featureNames": self.featureNames}
        if self.is_regression:
            return result
        result["classes"] = self.classes
        return result


class GradientBoostingSummaryBuilder(object):
    """
    Builds the summary dict of a gradient boosting model, keeping only the first trees whose
    cumulated number of nodes stays within `max_nodes` (at least one tree is always requested).
    """

    def __init__(self, model, featureNames, rescalers, is_regression, max_nodes, with_sample_weight, classes=None):
        self.model = model
        self.featureNames = featureNames
        self.rescalers = rescalers
        self.is_regression = is_regression
        self.max_nodes = max_nodes
        self.with_sample_weight = with_sample_weight
        self.classes = classes

    def build(self):
        """
        :return: dict with the exported trees ("trees"), the feature names ("featureNames"), whether some
                 trees were left out ("was_clipped"), plus the classes ("classes") when not a regression
        """
        estimators = self.model.estimators_
        # Only the first element of each entry of `estimators_` is used
        node_counts = [len(stage[0].tree_.feature.tolist()) for stage in estimators]
        cumulated_node_counts = np.cumsum(node_counts)
        nb_within_budget = len([total for total in cumulated_node_counts if total <= self.max_nodes])
        taken = max(1, nb_within_budget)

        trees = []
        for stage in estimators[0: taken]:
            # Trees are always exported with is_regression=True, whatever self.is_regression is
            # scikit-learn GBT does not support class_weight (as of version 0.23)
            trees.append(make_tree_data(stage[0].tree_, self.featureNames, self.rescalers, True,
                                        self.with_sample_weight, class_weight=None))

        summary = {
            "trees": trees,
            "featureNames": self.featureNames,
            "was_clipped": taken != len(estimators)
        }
        if not self.is_regression:
            summary["classes"] = self.classes
        return summary


class RandomForestSummaryBuilder(object):
    """
    Builds the summary dict of a random forest model, keeping only the first trees whose
    cumulated number of nodes stays within `max_nodes` (at least one tree is always requested).
    """

    def __init__(self, model, featureNames, rescalers, is_regression, max_nodes, with_sample_weight, classes=None):
        self.model = model
        self.featureNames = featureNames
        self.rescalers = rescalers
        self.is_regression = is_regression
        self.max_nodes = max_nodes
        self.with_sample_weight = with_sample_weight
        self.classes = classes

    def build(self):
        """
        :return: dict with the exported trees ("trees"), the feature names ("featureNames"), whether some
                 trees were left out ("was_clipped"), plus the classes ("classes") when not a regression
        """
        estimators = self.model.estimators_
        node_counts = [len(estimator.tree_.feature.tolist()) for estimator in estimators]
        cumulated_node_counts = np.cumsum(node_counts)
        nb_within_budget = len([total for total in cumulated_node_counts if total <= self.max_nodes])
        taken = max(nb_within_budget, 1)

        # class_weight is read from the model parameters (None when the model has no such parameter)
        class_weight = self.model.get_params().get("class_weight", None)

        trees = []
        for estimator in estimators[0: taken]:
            trees.append(make_tree_data(estimator.tree_, self.featureNames, self.rescalers, self.is_regression,
                                        self.with_sample_weight, class_weight=class_weight))

        summary = {
            "trees": trees,
            "featureNames": self.featureNames,
            "was_clipped": taken != len(estimators)
        }
        if not self.is_regression:
            summary["classes"] = self.classes
        return summary


def save_regression_statistics(pred_df, base_folder_context, filename=None):
    """
    Compute the PDF of the predictions and write it, together with the predictions rounded to
    4 decimals, as JSON through `base_folder_context`.

    Nothing is written when `pred_df` has no rows. This is best effort: any exception raised while
    computing or writing the statistics is logged as a warning and not propagated.

    :param pred_df: the predictions; must expose `shape`, `round` and `tolist`
                    (presumably a pandas Series - confirm against callers)
    :param base_folder_context: object whose `write_json(name, data)` method is used to persist the statistics
    :param filename: name of the written file; "prediction_statistics.json" is used when it is None or empty
    """
    has_predictions = pred_df.shape[0] != 0
    if has_predictions:
        try:
            pdf_data = RegressionModelScorer.compute_preds_pdf(pred_df)
            statistics = {
                "x": pdf_data['x'],
                "pdf": pdf_data['pdf'],
                "predictions": pred_df.round(4).tolist()
            }
            target_name = filename if filename else "prediction_statistics.json"
            base_folder_context.write_json(target_name, statistics)
        except Exception: #NOSONAR (catching all because several different exceptions can be raised by pdf computation)
            logger.warning("Could not compute prediction PDF.")
