from dataiku.doctor.diagnostics import diagnostics
from dataiku.doctor.diagnostics.diagnostics import DiagnosticType
from dataiku.doctor.timeseries.utils import timeseries_iterator
from dataiku.doctor.timeseries.utils import pretty_timeseries_identifiers
from dataiku.doctor.timeseries.utils import SINGLE_TIMESERIES_IDENTIFIER


# Ratio of zero-valued target rows within a single time series that must be
# strictly exceeded for check_zero_target_ratio to raise its diagnostic.
# Also rendered as a percentage (20%) in the diagnostic message.
ZERO_TARGET_MIN_RATIO_THRESHOLD = 0.2


def check_zero_target_ratio(df, timeseries_identifier_columns, target_column):
    """Raise a dataset sanity-check diagnostic when a time series has too many zero targets.

    Walks the (identifier, dataframe) pairs yielded by ``timeseries_iterator`` and,
    for each one, computes the share of rows whose ``target_column`` equals 0.
    The first time series whose share is strictly greater than
    ``ZERO_TARGET_MIN_RATIO_THRESHOLD`` triggers a single
    ``ML_DIAGNOSTICS_DATASET_SANITY_CHECKS`` diagnostic, after which the
    function stops looking at the remaining series.

    :param df: dataframe holding the (resampled) time series
    :param timeseries_identifier_columns: columns passed to ``timeseries_iterator``
        to split ``df`` into individual time series
    :param target_column: name of the column whose zero values are counted
    :return: None; the only effect is the call to ``diagnostics.add_or_update``
    """
    message_suffix = " contains more than {}% of zero values in the target column, which might yield undefined values for the MAPE metric."

    for series_id, series_df in timeseries_iterator(df, timeseries_identifier_columns):
        # NOTE(review): the division relies on series_df never being empty,
        # as stated by the original author -- confirm against timeseries_iterator.
        target_values = series_df[target_column]
        zero_count = (target_values == 0).sum()
        zero_ratio = zero_count / len(target_values)

        # Written as a negated ">" (rather than "<=") so that a NaN ratio is
        # skipped as well.
        if not (zero_ratio > ZERO_TARGET_MIN_RATIO_THRESHOLD):
            continue

        if series_id == SINGLE_TIMESERIES_IDENTIFIER:
            message_prefix = "Resampled time series"
        else:
            message_prefix = "At least one resampled time series {}".format(pretty_timeseries_identifiers(series_id))

        threshold_percent = int(100 * ZERO_TARGET_MIN_RATIO_THRESHOLD)
        diagnostics.add_or_update(
            DiagnosticType.ML_DIAGNOSTICS_DATASET_SANITY_CHECKS,
            message_prefix + message_suffix.format(threshold_percent),
        )
        # One diagnostic is enough: stop at the first offending time series.
        return
