import logging
import shutil
from collections import Counter
from typing import Optional
import re

import pandas as pd

from dataiku.base.folder_context import FolderContext
from dataiku.core import doctor_constants
from dataiku.core.dku_pandas_csv import dataframe_to_csv
from dataiku.doctor.timeseries.models import TimeseriesForecastingAlgorithm
from dataiku.doctor.timeseries.preparation.preprocessing import get_external_features
from dataiku.doctor.timeseries.preparation.resampling.utils import get_monthly_day_alignment, get_frequency
from dataiku.doctor.timeseries.train.split_handler import TimeseriesInteractiveScoringDefaultSplitHandler
from dataiku.doctor.timeseries.utils import get_dataframe_of_timeseries_identifier, get_random_id, future_date_range

class TimeseriesInteractiveScenariosHandler:
    """Manages interactive scoring scenarios for time series models.

    Scenarios are stored in the `interactive-scoring-analysis` subfolder of the model folder.
    Each time series identifier is mapped (through `identifiers_mapping.json`) to its own
    subfolder, which holds `scenarios_metadata.json`, one `scenario-<id>.csv` file per scenario
    and the `scenario-forecasts-<id>.json.gz` files written by `write_scenarios`.
    """
    # Size of the color palette: `get_new_color` hands out ids in `range(COLOR_NUMBER)`
    # and only starts reusing colors once all of them are taken.
    COLOR_NUMBER = 15
    def __init__(self, model_folder_context, modeling_params, preprocessing_params, core_params, resampled_df):
        """Builds the handler and loads the persisted identifiers mapping, if any.

        :param model_folder_context: The folder context for the model. Scenarios are stored in its
            `interactive-scoring-analysis` subfolder, which is created when missing.
        :type model_folder_context: FolderContext
        :param modeling_params: The modeling parameters (must contain the `algorithm` key).
        :type modeling_params: dict
        :param preprocessing_params: The preprocessing parameters.
        :type preprocessing_params: dict
        :param core_params: The core parameters (time variable, prediction length, frequency...).
        :type core_params: dict
        :param resampled_df: The resampled dataframe.
        :type resampled_df: pd.DataFrame
        """
        self.interactive_scoring_model_folder_context = model_folder_context.get_subfolder_context('interactive-scoring-analysis')
        scoring_folder_context = self.interactive_scoring_model_folder_context
        scoring_folder_context.create_if_not_exist()

        # Remember the dtype of each external feature so scenarios can be re-read with the same types
        external_features = get_external_features(preprocessing_params)
        dtypes_by_feature = {}
        for feature in external_features:
            dtypes_by_feature[feature] = resampled_df[feature].dtype
        self.external_features_dtypes = dtypes_by_feature

        time_variable = core_params[doctor_constants.TIME_VARIABLE]
        self.scenario_columns = [time_variable] + external_features
        self.resampled_df = resampled_df
        self.time_variable = time_variable

        prediction_length = core_params[doctor_constants.PREDICTION_LENGTH]
        self.prediction_length = prediction_length
        self.frequency = get_frequency(core_params)
        self.monthly_day_alignment = get_monthly_day_alignment(core_params)

        self.interactive_scoring_split_handler = TimeseriesInteractiveScoringDefaultSplitHandler(core_params)
        algorithm = TimeseriesForecastingAlgorithm.build(modeling_params["algorithm"])
        self.min_size_for_scoring = algorithm.get_min_size_for_scoring(modeling_params, preprocessing_params, prediction_length)

        # Start from the mapping saved by a previous session when there is one
        mapping_filename = 'identifiers_mapping.json'
        if scoring_folder_context.isfile(mapping_filename):
            identifiers_mapping = scoring_folder_context.read_json(mapping_filename)
        else:
            identifiers_mapping = {}
        self.identifiers_mapping = identifiers_mapping

    def dump_identifiers_mapping(self):
        """Writes `identifiers_mapping.json` in `interactive-scoring-analysis` folder."""
        self.interactive_scoring_model_folder_context.write_json('identifiers_mapping.json', self.identifiers_mapping)

    def get_scenario_df(self, timeseries_identifier, scenario_id, for_scoring=False):
        """Retrieves a scenario as a pandas DataFrame.

        :param timeseries_identifier: The identifier of the time series.
        :type timeseries_identifier: str
        :param scenario_id: The ID of the scenario to retrieve.
        :type scenario_id: str
        :param for_scoring: Whether to apply dtypes for scoring.
        :type for_scoring: bool
        :return: The scenario DataFrame.
        :rtype: pd.DataFrame
        """
        identifier_id = self.identifiers_mapping.get(timeseries_identifier)
        if identifier_id is None:
            raise ValueError("No scenarios found for identifier {}".format(timeseries_identifier))
        identifier_folder_context = self._get_timeseries_identifier_folder_context(timeseries_identifier)
        if identifier_folder_context.isfile("scenario-{}.csv".format(scenario_id)):
            with identifier_folder_context.get_file_path_to_read("scenario-{}.csv".format(scenario_id)) as dataset_path:
                return pd.read_csv(dataset_path, sep="\t", dtype=self.external_features_dtypes if for_scoring else None)
        else:
            raise ValueError("{} scenario dataframe doesn't exist.".format(timeseries_identifier))

    def _get_scenario_with_start_date(self, timeseries_identifier, start_date):
        """Generates a scenario DataFrame starting from a specific date.

        The scenario covers `prediction_length` dates following the last historical row that is
        strictly before `start_date`. Values are taken from the resampled data where it has a row
        for the date (left merge), and are missing otherwise.

        :param timeseries_identifier: The identifier for the time series.
        :type timeseries_identifier: str
        :param start_date: The start date for the new scenario.
        :type start_date: str or datetime-like
        :return: A new scenario DataFrame.
        :rtype: pd.DataFrame
        :raises ValueError: If `start_date` is later than the first date following the available
            data, or if there is not enough history before `start_date` to score.
        """
        df_of_timeseries_identifier = get_dataframe_of_timeseries_identifier(self.resampled_df, timeseries_identifier)
        time_values = df_of_timeseries_identifier[self.time_variable]

        # Coerce once: a raw string cannot be ordered against a pandas Timestamp (TypeError),
        # while an already parsed date goes through unchanged.
        start_timestamp = pd.to_datetime(start_date)

        # The latest admissible start is the first date right after the known data
        max_start_date = future_date_range(time_values.iloc[-1], 1, self.frequency, self.monthly_day_alignment)[0]
        if start_timestamp > pd.to_datetime(max_start_date):
            raise ValueError("Specified start date {} is after the last available historical data for identifier {}".format(start_date, timeseries_identifier))

        historical_df_of_timeseries_identifier = df_of_timeseries_identifier[time_values < start_timestamp]
        # An empty history is rejected too, otherwise `.iloc[-1]` below fails with an IndexError
        if historical_df_of_timeseries_identifier.empty or len(historical_df_of_timeseries_identifier) < self.min_size_for_scoring:
            raise ValueError("Not enough historical data to score for identifier {} with specified start date {}".format(timeseries_identifier, start_date))

        last_historical_date = historical_df_of_timeseries_identifier[self.time_variable].iloc[-1]
        scenario_dates = future_date_range(last_historical_date, self.prediction_length, self.frequency, self.monthly_day_alignment)
        new_scenario_df = pd.DataFrame({self.time_variable: pd.to_datetime(scenario_dates)})
        # Left merge keeps every scenario date, even those without a matching row in the data
        new_scenario_df = pd.merge(new_scenario_df, df_of_timeseries_identifier, on=self.time_variable, how='left')

        return new_scenario_df

    def create_scenario(self, timeseries_identifier, scenario_id=None, start_date=None, scenario_name=None):
        """Creates a new scenario.

        A new scenario can be created either by duplicating an existing one (using `scenario_id`)
        or by generating one from a specific `start_date`. If neither is provided, a default
        scenario is created.

        :param timeseries_identifier: The identifier for the time series.
        :type timeseries_identifier: str
        :param scenario_id: The ID of an existing scenario to duplicate.
        :type scenario_id: str, optional
        :param start_date: The start date for a new scenario.
        :type start_date: str, optional
        :param scenario_name: The name for the new scenario. When omitted, a unique name is derived
            from the duplicated scenario's name, or from "historical".
        :type scenario_name: str, optional
        :raises ValueError: If both `scenario_id` and `start_date` are given, or if a scenario
            named `scenario_name` already exists for this identifier.
        """
        # Reject inconsistent arguments before anything is created on disk
        if scenario_id is not None and start_date is not None:
            raise ValueError("Both scenario_id and start_date cannot be specified at the same time when creating a scenario")

        identifier_scenarios_folder_context = self._init_identifiers_scenario_folder(timeseries_identifier)
        scenarios_metadata = self.get_scenarios_metadata(timeseries_identifier)
        existing_names = scenarios_metadata["names"]
        new_scenario_id = get_random_id()
        new_scenario_name = scenario_name or TimeseriesInteractiveScenariosHandler._get_new_scenario_name(existing_names.values(), existing_names.get(scenario_id, "historical"))

        if new_scenario_name in existing_names.values():
            raise ValueError("Scenario with name {} already exists".format(new_scenario_name))

        if scenario_id is not None:  # Simple use case: we create a duplicate
            with identifier_scenarios_folder_context.get_file_path_to_read("scenario-{}.csv".format(scenario_id)) as input_scenario_df_filename:
                with identifier_scenarios_folder_context.get_file_path_to_write("scenario-{}.csv".format(new_scenario_id)) as new_scenario_df_filename:
                    shutil.copy(input_scenario_df_filename, new_scenario_df_filename)
        else:
            if start_date is None:
                # Load default timeseries identifier scenario
                new_scenario_df = self._get_default_scenario(timeseries_identifier)
            else:
                new_scenario_df = self._get_scenario_with_start_date(timeseries_identifier, start_date)

            self._write_scenario_df(timeseries_identifier, new_scenario_id, new_scenario_df)

        # The metadata is only updated once the scenario file has been written
        scenarios_metadata["names"][new_scenario_id] = new_scenario_name
        scenarios_metadata["colors"][new_scenario_id] = TimeseriesInteractiveScenariosHandler.get_new_color(list(scenarios_metadata["colors"].values()))
        identifier_scenarios_folder_context.write_json("scenarios_metadata.json", scenarios_metadata)

    @staticmethod
    def _get_scenario_id(scenario_name, scenarios_metadata):
        """Finds a scenario ID from its name.

        :param scenario_name: The name of the scenario.
        :type scenario_name: str
        :param scenarios_metadata: The metadata dictionary for scenarios.
        :type scenarios_metadata: dict
        :return: The scenario ID if found, else None.
        :rtype: str, optional
        """
        for scenario_id, name in scenarios_metadata["names"].items():
            if name == scenario_name:
                return scenario_id
        return None

    def _write_scenario_df(self, timeseries_identifier, scenario_id, scenario_df):
        """Dumps a scenario DataFrame as `scenario-<scenario_id>.csv` in the identifier's folder.

        Only the scenario columns (time variable and external features) are written.

        :param timeseries_identifier: The identifier for the time series.
        :type timeseries_identifier: str
        :param scenario_id: The ID of the scenario.
        :type scenario_id: str
        :param scenario_df: The DataFrame to write.
        :type scenario_df: pd.DataFrame
        """
        target_folder_context = self._get_timeseries_identifier_folder_context(timeseries_identifier)
        scenario_filename = "scenario-{}.csv".format(scenario_id)
        with target_folder_context.get_file_path_to_write(scenario_filename) as scenario_df_file:
            columns_to_dump = scenario_df[self.scenario_columns]
            dataframe_to_csv(columns_to_dump, scenario_df_file, open, header=True)

    def _get_default_scenario(self, timeseries_identifier):
        """Builds the default scenario of a timeseries identifier.

        The default scenario is the second element of the first split produced by the
        interactive scoring split handler on the identifier's data.

        :param timeseries_identifier: The identifier for the time series.
        :type timeseries_identifier: str
        :return: The default scenario as a DataFrame.
        :rtype: pd.DataFrame
        """
        identifier_df = get_dataframe_of_timeseries_identifier(self.resampled_df, timeseries_identifier)
        splits = self.interactive_scoring_split_handler.split(identifier_df)
        # Only the first split is needed, and only its middle element
        _first, default_scenario_df, _last = next(splits)
        return default_scenario_df

    def _init_identifiers_scenario_folder(self, timeseries_identifier):
        """Makes sure the scenarios folder and metadata of a time series identifier exist.

        The identifier is registered in the identifiers mapping (with a new random ID when it is
        not known yet) and its folder is created if needed. When `scenarios_metadata.json` is
        absent or lacks one of the expected keys, it is (re)written, keeping any existing names
        and colors, and the identifiers mapping is dumped.

        :param timeseries_identifier: The identifier for the time series.
        :type timeseries_identifier: str
        :return: The folder context for the identifier's scenarios.
        :rtype: FolderContext
        """
        metadata_filename = "scenarios_metadata.json"
        required_keys = ("names", "colors", "defaultStartDate", "minStartDate", "maxStartDate")
        timestamp_format = '%Y-%m-%dT%H:%M:%S.%f'

        identifier_id = self.identifiers_mapping.get(timeseries_identifier, get_random_id())
        self.identifiers_mapping[timeseries_identifier] = identifier_id
        folder_context = self.interactive_scoring_model_folder_context.get_subfolder_context(identifier_id)
        folder_context.create_if_not_exist()

        if folder_context.isfile(metadata_filename):
            stored_metadata = folder_context.read_json(metadata_filename)
        else:
            stored_metadata = {}

        # Nothing to do when the metadata is already complete
        if all(key in stored_metadata for key in required_keys):
            return folder_context

        identifier_df = get_dataframe_of_timeseries_identifier(self.resampled_df, timeseries_identifier)
        names = stored_metadata.get("names", {})
        colors = stored_metadata.get("colors", {})
        # `[:-3]` truncates the microseconds given by `%f` to milliseconds
        default_start_date = self._get_default_scenario(timeseries_identifier)[self.time_variable].iloc[0].strftime(timestamp_format)[:-3]
        time_values = identifier_df[self.time_variable]
        min_start_date = time_values.iloc[self.min_size_for_scoring].strftime(timestamp_format)[:-3]
        # Unlike the two dates above, this one is stored as returned by `future_date_range`
        max_start_date = future_date_range(time_values.iloc[-1], 1, self.frequency, self.monthly_day_alignment)[0]

        rebuilt_metadata = {
            "names": names,
            "colors": colors,
            "defaultStartDate": default_start_date,
            "minStartDate": min_start_date,
            "maxStartDate": max_start_date
        }
        folder_context.write_json(metadata_filename, rebuilt_metadata)
        self.dump_identifiers_mapping()
        return folder_context

    @staticmethod
    def _get_new_scenario_name(scenarios_names, requested_name):
        """Generates a unique name for a new scenario, handling copies.

        :param scenarios_names: A list of existing scenario names.
        :type scenarios_names: list[str]
        :param requested_name: The desired name for the new scenario.
        :type requested_name: str
        :return: A unique scenario name.
        :rtype: str
        """
        scenarios_names_set = set(scenarios_names)

        if requested_name not in scenarios_names_set:
            return requested_name

        # Extract base name, to avoid "(copy) (copy) (copy)..."
        match = re.match(r'^(.*) \(copy(?: \d*)?\)$', requested_name)
        base_name = match.group(1) if match else requested_name

        # Try "base_name (copy)" first
        new_name = "{} (copy)".format(base_name)
        if new_name not in scenarios_names_set:
            return new_name

        # Iterate over "base_name (copy {idx})" until we have a match
        i = 2
        while True:
            new_name = "{} (copy {})".format(base_name, i)
            if new_name not in scenarios_names_set:
                return new_name
            i += 1

    def get_scenarios_metadata(self, timeseries_identifier):
        """Retrieves the metadata for all scenarios of a given timeseries identifier.

        :param timeseries_identifier: The identifier for the time series.
        :type timeseries_identifier: str
        :return: A dictionary containing the scenarios' metadata.
        :rtype: dict
        """
        identifier_id = self.identifiers_mapping.get(timeseries_identifier)
        if identifier_id is None:
            raise ValueError("No scenarios found for identifier {}".format(timeseries_identifier))
        timeseries_identifier_folder_context = self.interactive_scoring_model_folder_context.get_subfolder_context(identifier_id)
        if not timeseries_identifier_folder_context.isfile("scenarios_metadata.json"):
            raise ValueError("No scenarios metadata found for identifier {}".format(timeseries_identifier))
        return timeseries_identifier_folder_context.read_json("scenarios_metadata.json")

    def write_scenarios(self, scenarios_forecasts, timeseries_identifier, scenario_id):
        """Writes the forecasts for a given scenario.

        :param scenarios_forecasts: The forecast data to write.
        :type scenarios_forecasts: dict
        :param timeseries_identifier: The identifier for the time series.
        :type timeseries_identifier: str
        :param scenario_id: The ID of the scenario.
        :type scenario_id: str
        """
        identifier_id = self.identifiers_mapping.get(timeseries_identifier)
        if identifier_id is None:
            raise ValueError("No scenarios found for identifier {}".format(timeseries_identifier))
        timeseries_identifier_folder_context = self.interactive_scoring_model_folder_context.get_subfolder_context(identifier_id)
        timeseries_identifier_folder_context.write_json('scenario-forecasts-{}.json.gz'.format(scenario_id), scenarios_forecasts)

    def _get_timeseries_identifier_folder_context(self, timeseries_identifier):
        """Gets the folder context for a given timeseries identifier.

        :param timeseries_identifier: The identifier for the time series.
        :type timeseries_identifier: str
        :return: The FolderContext for the identifier.
        :rtype: FolderContext
        """
        identifier_id = self.identifiers_mapping.get(timeseries_identifier)
        if identifier_id is None:
            raise ValueError("No scenarios found for identifier {}".format(timeseries_identifier))
        return self.interactive_scoring_model_folder_context.get_subfolder_context(identifier_id)

    @staticmethod
    def get_new_color(color_ids):
        """Returns the first missing color if there is one. Otherwise, returns the least common one.

        :param color_ids: A list of currently used color IDs.
        :type color_ids: list[int]
        :return: The ID of the new color to use.
        :rtype: int
        """
        colors = Counter(color_ids)
        for color in range(TimeseriesInteractiveScenariosHandler.COLOR_NUMBER):
            if color not in colors:
                return color
        return colors.most_common()[::-1][0][0]
