import logging

import sklearn

from dataiku.base.utils import package_is_at_least
from dataiku.doctor.sparse import prepare_multiframe_with_sparse_support, AlgorithmSparseSupport

logger = logging.getLogger(__name__)


class ClusteringAlgorithmsSparseSupport(AlgorithmSparseSupport):
    """Sparse-matrix (CSR) support policy for clustering algorithms.

    Specializes ``AlgorithmSparseSupport`` with the set of clustering
    algorithms whose CSR usage is configurable, plus an Isolation Forest
    specific guard around a known scikit-learn bug.
    """

    # Clustering algorithm identifiers for which CSR support can be toggled.
    ALGORITHMS_WITH_SETTABLE_CSR_SUPPORT = {
        'KMEANS',
        'MiniBatchKMeans',
        'PY_ISOLATION_FOREST',
    }

    def should_use_csr(self, multiframe):
        """Tell whether ``multiframe`` should be handled as a CSR matrix.

        For Isolation Forest, CSR is refused when contamination is not set to
        auto and the installed scikit-learn is >= 1.3 and < 1.4, to avoid
        https://github.com/scikit-learn/scikit-learn/pull/27645.
        In every other situation the decision is delegated to the parent class.
        """
        if self.algorithm == 'PY_ISOLATION_FOREST':
            iforest_params = self.modeling_params["isolation_forest"]
            fixed_contamination = not iforest_params.get("use_auto_contamination", False)
            # The version probes are only evaluated when contamination is not "auto".
            if fixed_contamination and package_is_at_least(sklearn, "1.3") and not package_is_at_least(sklearn, "1.4"):
                logger.warning("Disabling sparse matrices support to avoid bug #27645 in scikit (scikit >= 1.3 and < 1.4 and contamination is not set to auto), using NPA")
                return False

        return super(ClusteringAlgorithmsSparseSupport, self).should_use_csr(multiframe)

    def should_allow_sparse_matrices(self):
        """Return the ``allow_sparse_matrices`` setting (``False`` when absent).

        Clustering hyperparameters are stored directly at the top level of
        ``self.modeling_params`` (unlike prediction models). Isolation Forest
        is the exception: its settings sit under the ``"isolation_forest"``
        key, so the flag is looked up there instead.
        """
        if self.algorithm == 'PY_ISOLATION_FOREST':
            settings = self.modeling_params["isolation_forest"]
        else:
            settings = self.modeling_params
        return settings.get('allow_sparse_matrices', False)


def prepare_multiframe(train_X, modeling_params):
    """Prepare ``train_X`` using the clustering sparse-support policy.

    Builds a ``ClusteringAlgorithmsSparseSupport`` from ``modeling_params`` and
    returns whatever ``prepare_multiframe_with_sparse_support`` produces for
    ``train_X`` with that policy.
    """
    return prepare_multiframe_with_sparse_support(
        train_X,
        ClusteringAlgorithmsSparseSupport(modeling_params),
    )
