# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from commons.dku_utils.core import get_current_project_and_variables
from commons.dku_utils.datasets.dataset_commons import get_dataset_schema

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe inputs: create both dataset handles, then load each one as a DataFrame.
applications_initial_filter = dataiku.Dataset("applications_initial_filter")
applications_statistical_filter = dataiku.Dataset("applications_statistical_filter")

applications_initial_filter_df = applications_initial_filter.get_dataframe()
applications_statistical_filter_df = applications_statistical_filter.get_dataframe()

# Project handle and its variables; the 'standard' section is read and
# updated further down before being saved back through the project.
project, variables = get_current_project_and_variables()

# Schema of the main input, reused below to build the output schema.
input_schema = get_dataset_schema(project, 'applications_initial_filter')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Lookup table: feature name -> information value, taken from the
# statistical filter dataset. If a feature appears more than once, the
# last occurrence wins.
ivs = dict(zip(applications_statistical_filter_df['feature'],
               applications_statistical_filter_df['information_value']))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Columns to analyse: everything except 'credit_event' and 'id'
# (presumably the target and the row identifier -- confirm).
applications_features = applications_initial_filter_df.drop(columns=['credit_event', 'id'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Correlation settings come from the 'standard' section of the project variables.
correlation_settings = variables['standard']
method = correlation_settings['correlation_method']
threshold = correlation_settings['correlation_threshold']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Pairwise correlation between the remaining columns, using the configured method.
correlation_matrix = applications_features.corr(method=method)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Collect every pair of distinct columns whose absolute correlation exceeds
# the threshold. Only the upper triangle (j > i) is scanned, so each
# unordered pair is reported once and the diagonal is skipped.
columns = correlation_matrix.columns
correlation_values = correlation_matrix.values
column_count = len(columns)

# A NaN correlation never satisfies the comparison, so such pairs are ignored.
high_pairs = [
    (columns[i], columns[j])
    for i in range(column_count - 1)
    for j in range(i + 1, column_count)
    if abs(correlation_values[i, j]) > threshold
]

# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call. Naming the columns explicitly keeps
# 'x' and 'y' present even when no pair qualifies.
high_correlation_pairs = pd.DataFrame(high_pairs, columns=['x', 'y'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# For each highly correlated pair, discard the member with the lower
# information value. 'y' is discarded only when 'x' is strictly better;
# otherwise (including ties) 'x' is discarded.
variables_to_remove = [
    pair['y'] if ivs[pair['x']] > ivs[pair['y']] else pair['x']
    for _, pair in high_correlation_pairs.iterrows()
]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# A feature can lose in several pairs; np.unique removes the duplicates
# (and returns the names sorted).
variables_to_remove = list(np.unique(variables_to_remove))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Remove the discarded features from the data that will be written out.
applications_initial_filter_df = applications_initial_filter_df.drop(columns=variables_to_remove)

# Record how many features this step removed, then refresh the overall
# total from the four per-step counters stored in the project variables.
standard_variables = variables['standard']
standard_variables['correlation_filtered'] = len(variables_to_remove)
standard_variables['total_filtered'] = (
    standard_variables['correlation_filtered']
    + standard_variables['chi2_information_value_filtered']
    + standard_variables['information_value_only_filtered']
    + standard_variables['chi2_only_filtered']
)

# Save the updated variables back through the project handle.
project.set_variables(variables)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Output schema = input schema minus the removed columns, so the remaining
# column definitions are carried over from the input dataset.
output_schema = [
    column_schema
    for column_schema in input_schema
    if column_schema['name'] not in variables_to_remove
]

# Recipe output: write the schema explicitly, then the filtered data.
# infer_schema=False is passed so the DataFrame write does not infer a
# schema of its own (presumably preserving the one written just above).
applications_correlation_filtered = dataiku.Dataset("applications_correlation_filtered")
applications_correlation_filtered.write_schema(output_schema)
applications_correlation_filtered.write_dataframe(
    applications_initial_filter_df,
    infer_schema=False,
    dropAndCreate=True,
)