# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from dku_utils.projects.project_commons import get_current_project_and_variables
from reconciliation_utils import get_keys


# Recipe inputs: open both input datasets, then load each as a dataframe.
# Dataset editlog renamed to editlog by lea.senequier@dataiku.com on 2024-11-25 10:35:31
editlog = dataiku.Dataset("editlog")
pending_to_be_matched_prepared_copy = dataiku.Dataset("pending_to_be_matched_prepared")
editlog_df = editlog.get_dataframe()
pending_to_be_matched_prepared_copy_df = pending_to_be_matched_prepared_copy.get_dataframe()

# Project variables: everything this recipe needs lives under the "standard" section.
project, variables = get_current_project_and_variables()
standard_settings = variables["standard"]
id_column_primary = standard_settings["id_column_primary"]
id_column_secondary = standard_settings["id_column_secondary"]
reconciliation_type = standard_settings["reconciliation_type"]
                                            
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Fetch the matching-key configuration (key names, their types, thresholds, weights).
primary_keys, secondary_keys, key_types, thresholds, weights = get_keys()

# Every key declared as "date" is rewritten as a string in one fixed ISO 8601-like
# layout. Values that cannot be parsed are coerced to missing instead of raising.
iso_timestamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
for key_position, declared_type in enumerate(key_types):
    if declared_type != "date":
        continue

    # The secondary side is looked up with a "secondary_" prefix; the primary
    # key name is used as-is.
    # NOTE(review): the ID columns elsewhere use a "primary_" prefix — confirm
    # that date key columns on the primary side are really unprefixed.
    columns_to_normalize = (
        primary_keys[key_position],
        "secondary_" + secondary_keys[key_position],
    )
    for column_name in columns_to_normalize:
        # Columns absent from the dataset are silently skipped.
        if column_name not in pending_to_be_matched_prepared_copy_df.columns:
            continue
        parsed_dates = pd.to_datetime(
            pending_to_be_matched_prepared_copy_df[column_name], errors='coerce'
        )
        pending_to_be_matched_prepared_copy_df[column_name] = parsed_dates.dt.strftime(
            iso_timestamp_format
        )

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Apply the user edits recorded in the editlog on top of the pending matches.
#
# Resolve the ID column names up front: the auto-rejection step that runs after
# this section reads primary_col / secondary_col unconditionally, so they must
# exist even when the editlog is empty (previously this raised a NameError).
primary_col = "primary_" + id_column_primary if id_column_primary else 'primary_id'
secondary_col = "secondary_" + id_column_secondary if id_column_secondary else 'secondary_id'

if editlog_df.empty:
    # No edits recorded: the webapp dataset is the pending dataset as-is.
    webapp_dataset_df = pending_to_be_matched_prepared_copy_df
else:
    # Work on a copy so the input dataframe is left untouched.
    webapp_dataset_df = pending_to_be_matched_prepared_copy_df.copy()

    # Ensure the "Match" and "Comments" columns are of type object so that
    # string values can be assigned into them.
    webapp_dataset_df['Match'] = webapp_dataset_df['Match'].astype(object)
    webapp_dataset_df['Comments'] = webapp_dataset_df['Comments'].astype(object)

    # Collapse the editlog to one row per (primary_id, secondary_id) pair,
    # ordered by 'date' so that the most recent values win.
    latest_editlog_df = editlog_df.sort_values('date').groupby(['primary_id', 'secondary_id']).last().reset_index()
    print(latest_editlog_df)

    for _, row in latest_editlog_df.iterrows():
        primary_id = row['primary_id']
        secondary_id = row['secondary_id']

        # Rows of the webapp dataset that correspond to this edited pair.
        condition = (
            (webapp_dataset_df[primary_col] == primary_id) &
            (webapp_dataset_df[secondary_col] == secondary_id)
        )

        # Only overwrite "Match" when the editlog carries a value for it.
        # Rejection of competing candidates is handled separately, after this loop.
        if pd.notna(row['match_value']):
            webapp_dataset_df.loc[condition, 'Match'] = row['match_value']

        # Only overwrite "Comments" when the editlog carries a value for it.
        if pd.notna(row['comment_value']):
            webapp_dataset_df.loc[condition, 'Comments'] = row['comment_value']
            
# --------------------------------------------------------------------------------
# AUTO-REJECTION BASED ON RECONCILIATION TYPE
# --------------------------------------------------------------------------------

# Every row currently marked "Approved" drives the rejection of its competitors.
approved_rows = webapp_dataset_df[webapp_dataset_df['Match'] == 'Approved']

if not approved_rows.empty:
    # Rows that reuse a secondary ID which already has an approved match.
    shares_approved_secondary = webapp_dataset_df[secondary_col].isin(
        approved_rows[secondary_col].unique()
    )
    # Approved rows themselves are never rejected.
    is_not_approved = webapp_dataset_df['Match'] != 'Approved'

    if reconciliation_type == "one_to_many":
        # A secondary observation may be approved only once: competitors are
        # the rows sharing an approved secondary ID.
        conflicting_rows = shares_approved_secondary
    elif reconciliation_type == "one_to_one":
        # Both sides are exclusive: competitors are the rows sharing either an
        # approved primary ID or an approved secondary ID.
        shares_approved_primary = webapp_dataset_df[primary_col].isin(
            approved_rows[primary_col].unique()
        )
        conflicting_rows = shares_approved_primary | shares_approved_secondary
    else:
        # Any other reconciliation type: nothing is auto-rejected.
        conflicting_rows = None

    if conflicting_rows is not None:
        webapp_dataset_df.loc[conflicting_rows & is_not_approved, 'Match'] = 'Rejected'
            
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe output: persist the reconciled dataframe (data and schema) to "webapp_dataset".
dataiku.Dataset("webapp_dataset").write_with_schema(webapp_dataset_df)