# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
from dku_utils import read_pickle_from_dss_folder, get_managed_folder_id_with_folder_name

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
from real_estate_pricing.geographic_handling.formating.points import read_geo_point
from real_estate_pricing.geographic_handling.feature_engineering.information_extraction import get_geodesic_distance
from real_estate_pricing.flow.constants import (N_NEIGHBORS_STATIONS_TO_SEARCH,
                                                N_NEIGHBORS_STATIONS_TO_RETRIEVE,
                                                SORT_NEIGHBORS_BY_GEODESIC_DISTANCES)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe input: the prepared real-estate sales, loaded as a pandas dataframe.
real_estate_sales = dataiku.Dataset("real_estate_sales_prepared")
real_estate_sales_df = real_estate_sales.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Indexed stations metadata, without its raw latitude / longitude columns.
stations_metadata = dataiku.Dataset("stations_metadata_indexed")
stations_data_df = stations_metadata.get_dataframe().drop(
    columns=["station_latitude", "station_longitude"]
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Keep only the identifier, address and location columns. The explicit copy makes
# this an independent frame, so the columns added to it further down do not raise
# pandas' SettingWithCopyWarning.
real_estate_sales_df = real_estate_sales_df[["transaction_id", "address", "property_geo_point"]].copy()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Flag passed as the second argument of every `read_geo_point` call in this recipe.
reverse_coordinates = True

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Resolve the managed folder holding the stations indexer from the current project key.
custom_variables = dataiku.get_custom_variables()
project_key = custom_variables["projectKey"]
subway_stations_folder_id = get_managed_folder_id_with_folder_name(
    project_key, "subway_stations_indexing"
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# NOTE(review): presumably this unpickles the file; pickle is only safe when the
# folder content is trusted -- confirm who can write to this managed folder.
indexer_file_name = "subway_stations_indexer.p"
subway_stations_indexer = read_pickle_from_dss_folder(indexer_file_name, subway_stations_folder_id)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Work on distinct (address, geo point) pairs so each location is parsed and searched once.
property_location_columns = ["address", "property_geo_point"]
unique_properties_df = real_estate_sales_df[property_location_columns].drop_duplicates()
unique_addresses = list(unique_properties_df["address"])
unique_geo_points = list(unique_properties_df["property_geo_point"])
properties_geo_points = [
    read_geo_point(raw_geo_point, reverse_coordinates) for raw_geo_point in unique_geo_points
]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Ask the stations indexer for the neighbors of every distinct property location;
# the result is consumed positionally, one entry per searched geo point.
properties_neighboring_stations = subway_stations_indexer.search_geo_points_neighbors(
    properties_geo_points,
    N_NEIGHBORS_STATIONS_TO_SEARCH,
    N_NEIGHBORS_STATIONS_TO_RETRIEVE,
    SORT_NEIGHBORS_BY_GEODESIC_DISTANCES,
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Map every distinct (address, geo point) pair to its neighboring stations.
# The pairs were de-duplicated on both columns, so keying on the address alone
# would let an address that appears with several geo points silently keep only
# the neighbors of its last pair; the full pair is therefore used as the key.
geo_points_neighbor_stations = dict(zip(zip(unique_addresses, unique_geo_points),
                                        properties_neighboring_stations))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build the same (address, geo point) key for every sale, aligned on the frame's
# index, then look up the neighbors found for that exact location.
property_location_keys = pd.Series(list(zip(real_estate_sales_df["address"],
                                            real_estate_sales_df["property_geo_point"])),
                                   index=real_estate_sales_df.index)
real_estate_sales_df["close_stations"] = property_location_keys\
.apply(lambda location_key: geo_points_neighbor_stations[location_key])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# All stations metadata columns; each one is prefixed with the neighbor rank before joining.
stations_data_columns = list(stations_data_df.columns)
stations_data_columns

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# For every neighbor rank, attach the metadata of the station found at that rank.
for station_neighbor_rank in range(N_NEIGHBORS_STATIONS_TO_RETRIEVE):
    left_key = "close_station_{}".format(station_neighbor_rank)
    right_key = "close_{}_station_index".format(station_neighbor_rank)

    # Value found at this rank in each sale's neighbor list; it is the join key
    # matched against the (prefixed) `station_index` column of the stations metadata.
    real_estate_sales_df[left_key] = real_estate_sales_df["close_stations"].apply(
        lambda close_stations: close_stations[station_neighbor_rank])

    # Renamed copy of the stations metadata, e.g. `station_name` -> `close_<rank>_station_name`.
    stations_data_for_join_renaming = dict(
        (column, "close_{}_{}".format(station_neighbor_rank, column))
        for column in stations_data_columns)
    stations_data_for_join = stations_data_df.rename(columns=stations_data_for_join_renaming)

    real_estate_sales_df = real_estate_sales_df.merge(stations_data_for_join,
                                                      how="left",
                                                      left_on=left_key,
                                                      right_on=right_key)
    # Both join keys are only needed for the merge itself.
    real_estate_sales_df = real_estate_sales_df.drop(columns=[left_key, right_key])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# The raw neighbor lists are no longer needed once every rank has been joined.
real_estate_sales_df = real_estate_sales_df.drop(columns="close_stations")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def concatenate_dataframe_columns(dataframe, columns_to_concatenate, columns_separator, resulting_column_name):
    """Concatenate several columns of a dataframe into a single string column.

    The dataframe is modified in place: every column listed in
    `columns_to_concatenate` is cast to `str`, and `resulting_column_name`
    is created (or overwritten) with the row-wise concatenation.

    Args:
        dataframe: pandas dataframe holding the columns to concatenate.
        columns_to_concatenate: column names, in concatenation order.
        columns_separator: string inserted between two consecutive values.
        resulting_column_name: name of the column receiving the result.

    Returns:
        The same dataframe object that was passed in.
    """
    last_position = len(columns_to_concatenate) - 1
    # Start from an empty string so an empty column list yields "" on every row.
    dataframe[resulting_column_name] = ""
    for position, column_name in enumerate(columns_to_concatenate):
        # The source column itself is replaced by its string version.
        dataframe[column_name] = dataframe[column_name].astype(str)
        combined = dataframe[resulting_column_name] + dataframe[column_name]
        if position < last_position:
            # No trailing separator after the last column.
            combined = combined + columns_separator
        dataframe[resulting_column_name] = combined
    return dataframe

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Ranks of the neighbor stations joined above; reused to build every per-rank column list.
neighbor_ranks = range(N_NEIGHBORS_STATIONS_TO_RETRIEVE)
station_name_columns = ["close_{}_station_name_normalized".format(rank) for rank in neighbor_ranks]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Comma-separated names of the close stations, plus the rank-0 name on its own.
real_estate_sales_df = concatenate_dataframe_columns(
    real_estate_sales_df, station_name_columns, ",", "close_stations")
# Copied after the concatenation, which has already cast the name columns to strings.
real_estate_sales_df["closest_station_name"] = real_estate_sales_df["close_0_station_name_normalized"]
real_estate_sales_df = real_estate_sales_df.drop(columns=station_name_columns)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
station_line_columns = ["close_{}_line_transfers".format(rank) for rank in neighbor_ranks]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Comma-separated `line_transfers` values of the close stations.
real_estate_sales_df = concatenate_dataframe_columns(
    real_estate_sales_df, station_line_columns, ",", "close_lines")
real_estate_sales_df = real_estate_sales_df.drop(columns=station_line_columns)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
line_terminus_columns = ["close_{}_line_terminus".format(rank) for rank in neighbor_ranks]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Comma-separated `line_terminus` values of the close stations.
real_estate_sales_df = concatenate_dataframe_columns(
    real_estate_sales_df, line_terminus_columns, ",", "close_line_terminus")
real_estate_sales_df = real_estate_sales_df.drop(columns=line_terminus_columns)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
train_station_columns = ["close_{}_is_train_station".format(rank) for rank in neighbor_ranks]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Comma-separated `is_train_station` values of the close stations.
real_estate_sales_df = concatenate_dataframe_columns(
    real_estate_sales_df, train_station_columns, ",", "close_train_station")
real_estate_sales_df = real_estate_sales_df.drop(columns=train_station_columns)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Numerical stations metadata columns summarized over the close stations.
numerical_columns = ["station_degree", "station_eigenvector_centrality",
                     "station_closeness_centrality", "station_pagerank"]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# For each numerical column: store its average over all close stations, keep the
# rank-0 value under a `closest_station_` name and drop the other per-rank columns.
for numerical_column in numerical_columns:
    print("Handling numerical column {} ...".format(numerical_column))
    closest_station_numerical_column = "close_0_{}".format(numerical_column)
    closest_station_numerical_column_renaming = "closest_station_{}".format(numerical_column)
    close_station_numerical_columns = ["close_{}_{}".format(station_neighbor_rank, numerical_column)
                                       for station_neighbor_rank in range(N_NEIGHBORS_STATIONS_TO_RETRIEVE)]
    numerical_columns_values = real_estate_sales_df[close_station_numerical_columns].values

    # Row-wise mean in a single vectorized call. Only the average is written to the
    # output, so the per-row min / max / std that used to be computed and then
    # discarded are gone. `np.mean` (unlike `DataFrame.mean`) propagates NaN, as
    # the previous per-row `np.mean` did.
    real_estate_sales_df["close_stations_average_{}".format(numerical_column)] = \
        np.mean(numerical_columns_values, axis=1)

    numerical_columns_to_remove = [column for column in close_station_numerical_columns
                                   if column != closest_station_numerical_column]
    real_estate_sales_df.drop(numerical_columns_to_remove, axis=1, inplace=True)
    real_estate_sales_df.rename({closest_station_numerical_column: closest_station_numerical_column_renaming},
                                axis=1,
                                inplace=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Geo point column of the station joined at each neighbor rank.
geo_points_columns = ["close_{}_station_geo_point".format(rank)
                      for rank in range(N_NEIGHBORS_STATIONS_TO_RETRIEVE)]
geo_points_columns

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# rank -> parsed station geo points, one per sale row.
neighbor_stations_geo_points = {
    station_neighbor_rank: [read_geo_point(raw_geo_point, reverse_coordinates)
                            for raw_geo_point in real_estate_sales_df[geo_point_column]]
    for station_neighbor_rank, geo_point_column in enumerate(geo_points_columns)
}

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Parsed property geo points, this time one per sale row (not per distinct location).
properties_geo_points = [read_geo_point(raw_geo_point, reverse_coordinates)
                         for raw_geo_point in real_estate_sales_df["property_geo_point"]]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# One distance column per neighbor rank, computed row by row between the property
# and the station at that rank.
for station_neighbor_rank in range(N_NEIGHBORS_STATIONS_TO_RETRIEVE):
    distance_property_station_column = "distance_from_station_{}".format(station_neighbor_rank)
    real_estate_sales_df[distance_property_station_column] = [
        get_geodesic_distance(property_geo_point, neighbor_station_geo_point, False)
        for property_geo_point, neighbor_station_geo_point
        in zip(properties_geo_points, neighbor_stations_geo_points[station_neighbor_rank])
    ]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
distances_columns = ["distance_from_station_{}".format(station_neighbor_rank)
                     for station_neighbor_rank in range(N_NEIGHBORS_STATIONS_TO_RETRIEVE)]
# Row-wise mean of the per-rank distances, computed in one vectorized NumPy call
# instead of a Python loop over rows. NaN still propagates to the average.
average_distance_from_closest_stations = np.mean(real_estate_sales_df[distances_columns].values, axis=1)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
real_estate_sales_df["average_distance_from_closest_stations"] = average_distance_from_closest_stations

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# NOTE(review): "suwbway" looks like a misspelling of "subway", but this string is
# the output dataset's name -- presumably it must match the dataset declared in the
# Flow, so it cannot be corrected in this file alone.
properties_closest_suwbway_stations = dataiku.Dataset("properties_closest_suwbway_stations")
# Persist the enriched sales dataframe to the output dataset.
properties_closest_suwbway_stations.write_with_schema(real_estate_sales_df)