# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import warnings
import os
import datamol as dm
from dotenv import load_dotenv

# Import the predeveloped python functions from the project Libraries
# ==============================================================================
from molecular_property_prediction import molecular_featurizer, get_MACCS_keys

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Configure the runtime environment
# ==================================================================================================================
# Load variables through python-dotenv (presumably from a local .env file, when one exists).
load_dotenv()

# Silence warnings in this interpreter, then set the matching environment flags.
# NOTE(review): PYTHONWARNINGS is presumably meant for Python processes spawned later, and
# TOKENIZERS_PARALLELISM for the tokenizers backend used by the featurizer -- confirm.
warnings.simplefilter('ignore')
os.environ.update({
    'PYTHONWARNINGS': 'ignore',
    'TOKENIZERS_PARALLELISM': 'false',
})

# Turn off RDKit logging through datamol.
dm.disable_rdkit_log()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
# ==================================================================================================================
# Dataset clintox_dataset renamed to clintox_datasetch by georgia.kouyialis@dataiku.com on 2025-03-27 13:49:29
# NOTE(review): the rename note above mentions "clintox_datasetch" while the code still reads
# "clintox_dataset" -- confirm which name is current in the Flow.
clintox = dataiku.Dataset("clintox_dataset")
clintox_df = clintox.get_dataframe()

# Select the SMILES to featurize
# ==================================================================================================================
# Keep only the rows that have a canonical SMILES; the original row labels are preserved,
# so the resulting index can contain gaps where values were missing.
smiles_column = clintox_df['canonical_smiles']
new_smiles = smiles_column[smiles_column.notna()]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# ## Featurize the new canonical smiles with the same transformer_type used for the regression model
# ==================================================================================================================
# The transformer type comes from the project's custom variables; it is also used as the
# prefix of every generated feature column (e.g. "<transformer_type>_0").
transformer_type = dataiku.get_custom_variables()["transformer_type"]
prefix = transformer_type + '_'
molecular_features_vector = molecular_featurizer(new_smiles, transformer_type)
new_features_df = pd.DataFrame(molecular_features_vector)

# new_smiles had its missing values dropped, so its index may contain gaps, whereas a freshly
# built DataFrame is labelled 0..n-1. DataFrame.join aligns on the index, so the features must
# carry the labels of the rows they were computed from; otherwise every molecule after the
# first missing SMILES would receive the features of a different molecule.
if len(new_features_df) != len(new_smiles):
    raise ValueError(
        f"molecular_featurizer returned {len(new_features_df)} rows for "
        f"{len(new_smiles)} input SMILES; cannot align features with the source rows"
    )
new_features_df.index = new_smiles.index
new_features_df = new_features_df.add_prefix(prefix)
output_dataset = clintox_df.join(new_features_df)

# Remove any records with empty values
# ==================================================================================================================
# Rows without a first feature value (no SMILES, or no features produced) are discarded.
output_dataset = output_dataset[output_dataset[prefix + '0'].notnull()]
# Drop the FDA_APPROVED column from the output; reassign instead of mutating the filtered
# frame in place.
output_dataset = output_dataset.drop('FDA_APPROVED', axis=1)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# ==================================================================================================================
# Write the featurized frame to the "clintox_featurization" dataset via write_with_schema.
clintox_featurization = dataiku.Dataset("clintox_featurization")
clintox_featurization.write_with_schema(output_dataset)