import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from chembl_webresource_client.settings import Settings
Settings.Instance().CACHING = False
from chembl_webresource_client.new_client import new_client
import pubchempy as pcp
import requests
import json

def get_query_text(query_url, timeout=60):
    """
    Send an HTTP GET request and return the response body as text.

    Input:   query_url (str): complete URL to request
             timeout (float or tuple, optional): timeout in seconds forwarded to
                 requests.get; defaults to 60
    Returns: The response body decoded as text (str)
    Raises:  requests.exceptions.RequestException if the request cannot be
             completed (connection error, timeout, ...)
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(query_url, timeout=timeout)

    # The HTTP status is deliberately NOT checked here: callers such as
    # pubchem_database_error_test parse the body themselves to detect error
    # payloads, so the body is returned whatever the status code is.
    return response.text

def chembl_database_error_test(target_search):
    """
    Look up a target protein in the ChEMBL database through its API and, when
    the search yields a result, retrieve the associated bioactivity data.
    Arg:     target_search (str): term handed to the ChEMBL target search,
             e.g. the protein accession code "P11551"
    Returns: Tuple (molecules_df, metadata_df) as built by chembl_database_retreive
    Raises:  ValueError when the target search returns nothing
    """
    # Run the target search against ChEMBL
    # ========================================================================================================================
    search_hits = new_client.target.search(target_search)

    # Guard clause: stop early when ChEMBL knows no matching target
    # ========================================================================================================================
    if not search_hits:
        raise ValueError("'No protein found in Chembl for given ProteinAccession(s)''")

    # Hand the hits over to the retrieval step
    # ========================================================================================================================
    molecules_df, metadata_df = chembl_database_retreive(search_hits)
    return molecules_df, metadata_df

def pubchem_database_error_test(target_search):
    """
    Search for a target protein in the PubChem database through its REST API and,
    when bioactivity data is returned, build the molecule and metadata tables.
    Arg:     target_search (str): target protein accession code e.g. "P11551"
    Returns: Tuple (molecules_df, metadata_df) as built by pubchem_database_retreive
    Raises:  ValueError when the response is empty or carries a 'Fault' entry
    """
    # Set the paths to the protein summary and the concise bioactivity table
    # ========================================================================================================================
    protein_path = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/protein/accession/{}/summary/JSON".format(target_search)
    bioactivity_path = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/protein/accession/{}/concise/JSON".format(target_search)
    bioactivity_data = json.loads(get_query_text(bioactivity_path))

    # Validate the payload
    # A direct membership test replaces the former "remember the last key" loop,
    # which left its variable unbound on an empty payload and ignored a 'Fault'
    # key unless it happened to be the last one.
    # ========================================================================================================================
    if not bioactivity_data or 'Fault' in bioactivity_data:
        raise ValueError("'No protein found in PubChem for given ProteinAccession(s)''")

    molecules_df, metadata_df = pubchem_database_retreive(bioactivity_data, protein_path)
    return molecules_df, metadata_df

def chembl_database_retreive(target_query):
    """
    Retrieve bioactivity data from ChEMBL database for the first target of a search result.
    Args:    target_query: non-empty target search result from function chembl_database_error_test
    Returns: Tuple (molecules_df, metadata_df)
             molecules_df: columns 'molecule_id', 'canonical_smiles', 'standard_value' (float)
             metadata_df:  one row per component of the first target that has an accession, with
                           columns 'accession', 'component_description', 'tax_id', 'organism',
                           'target_component_synonyms', 'studied_molecules', 'no_studied_molecules'
    """
    # Load the search hits; only the first hit is used from here on
    # ========================================================================================================================
    target_df = pd.DataFrame.from_dict(target_query)
    first_target = target_df.iloc[0]

    # Create the molecule dataset
    # Retrieve only bioactivity data for selected protein that are reported as IC50 in nanomolar unit
    # ========================================================================================================================
    molecule_columns = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
    molecules = (
        new_client.activity
        .filter(target_chembl_id=first_target['target_chembl_id'])
        .only('molecule_chembl_id', 'canonical_smiles', 'standard_value')
        .filter(standard_type="IC50", standard_units='nM')
    )
    # reindex keeps exactly the wanted columns and, unlike plain column selection,
    # still yields them (empty) when no activity was returned at all.
    molecules_df = pd.DataFrame.from_dict(molecules).reindex(columns=molecule_columns)

    # Convert standard_value to numeric
    # ========================================================================================================================
    molecules_df['standard_value'] = molecules_df['standard_value'].astype(float)
    molecules_df = molecules_df.rename(columns={'molecule_chembl_id': 'molecule_id'})

    # Extract metadata
    # Target-level values (tax_id, organism) are taken as scalars from the first hit so
    # every component row describes that same target; assigning the whole column
    # would align by index and pull values from unrelated search hits.
    # ========================================================================================================================
    metadata_columns = ['accession', 'component_description', 'tax_id', 'organism',
                        'target_component_synonyms', 'studied_molecules', 'no_studied_molecules']
    studied_molecules = molecules_df['molecule_id'].to_list()
    no_studied_molecules = int(molecules_df['molecule_id'].count())
    metadata_records = [
        {
            'accession': component['accession'],
            'component_description': component['component_description'],
            'tax_id': first_target['tax_id'],
            'organism': first_target['organism'],
            'target_component_synonyms': component['target_component_synonyms'],
            # Each row gets its own copy of the list, so multi-component
            # targets no longer fail with a length mismatch.
            'studied_molecules': list(studied_molecules),
            'no_studied_molecules': no_studied_molecules,
        }
        for component in first_target['target_components']
        if 'accession' in component
    ]
    metadata_df = pd.DataFrame(metadata_records, columns=metadata_columns)
    return molecules_df, metadata_df

def pubchem_database_retreive(bioactivity_data_txt, protein_path):
    """
    Retrieve bioactivity data from PubChem database for a given target search term.
    Args:   bioactivity_data_txt (dict): parsed JSON of the concise bioactivity table, holding
                the column names under ["Table"]["Columns"]["Column"] and the rows under ["Table"]["Row"]
            protein_path (str): URL of the protein summary, from function pubchem_database_error_test
    Return: Tuple (molecules_df, metadata_df)
            molecules_df: columns 'molecule_id' ("CID" + compound id), 'canonical_smiles', 'standard_value'
            metadata_df:  one row per protein summary entry with columns 'accession',
                          'component_description', 'tax_id', 'organism', 'target_component_synonyms',
                          'studied_molecules', 'no_studied_molecules'
    """
    # Extract column names and row data
    # ========================================================================================================================
    columns = bioactivity_data_txt["Table"]["Columns"]["Column"]
    rows = bioactivity_data_txt["Table"]["Row"]

    # Create DataFrame and keep the IC50 measurements only
    # .copy() gives an independent frame so the CID cast below does not write into a slice
    # ========================================================================================================================
    tested_compounds = pd.DataFrame([row["Cell"] for row in rows], columns=columns)
    tested_compounds_df = tested_compounds[tested_compounds['Activity Name'] == 'IC50'].copy()
    # Both merge keys must be strings; the SMILES table is cast the same way below
    tested_compounds_df['CID'] = tested_compounds_df['CID'].astype(str)

    # Extract CIDs as a list (each compound only once, it may occur in several rows)
    # ========================================================================================================================
    compounds = tested_compounds_df['CID'].drop_duplicates().to_list()

    # Retrieve SMILES notation efficiently
    # Skip the remote lookup when there is nothing to look up
    # ========================================================================================================================
    if compounds:
        properties = pcp.get_properties('canonical_smiles', compounds)
        smiles_df = pd.DataFrame(properties)
        smiles_df['CID'] = smiles_df['CID'].astype(str)
        smiles_df = smiles_df.drop_duplicates(subset='CID')
    else:
        smiles_df = pd.DataFrame(columns=['CID', 'CanonicalSMILES'])

    # Merge with the original DataFrame
    # ========================================================================================================================
    result_df = pd.merge(tested_compounds_df, smiles_df, on='CID', how='left').drop_duplicates()

    # Convert activity values to standard units
    # The column is labelled in uM; multiplying by 1000 expresses it in nM
    # ========================================================================================================================
    result_df['Activity Value [uM]'] = pd.to_numeric(result_df['Activity Value [uM]'], errors='coerce')
    result_df['standard_value'] = result_df['Activity Value [uM]'] * 1000
    result_df['CID'] = 'CID' + result_df['CID']

    # Select relevant columns and rename them
    # ========================================================================================================================
    molecules_df = (
        result_df[['CID', 'CanonicalSMILES', 'standard_value']]
        .rename({'CID': 'molecule_id', 'CanonicalSMILES': 'canonical_smiles'}, axis=1)
        .reset_index(drop=True)
    )

    # Get metadata for the target protein
    # ========================================================================================================================
    protein_summary = json.loads(get_query_text(protein_path))
    protein_summary_list = protein_summary['ProteinSummaries']['ProteinSummary']
    protein_dict_keys = ['ProteinAccession', 'Name', 'TaxonomyID', 'Taxonomy', 'Synonym']

    # Add information about studied molecules
    # Every summary row receives its own copy of the list, so the frame is valid
    # for any number of summary entries instead of exactly one.
    # ========================================================================================================================
    studied_molecules = molecules_df['molecule_id'].tolist()
    no_studied_molecules = len(molecules_df)
    metadata_records = []
    for entry in protein_summary_list:
        record = {key: entry.get(key, None) for key in protein_dict_keys}
        record['studied_molecules'] = list(studied_molecules)
        record['no_studied_molecules'] = no_studied_molecules
        metadata_records.append(record)

    # Create metadata DataFrame
    # Passing the column list keeps the schema stable even without summary entries
    # ========================================================================================================================
    metadata_df = pd.DataFrame(
        metadata_records,
        columns=protein_dict_keys + ['studied_molecules', 'no_studied_molecules'],
    ).rename({'ProteinAccession': 'accession', 'Name': 'component_description', 'TaxonomyID': 'tax_id', 'Taxonomy': 'organism', 'Synonym': 'target_component_synonyms'}, axis=1)

    return molecules_df, metadata_df

def fetch_amino_acid_sequence(uniprot_accession, timeout=60):
    """
    Fetches the amino acid sequence of a protein from UniProt using the accession ID.
    Arg:     uniprot_accession (str): UniProt accession ID of the protein.
             timeout (float or tuple, optional): timeout in seconds forwarded to
                 requests.get; defaults to 60.
    Returns: str: Amino acid sequence of the protein (empty string when the response
             holds no sequence lines), or an error message starting with
             "Error fetching sequence:" if the request fails.
    """
    # UniProt API URL for fetching FASTA format
    # ==================================================================================================
    uniprot_url = f"https://www.uniprot.org/uniprot/{uniprot_accession}.fasta"

    # Make a GET request to fetch the FASTA data
    # Only the network call sits inside the try block; the timeout keeps an
    # unresponsive server from blocking forever (a timeout is reported like any
    # other request failure).
    # ==================================================================================================
    try:
        response = requests.get(uniprot_url, timeout=timeout)
        response.raise_for_status()  # Raise an error for HTTP issues
    except requests.exceptions.RequestException as e:
        return f"Error fetching sequence: {e}"

    # Parse the FASTA format to extract the sequence
    # Slicing skips the header line and, unlike tuple unpacking, also copes
    # with an empty response body.
    # ==================================================================================================
    sequence_lines = response.text.splitlines()[1:]
    return "".join(sequence_lines)