# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
from urllib.request import urlopen
import json
import requests
import io
import os

# custom function that can be found within Libraries tab > G+L
from census_api_functions import get_project_variables, get_query_text, state_name_list, get_tracts_code_table

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Census query settings: the table group to download and the survey year.
# Both values are interpolated into the API request URLs built further down.
census_code = 'S0601'
year = 2022
# API key, looked up through the project-library helper.
census_api_key = get_project_variables('standard', 'api_key')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build the state lookup table (State_name, State_code) from the ACS 5-year
# profile endpoint.
df_state_url = f"https://api.census.gov/data/{year}/acs/acs5/profile?get=NAME&for=state:*&key={census_api_key}"
# get_query_text is a project helper; its result is parsed as JSON text here.
state_payload = json.loads(get_query_text(df_state_url))
# The first element of the payload holds the column names, the rest are rows.
state_header = state_payload[0]
state_rows = state_payload[1:]
df_state = pd.DataFrame(state_rows, columns=state_header)
# Append a row for the Virgin Islands (state code "78") by hand, then switch
# to the column names used by the rest of the recipe.
extra_state_row = pd.DataFrame([{"NAME": "Virgin Islands", "state": "78"}])
df_state = pd.concat([df_state, extra_state_row], ignore_index=True).rename(
    columns={'NAME': 'State_name', 'state': 'State_code'}
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Distinct state codes, in order of first appearance; this list drives the
# per-state API requests below.
state_nums_list = df_state['State_code'].drop_duplicates().tolist()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# API request to gather the dataset from U.S. Census Bureau
# Downloads the `census_code` group for every census tract, one request per
# state, and stacks the per-state results into all_tracts_df.
# A state whose request or parsing fails is reported and skipped (best effort).

# (connect, read) timeouts in seconds so that a stalled connection cannot hang
# the recipe forever. A timeout is a RequestException and is handled below.
tract_request_timeout = (10, 300)

# Per-state frames are collected here and concatenated once after the loop,
# instead of re-copying the growing result on every iteration.
tract_frames = []

for state in state_nums_list:
    print(f"Processing state: {state}")
    try:
        # Construct API URL
        state_all_tracts_query_url = (
            f"https://api.census.gov/data/{year}/acs/acs5/subject"
            f"?get=NAME,group({census_code})&for=tract:*&in=state:{state}&key={census_api_key}"
        )
        # Fetch the data using requests
        response = requests.get(state_all_tracts_query_url, timeout=tract_request_timeout)
        response.raise_for_status()  # Raise exception for HTTP errors

        # Parse the JSON response: first element is the header, the rest are rows
        state_all_tract_names_query_result_list = response.json()

        # Convert JSON to DataFrame
        state_all_tract_names_df = pd.DataFrame(
            state_all_tract_names_query_result_list[1:],
            columns=state_all_tract_names_query_result_list[0]
        )

        # Exclude NAME column (first column), dropped by position
        tract_frames.append(state_all_tract_names_df.iloc[:, 1:])

    # NOTE(review): the text of a requests HTTP error may include the request
    # URL, which carries the API key here - treat this output as sensitive.
    except requests.exceptions.RequestException as req_err:
        print(f"HTTP error for state {state}: {req_err}")
    except json.JSONDecodeError as json_err:
        print(f"JSON decode error for state {state}: {json_err}")
    except Exception as e:
        print(f"Unexpected error for state {state}: {e}")

# Single concatenation; fall back to an empty frame when every state failed
# (pd.concat raises on an empty list).
if tract_frames:
    all_tracts_df = pd.concat(tract_frames, ignore_index=True)
else:
    all_tracts_df = pd.DataFrame()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# data preprocessing: column mapping, feature generation
# Keep the geography identifiers and the S0601_C01_001E value (stored as
# 'Population') under the column names used by the rest of the recipe.
# Work on an explicit copy with non-inplace operations: mutating a column
# selection of all_tracts_df in place triggers pandas chained-assignment
# warnings and depends on view-vs-copy semantics.
tract_columns = ['GEO_ID', 'state', 'county', 'tract', 'S0601_C01_001E']
all_tracts = all_tracts_df[tract_columns].copy()
all_tracts = all_tracts.rename(
    columns={
        'S0601_C01_001E': 'Population',
        'state': 'State_code',
        'county': 'County_code',
        'tract': 'Tract',
    }
)
# FIPS is the part of GEO_ID that follows the 'US' separator; the prefix
# before it is not needed, so GEO_ID itself is dropped afterwards.
all_tracts['FIPS'] = all_tracts['GEO_ID'].str.split('US', expand=True)[1]
all_tracts = all_tracts.drop(columns=['GEO_ID'])


# API request to gather the dataset from U.S. Census Bureau
# Downloads the `census_code` group for every county, one request per state,
# and stacks the per-state results into all_tracts_df_county.
# A state whose request or parsing fails is reported and skipped (best effort).

# (connect, read) timeouts in seconds so that a stalled connection cannot hang
# the recipe forever. A timeout is a RequestException and is handled below.
county_request_timeout = (10, 300)

# Per-state frames are collected here and concatenated once after the loop,
# instead of re-copying the growing result on every iteration.
county_frames = []

for state in state_nums_list:
    print(f"Processing state: {state}")
    try:
        # Construct API URL
        state_all_counties_query_url = (
            f"https://api.census.gov/data/{year}/acs/acs5/subject"
            f"?get=NAME,group({census_code})&for=county:*&in=state:{state}&key={census_api_key}"
        )
        # Fetch the data using requests
        response = requests.get(state_all_counties_query_url, timeout=county_request_timeout)
        response.raise_for_status()  # Raise exception for HTTP errors

        # Parse the JSON response: first element is the header, the rest are rows
        state_all_counties_query_result_list = response.json()

        # Convert JSON to DataFrame
        state_all_counties_df = pd.DataFrame(
            state_all_counties_query_result_list[1:],
            columns=state_all_counties_query_result_list[0]
        )

        # Exclude the first NAME column, dropped by position; a 'NAME' column
        # is still selected from the result further down.
        county_frames.append(state_all_counties_df.iloc[:, 1:])

    # NOTE(review): the text of a requests HTTP error may include the request
    # URL, which carries the API key here - treat this output as sensitive.
    except requests.exceptions.RequestException as req_err:
        print(f"HTTP error for state {state}: {req_err}")
    except json.JSONDecodeError as json_err:
        print(f"JSON decode error for state {state}: {json_err}")
    except Exception as e:
        print(f"Unexpected error for state {state}: {e}")

# Single concatenation; fall back to an empty frame when every state failed
# (pd.concat raises on an empty list).
if county_frames:
    all_tracts_df_county = pd.concat(county_frames, ignore_index=True)
else:
    all_tracts_df_county = pd.DataFrame()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# data preprocessing: column mapping, feature generation
# Keep the county identifiers, the S0601_C01_001E value (stored as
# 'Population_county') and the county name, under the column names used by the
# merge below. An explicit copy plus a non-inplace rename avoids mutating a
# column selection of all_tracts_df_county in place, which triggers pandas
# chained-assignment warnings.
county_columns = ['GEO_ID', 'state', 'county', 'S0601_C01_001E', 'NAME']
all_tracts_county = all_tracts_df_county[county_columns].copy()
all_tracts_county = all_tracts_county.rename(
    columns={
        'S0601_C01_001E': 'Population_county',
        'state': 'State_code',
        'county': 'County_code',
        'NAME': 'Name',
    }
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Attach each tract's county-level population and county name. A left join
# keeps every tract row even when no county record matches.
county_lookup = all_tracts_county[['Population_county', 'State_code', 'County_code', 'Name']]
final_data_county = all_tracts.merge(
    county_lookup,
    how="left",
    on=['State_code', 'County_code'],
)
# Combined state + county code, built by concatenating the two code columns.
state_code_part = final_data_county['State_code']
county_code_part = final_data_county['County_code']
final_data_county['State_County_code'] = state_code_part + county_code_part

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Split the county 'Name' (expected to look like "<county> County, <state>")
# into County_name and State_name, then drop the combined column.
# n=1 plus reindex guarantees exactly two parts (0 and 1), even when a name
# has no ', ' separator or contains more than one.
name_parts = (
    final_data_county['Name']
    .str.split(', ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
# Removing the word 'County' leaves the space that preceded it, so strip the
# surrounding whitespace as well ("X County" -> "X", not "X ").
county_name = name_parts[0].str.replace('County', '', regex=False).str.strip()
final_data = final_data_county.drop(columns='Name').assign(
    County_name=county_name,
    State_name=name_parts[1],
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# Persist the final tract lookup table (final_data) to the Dataiku dataset
# named "tracts_metadata" via write_with_schema.
tracts_lookup = dataiku.Dataset("tracts_metadata")
tracts_lookup.write_with_schema(final_data)