# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import re

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe datasets: "eligibilities_distinct" is read, "eligibility_text_prepared" is written.
eligibilities = dataiku.Dataset("eligibilities_distinct")
eligibility_text_prepared = dataiku.Dataset("eligibility_text_prepared")

# Output schema, declared as (column name, storage type) pairs and expanded
# into the list of {'name': ..., 'type': ...} dicts passed to write_schema().
schema = [
    {'name': column_name, 'type': column_type}
    for column_name, column_type in (
        ('NCTId', 'string'),
        ('EligibilityCriteria', 'string'),
        ('HealthyVolunteers', 'string'),
        ('Sex', 'string'),
        ('MinimumAge', 'string'),
        ('MaximumAge', 'string'),
        ('CHILD', 'double'),
        ('ADULT', 'double'),
        ('OLDER_ADULT', 'double'),
        ('StudyPopulation', 'string'),
        ('SamplingMethod', 'string'),
        ('inclusion_criteria', 'string'),
        ('exclusion_criteria', 'string'),
    )
]
eligibility_text_prepared.write_schema(schema)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE

# Compiled once instead of on every call. Matches an "exclusion" heading that
# follows at least one newline, optionally prefixed by "key"/"the"/"main" and
# optionally followed by "criteria" (case-insensitive).
_EXCLUSION_HEADING = re.compile(
    r'(?i)\n+\W*\b(?:key\s|the\s|main\s)?exclusion(?:\scriteria)?\b'
)


def split_criteria(text):
    """Split eligibility text into its inclusion and exclusion parts.

    The text is cut at the start of the LAST "exclusion" heading found (see
    ``_EXCLUSION_HEADING``); everything before it is the inclusion part and
    everything from the heading onwards is the exclusion part. Both parts are
    whitespace-stripped.

    Note that the heading must be preceded by a newline, so a text whose very
    first line is the exclusion heading is returned entirely as inclusion
    criteria.

    Args:
        text: The raw criteria text. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        A ``(inclusion, exclusion)`` tuple of strings. ``exclusion`` is ``""``
        when no heading is found; both are ``""`` for empty or missing input.
    """
    # Missing values must be caught before str(): otherwise they would be
    # emitted as the literal text "None" / "nan" in the inclusion column.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return "", ""

    text = str(text)
    if not text:
        return "", ""

    # Walk every heading match and keep the position of the last one.
    split_index = None
    for match in _EXCLUSION_HEADING.finditer(text):
        split_index = match.start()

    if split_index is None:
        # No heading: the whole text counts as inclusion criteria.
        return text.strip(), ""
    return text[:split_index].strip(), text[split_index:].strip()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Stream the input in chunks of 3000 rows, add the two split columns to each
# chunk and append it to the output dataset.
with eligibility_text_prepared.get_writer() as writer:
    for chunk in eligibilities.iter_dataframes(chunksize=3000):
        # One (inclusion, exclusion) tuple per row.
        split_pairs = chunk['EligibilityCriteria'].apply(split_criteria)
        # Build each column from the pairs rather than unpacking zip(*pairs):
        # the unpacking form raises ValueError on a chunk with zero rows.
        chunk['inclusion_criteria'] = [inclusion for inclusion, _ in split_pairs]
        chunk['exclusion_criteria'] = [exclusion for _, exclusion in split_pairs]
        writer.write_dataframe(chunk)
