# This sample checks that the actual and reference tool calls match exactly
def evaluate(input_df, recipe_params, interpreted_columns, **kwargs):
    """Score each row on whether actual and reference tool calls match exactly.

    Only ``interpreted_columns`` is read; ``input_df``, ``recipe_params`` and
    ``kwargs`` are accepted but not used by this metric.

    Args:
        input_df: Unused here.
        recipe_params: Unused here.
        interpreted_columns: Object exposing two columns, each with a
            ``to_list()`` method:

            * ``reference_tool_calls`` -- one entry per row, expected to be
              the list of tool names provided by the user.
            * ``actual_tool_calls`` -- one entry per row, an iterable of
              records describing the calls the agent made; only the
              ``'toolName'`` item of each record is used.
        **kwargs: Unused here.

    Returns:
        A tuple ``(overall, row_scores)``. ``row_scores`` holds 1 for each
        row whose actual tool names compare equal to the reference entry and
        0 otherwise. ``overall`` is the average of ``row_scores``, or 0 when
        there are no rows.
    """
    expected_per_row = interpreted_columns.reference_tool_calls.to_list()

    # The actual tool-call records carry more than the name; keep the name only.
    called_per_row = [
        [call['toolName'] for call in calls]
        for calls in interpreted_columns.actual_tool_calls.to_list()
    ]

    # Exact match: the two sequences must compare equal (same names, same order).
    row_scores = [
        1 if expected == called else 0
        for expected, called in zip(expected_per_row, called_per_row)
    ]

    # With no rows there is nothing to average; report 0 instead of dividing by zero.
    if not row_scores:
        return 0, row_scores

    return sum(row_scores) / len(row_scores), row_scores