# This computes the ROUGE-L precision for each row of the sample, in order to evaluate the quality of a summarization.
# It requires the summarized text and the associated (summarized) ground truth.
# The global ROUGE score is taken to be the median of the row-by-row scores.

from rouge_score import rouge_scorer
import numpy as np

def evaluate(input_df, recipe_params, interpreted_columns, **kwargs):
    """Compute the ROUGE-L precision of each row and the median over all rows.

    Args:
        input_df: Not used by this metric.
        recipe_params: Not used by this metric.
        interpreted_columns: Object exposing ``output`` (the text summarized
            by the LLM) and ``ground_truth`` (the reference summary). Both
            attributes must provide ``to_list()``.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        A ``(global_score, row_scores)`` tuple. ``row_scores`` is the list of
        ROUGE-L precisions, one per row, in row order; ``global_score`` is
        their median. When there are no rows, ``global_score`` is NaN and
        ``row_scores`` is empty.
    """
    # For this metric we need the text summarized by the LLM and the ground truth.
    candidate = interpreted_columns.output.to_list()
    reference = interpreted_columns.ground_truth.to_list()

    # Nothing to score: return NaN explicitly instead of letting
    # np.median([]) emit RuntimeWarnings on its way to the same result.
    if not candidate:
        return np.float64("nan"), []

    # Build the scorer object. We only request one ROUGE metric, the "L" variant.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    # Row-by-row precision; the scorer takes the reference first, then the
    # candidate.
    # NOTE(review): assumes both columns have the same length (zip stops at
    # the shorter one) and that every value is a string - confirm that
    # missing values are handled upstream.
    row_precisions = [
        scorer.score(truth, summary)['rougeL'].precision
        for truth, summary in zip(reference, candidate)
    ]

    # Single ROUGE-L precision for the entire run (median of the row scores).
    global_score = np.median(row_precisions)

    # The per-row list is optional for the caller, but returned alongside.
    return global_score, row_precisions
