# This computes the BLEU metric for each row of the sample.
# It requires a translated text and the associated ground truth.
# The global BLEU score is taken to be the median of the row-by-row scores.

from sacrebleu.metrics import BLEU
import numpy as np

def evaluate(input_df, recipe_params, interpreted_columns, **kwargs):
    """Score each translated row with sentence-level BLEU; summarise by the median.

    Args:
        input_df: Input dataframe. Not read by this function.
        recipe_params: Recipe parameters. Not read by this function.
        interpreted_columns: Object exposing an ``output`` column (the text
            translated by the LLM) and a ``ground_truth`` column (the
            reference translation). Both are read through ``.to_list()``.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        A ``(global_score, bleu_line_scores)`` tuple. ``global_score`` is the
        median of the row-by-row BLEU scores; ``bleu_line_scores`` is the
        list of per-row scores, in row order. When there are no rows the
        result is ``(nan, [])``.
    """
    # For this metric we need the text translated by the LLM and the ground truth.
    candidates = interpreted_columns.output.to_list()
    references = interpreted_columns.ground_truth.to_list()

    # Nothing to score: the median of an empty list is NaN. Return it
    # explicitly so NumPy does not emit a RuntimeWarning, and skip building
    # the scorer altogether.
    if not candidates:
        return np.float64(np.nan), []

    # Build the scorer object. CHRF() or TER() from sacrebleu.metrics could be
    # used here instead to produce those scores.
    line_scorer = BLEU(effective_order=True)

    # Compute the BLEU score row by row. The scorer expects a list of
    # references per candidate; we only provide one.
    # NOTE(review): both lists are presumably taken from the same rows and
    # therefore have the same length - confirm with the caller.
    bleu_line_scores = [
        line_scorer.sentence_score(hypothesis, [reference]).score
        for hypothesis, reference in zip(candidates, references)
    ]

    # A single BLEU value for the entire run (median of all row-by-row scores),
    # plus the optional list of per-row scores.
    global_score = np.median(bleu_line_scores)
    return global_score, bleu_line_scores
