import evaluate


class Evaluator:
    def __init__(self) -> None:
        # Text-based evaluator built on Hugging Face `evaluate` metrics.
        self._rouge = evaluate.load("rouge")
        # Note: rougeLsum may score poorly here because it expects the
        # sentences of each text to be separated by newlines ("\n").
        self._rouge_types = ["rougeLsum", "rouge1", "rouge2"]
        self._bleu = evaluate.load("bleu")
        self._meteor = evaluate.load("meteor")

    def rdf2txt_rouge_evaluation(self, preds: list[str], refs: list[str]) -> dict[str, float]:
        results = self._rouge.compute(
            predictions=preds,
            references=refs,
            rouge_types=self._rouge_types,
            use_stemmer=True,
            use_aggregator=True,  # aggregate F1 over the corpus
        )
        return {k: float(results[k]) for k in self._rouge_types}

    def rdf2txt_bleu_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # BLEU via `evaluate`; references must be a list of lists because
        # each prediction can be scored against several references, hence [[ref]].
        results = self._bleu.compute(predictions=preds, references=[[r] for r in refs])
        return float(results["bleu"])  # native BLEU scale, in [0, 1]

    def rdf2txt_meteor_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # Same list-of-lists reference format as BLEU.
        res = self._meteor.compute(predictions=preds, references=[[r] for r in refs])
        return float(res["meteor"])

    def txt2rdf_precision_evaluation(self, preds: list[str], refs: list[str]):
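# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): shows how
# the text-based metrics above are called. The prediction/reference strings
# below are made-up placeholders, and the snippet is left commented out so it
# does not interfere with the rest of the module.
#
# if __name__ == "__main__":
#     evaluator = Evaluator()
#     preds = ["The Eiffel Tower is located in Paris."]
#     refs = ["The Eiffel Tower stands in Paris, France."]
#     print(evaluator.rdf2txt_rouge_evaluation(preds, refs))   # {'rougeLsum': ..., 'rouge1': ..., 'rouge2': ...}
#     print(evaluator.rdf2txt_bleu_evaluation(preds, refs))    # BLEU in [0, 1]
#     print(evaluator.rdf2txt_meteor_evaluation(preds, refs))  # METEOR in [0, 1]
# ---------------------------------------------------------------------------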