import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


class Evaluator:
    def __init__(self) -> None:
        # text-based evaluators (RDF-to-text generation)
        self.__rouge = evaluate.load("rouge")
        # rougeLsum will underperform here because it expects each sentence to be separated by "\n"
        self.__rouge_types = ["rougeLsum", "rouge1", "rouge2"]
        self._bleu = evaluate.load("bleu")
        self._meteor = evaluate.load("meteor")
        # token-based evaluators (loaded from evaluate; the token-level helpers below currently use sklearn)
        self.__acc_m = evaluate.load("accuracy")
        self.__prec_m = evaluate.load("precision")
        self.__rec_m = evaluate.load("recall")
        self.__f1_m = evaluate.load("f1")

    def rdf2txt_rouge_evaluation(self, preds: list[str], refs: list[str]) -> dict[str, float]:
        results = self.__rouge.compute(
            predictions=preds,
            references=refs,
            rouge_types=self.__rouge_types,
            use_stemmer=True,
            use_aggregator=True,  # aggregated scores are F-measures
        )
        return {k: float(results[k]) for k in self.__rouge_types}

    def rdf2txt_bleu_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # BLEU via evaluate; references must be a list of reference lists,
        # since each prediction can be scored against several references, hence [[ref]]
        results = self._bleu.compute(predictions=preds, references=[[r] for r in refs])
        return float(results["bleu"])  # corpus BLEU on the 0-1 scale

    def rdf2txt_meteor_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # same reference format as BLEU (list of reference lists)
        res = self._meteor.compute(predictions=preds, references=[[r] for r in refs])
        return float(res["meteor"])

    def __my_accuracy(self, preds: list[list[int]], refs: list[list[int]]) -> float:
        # exact-match accuracy over whole token sequences, not single tokens
        total = len(preds)
        correct = sum(int(p == r) for p, r in zip(preds, refs))
        return correct / total if total else 0.0

    def __accuracy(self, preds, refs) -> float:
        # sklearn expects (y_true, y_pred); accuracy is symmetric, but keep the convention
        return accuracy_score(refs, preds)

    def __clean_batch_by_pad(self, preds: list[list[int]], refs: list[list[int]]):
        # truncate each (pred, ref) pair at the first pad token of the reference
        output_preds = []
        output_refs = []
        # TODO: pad token id is hard-coded; it should be passed in from the tokenizer/config
        pad_token: int = 7000
        for pred, ref in zip(preds, refs):
            try:
                i = ref.index(pad_token)  # first position where the pad token appears
            except ValueError:
                i = len(ref)  # no padding in this reference
            output_preds.append(pred[:i])
            output_refs.append(ref[:i])
        return output_preds, output_refs

    def __precision_recall(self, preds: list[list[int]], refs: list[list[int]]) -> dict[str, float]:
        # TODO: expects flattened 1-D label sequences; average="binary" only makes sense
        # for binary labels, so the averaging strategy still needs to be revisited
        p, r, f1, _ = precision_recall_fscore_support(
            refs, preds, average="binary", zero_division=0  # sklearn order: (y_true, y_pred)
        )
        return {"precision": float(p), "recall": float(r), "f1": float(f1)}
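

# Minimal usage sketch with hypothetical example sentences, assuming the `evaluate`
# metrics loaded above are available locally. The text-level methods take parallel
# lists of generated and reference sentences; the name-mangled token-level helpers
# are internal and not exercised here.
if __name__ == "__main__":
    evaluator = Evaluator()
    preds = ["The Aarhus airport serves the city of Aarhus."]
    refs = ["Aarhus airport serves the city of Aarhus, Denmark."]
    print("ROUGE:", evaluator.rdf2txt_rouge_evaluation(preds, refs))
    print("BLEU:", evaluator.rdf2txt_bleu_evaluation(preds, refs))
    print("METEOR:", evaluator.rdf2txt_meteor_evaluation(preds, refs))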