import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


class Evaluator:

    def __init__(self) -> None:
        # Text-based metrics (compare generated text against reference text).
        self.__rouge = evaluate.load("rouge")
        # rougeLsum is expected to underperform here because it assumes the
        # sentences of each text are separated by "\n".
        self.__rouge_types = ["rougeLsum", "rouge1", "rouge2"]
        self._bleu = evaluate.load("bleu")
        self._meteor = evaluate.load("meteor")

        # Token-based metrics (compare predicted token ids against reference ids).
        self.__acc_m = evaluate.load("accuracy")
        self.__prec_m = evaluate.load("precision")
        self.__rec_m = evaluate.load("recall")
        self.__f1_m = evaluate.load("f1")

    def rdf2txt_rouge_evaluation(self, preds: list[str], refs: list[str]) -> dict[str, float]:
        results = self.__rouge.compute(
            predictions=preds,
            references=refs,
            rouge_types=self.__rouge_types,
            use_stemmer=True,
            use_aggregator=True,  # aggregate to a single F1 value per ROUGE type
        )
        return {k: float(results[k]) for k in self.__rouge_types}

    def rdf2txt_bleu_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # BLEU via `evaluate`, which expects references as a list of lists:
        # each prediction may be scored against several references, hence [[r]].
        results = self._bleu.compute(predictions=preds, references=[[r] for r in refs])
        return float(results["bleu"])  # corpus-level BLEU on the metric's 0-1 scale

    def rdf2txt_meteor_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # Same reference format as BLEU: one list of references per prediction.
        res = self._meteor.compute(predictions=preds, references=[[r] for r in refs])
        return float(res["meteor"])

    def __my_accuracy(self, preds: list[list[int]], refs: list[list[int]]) -> float:
        # Sequence-level accuracy: a prediction counts as correct only if the
        # whole token sequence matches its reference exactly, not token by token.
        total = len(preds)
        correct = 0
        for p, r in zip(preds, refs):
            correct += int(p == r)
        return correct / total

    def __accuracy(self, preds, refs):
        # sklearn's signature is accuracy_score(y_true, y_pred); accuracy is
        # symmetric, but keep the conventional argument order.
        return accuracy_score(refs, preds)

    def __clean_batch_by_pad(self, preds: list[list[int]], refs: list[list[int]]):
        # Truncate each (pred, ref) pair at the first pad token in the reference
        # so that padding does not inflate the token-level metrics.
        output_preds = []
        output_refs = []
        # TODO: the pad token id should be passed in (e.g. from the tokenizer)
        # instead of being hard-coded here.
        pad_token: int = 7000
        for pred, ref in zip(preds, refs):
            try:
                i = ref.index(pad_token)  # position of the first pad token
            except ValueError:
                i = len(ref)  # no padding in this reference
            output_preds.append(pred[:i])
            output_refs.append(ref[:i])

        return output_preds, output_refs

    def __precision_recall(self, preds: list[list[int]], refs: list[list[int]]):
        # TODO: precision_recall_fscore_support expects flat label sequences as
        # (y_true, y_pred), and average="binary" only makes sense for two classes.
        p, r, f1, _ = precision_recall_fscore_support(
            refs, preds, average="binary", zero_division=0
        )
        return {"precision": float(p), "recall": float(r), "f1": float(f1)}