Spaces:
No application file
No application file
File size: 5,983 Bytes
56bd5b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
from typing import Dict, List
import evaluate
from datasets import Features, Sequence, Value
from sklearn.metrics import accuracy_score
from research_eval.utils.preprocessing import absa_term_preprocess
_CITATION = """
"""
_DESCRIPTION = """
Evaluation metrics for Aspect-Based Sentiment Analysis (ABSA) including precision, recall, and F1 score for aspect terms and polarities.
"""
_KWARGS_DESCRIPTION = """
Computes precision, recall, and F1 score for aspect terms and polarities in Aspect-Based Sentiment Analysis (ABSA).
Args:
predictions: List of ABSA predictions with the following structure:
- 'aspects': Sequence of aspect annotations, each with the following keys:
- 'term': Aspect term
- 'polarity': Polarity of the aspect term
references: List of ABSA references with the same structure as predictions.
Returns:
aspect_precision: Precision score for aspect terms
aspect_recall: Recall score for aspect terms
aspect_f1: F1 score for aspect terms
polarity_precision: Precision score for aspect polarities
polarity_recall: Recall score for aspect polarities
polarity_f1: F1 score for aspect polarities
"""
class AbsaEvaluatorTest(evaluate.Metric):
    """Metric for Aspect-Based Sentiment Analysis (ABSA) outputs.

    Two extraction subtasks are scored with precision/recall/F1 (the
    ``"aspects"`` / ``"term"`` field and the ``"category"`` / ``"category"``
    field), and the polarities paired with each are scored with accuracy.
    """

    @staticmethod
    def _sample_features():
        """Return a fresh schema for one side (predictions or references).

        A new object is built on every call so that the two sides never
        share a mutable ``Features`` instance.
        """
        return Features(
            {
                "aspects": Features(
                    {
                        "term": Sequence(Value("string")),
                        "polarity": Sequence(Value("string")),
                    }
                ),
                "category": Features(
                    {
                        "category": Sequence(Value("string")),
                        "polarity": Sequence(Value("string")),
                    }
                ),
            }
        )

    def _info(self):
        """Describe the metric: texts plus the expected input schema."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=Features(
                {
                    "predictions": self._sample_features(),
                    "references": self._sample_features(),
                }
            ),
        )

    def _score_subtask(self, predictions, references, subtask_key, subtask_value):
        """Preprocess one subtask and score its extraction and polarity parts.

        Returns:
            A pair ``(extraction_results, polarity_accuracy)`` where the first
            item is the dictionary produced by :meth:`semeval_metric` and the
            second is the value returned by ``accuracy_score``.
        """
        # absa_term_preprocess is a project helper; it is unpacked as
        # (truth labels, predicted labels, truth polarities, predicted
        # polarities). NOTE(review): the polarity sequences are presumably
        # already aligned one-to-one, as accuracy_score needs -- confirm.
        (
            truth_labels,
            pred_labels,
            truth_polarities,
            pred_polarities,
        ) = absa_term_preprocess(
            references=references,
            predictions=predictions,
            subtask_key=subtask_key,
            subtask_value=subtask_value,
        )
        extraction_results = self.semeval_metric(truth_labels, pred_labels)
        polarity_accuracy = accuracy_score(truth_polarities, pred_polarities)
        return extraction_results, polarity_accuracy

    def _compute(self, predictions, references):
        """Score aspect-term extraction and category detection.

        Returns:
            A dictionary with the keys ``term_extraction_results``,
            ``term_polarity_results_accuracy``, ``category_detection_results``
            and ``category_polarity_results_accuracy``.
        """
        term_results, term_polarity_acc = self._score_subtask(
            predictions,
            references,
            subtask_key="aspects",
            subtask_value="term",
        )
        category_results, cat_polarity_acc = self._score_subtask(
            predictions,
            references,
            subtask_key="category",
            subtask_value="category",
        )
        return {
            "term_extraction_results": term_results,
            "term_polarity_results_accuracy": term_polarity_acc,
            "category_detection_results": category_results,
            "category_polarity_results_accuracy": cat_polarity_acc,
        }

    def semeval_metric(
        self, truths: List[List[str]], preds: List[List[str]]
    ) -> Dict[str, float]:
        """Compute micro-averaged precision, recall and F1 for an extraction task.

        Labels are matched per sample as multisets: each ground-truth label
        can be matched by at most one predicted label, so a prediction that
        is repeated more often than it occurs in the ground truth only counts
        as correct as many times as the ground truth contains it. This keeps
        precision and recall within [0, 1].

        Parameters:
            truths: List of lists, where each list contains the ground truth
                labels for a sample.
            preds: List of lists, where each list contains the predicted
                labels for a sample.

        Returns:
            A dictionary with ``precision``, ``recall``, ``f1_score`` and the
            raw counts ``common`` (matched labels), ``retrieved`` (predicted
            labels) and ``relevant`` (ground-truth labels), all as floats.

        Adapted from:
        https://github.com/davidsbatista/Aspect-Based-Sentiment-Analysis/blob/1d9c8ec1131993d924e96676fa212db6b53cb870/libraries/baselines.py#L387
        """
        beta = 1  # F-beta weight; 1 yields the usual balanced F1.
        common, relevant, retrieved = 0.0, 0.0, 0.0
        for truth, pred in zip(truths, preds):
            # Consume a ground-truth label once it has been matched so that
            # duplicated predictions cannot be credited repeatedly. A list is
            # used (rather than a Counter) so labels need not be hashable.
            unmatched = list(truth)
            for label in pred:
                if label in unmatched:
                    unmatched.remove(label)
                    common += 1
            retrieved += len(pred)
            relevant += len(truth)

        precision = common / retrieved if retrieved > 0 else 0.0
        recall = common / relevant if relevant > 0 else 0.0
        if precision > 0 and recall > 0:
            f1 = (
                (1 + beta**2)
                * precision
                * recall
                / ((precision * beta**2) + recall)
            )
        else:
            # Avoids a zero denominator when nothing was matched.
            f1 = 0.0
        return {
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "common": common,
            "retrieved": retrieved,
            "relevant": relevant,
        }
|