Minseok Bae committed
Commit · 2c24f05
1 Parent(s): b46b972

modified the evaluation pipelines.

Files changed:
- src/backend/model_operations.py +36 -27
- src/backend/util.py +19 -0
src/backend/model_operations.py
CHANGED
@@ -6,10 +6,9 @@ import logging
 import numpy as np
 import pandas as pd
 import spacy
-# from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import CrossEncoder
-import litellm
 from litellm import completion
+from tqdm import tqdm
 
 import src.backend.util as util
 import src.envs as envs
@@ -23,8 +22,6 @@ nlp = spacy.load("en_core_web_sm")
 
 os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
 
-litellm.set_verbose=True
-
 
 def load_evaluation_model(model_path):
     """Load the evaluation model from the given path
@@ -105,7 +102,7 @@ class SummaryGenerator:
         source, summary, dataset = [], [], []
         exceptions = []
 
-        for index, row in df.iterrows():
+        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
             _source = row['text']
             _dataset = row['dataset']
 
@@ -129,11 +126,12 @@ class SummaryGenerator:
                 exceptions.append(index)
                 break
 
-
-
-
+            summary.append(_summary)
+            source.append(_source)
+            dataset.append(_dataset)
 
-
+            # Sleep to prevent hitting rate limits too frequently
+            time.sleep(1)
 
         self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
                                          columns=["source", "summary", "dataset"])
@@ -147,26 +145,28 @@ class SummaryGenerator:
         """
         Compute the average length of non-empty summaries using SpaCy.
         """
-
-
+        total_word_count = 0
+        total_count = 0
 
         for summary in self.summaries_df['summary']:
-            if summary
+            if util.is_summary_valid(summary):
                 doc = nlp(summary)
                 words = [token.text for token in doc if token.is_alpha]
-
-
+                total_word_count += len(words)
+                total_count += 1
 
-        self.avg_length = 0 if
+        self.avg_length = 0 if total_count == 0 else total_word_count / total_count
 
     def _compute_answer_rate(self):
         """
        Compute the rate of non-empty summaries.
         """
-
-
+        valid_count = sum(1 for summary in self.summaries_df['summary']
+                          if util.is_summary_valid(summary))
+
+        total_count = len(self.summaries_df)
 
-        self.answer_rate = 0 if
+        self.answer_rate = 0 if total_count == 0 else valid_count / total_count
 
 
 class EvaluationModel:
@@ -193,7 +193,7 @@ class EvaluationModel:
 
     def evaluate_hallucination(self, summaries_df):
         """
-        Evaluate the hallucination rate in summaries.
+        Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
        of the instance with the computed scores.
 
        Args:
@@ -202,14 +202,24 @@ class EvaluationModel:
        Returns:
            list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
        """
+        hem_scores = []
        source_summary_pairs = util.create_pairs(summaries_df)
-
-
-
-
-
-
-
+
+        for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
+            if util.is_summary_valid(summary):
+                try:
+                    score = self.model.predict([doc, summary])[0]
+                    if not isinstance(score, float):
+                        logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
+                        continue
+                    hem_scores.append(score)
+                except Exception as e:
+                    logging.error(f"Error while running HEM: {e}")
+                    raise
+
+        self.scores = hem_scores
+        return hem_scores
+
 
     def compute_factual_consistency_rate(self, threshold=0.5):
         """
@@ -240,4 +250,3 @@ class EvaluationModel:
         self.hallucination_rate = 100 - self.factual_consistency_rate
 
         return self.factual_consistency_rate
-
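For context, the hallucination scores collected above come from a sentence-transformers CrossEncoder held in self.model. A minimal standalone sketch of how such a model scores a (source, summary) pair follows; the checkpoint name (Vectara's hallucination evaluation model) is an assumption, since it does not appear in this diff:

    # Minimal sketch; the model name below is an assumption, not taken from this commit.
    from sentence_transformers import CrossEncoder

    model = CrossEncoder("vectara/hallucination_evaluation_model")
    # predict() takes (source, summary) pairs and returns one consistency score per pair;
    # scores near 1 suggest the summary is supported by the source, near 0 suggest hallucination.
    scores = model.predict([["The cat sat on the mat.", "A cat was sitting on a mat."]])
    print(scores[0])

A score at or above the threshold argument of compute_factual_consistency_rate (0.5 by default) would presumably count as factually consistent, with hallucination_rate reported as 100 minus that rate.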
src/backend/util.py
CHANGED
@@ -1,3 +1,22 @@
+def is_summary_valid(summary: str) -> bool:
+    """
+    Checks if the summary is valid.
+
+    A summary is valid if it is not empty and contains at least five words.
+
+    Args:
+        summary (str): The summary to check.
+
+    Returns:
+        bool: True if the summary is valid, False otherwise.
+    """
+    if isinstance(summary, str):
+        words = summary.split()
+        if len(words) >= 5:
+            return True
+    return False
+
+
 def create_pairs(df):
     """
     Creates pairs of source and summary from the dataframe.