Minseok Bae committed
Commit · 2c24f05
1 Parent(s): b46b972

modified the evaluation pipelines.

Files changed:
- src/backend/model_operations.py +36 -27
- src/backend/util.py +19 -0
src/backend/model_operations.py
CHANGED
@@ -6,10 +6,9 @@ import logging
 import numpy as np
 import pandas as pd
 import spacy
-# from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import CrossEncoder
-import litellm
 from litellm import completion
+from tqdm import tqdm
 
 import src.backend.util as util
 import src.envs as envs
@@ -23,8 +22,6 @@ nlp = spacy.load("en_core_web_sm")
 
 os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
 
-litellm.set_verbose=True
-
 
 def load_evaluation_model(model_path):
     """Load the evaluation model from the given path
@@ -105,7 +102,7 @@ class SummaryGenerator:
         source, summary, dataset = [], [], []
         exceptions = []
 
-        for index, row in df.iterrows():
+        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
             _source = row['text']
             _dataset = row['dataset']
 
@@ -129,11 +126,12 @@ class SummaryGenerator:
                 exceptions.append(index)
                 break
 
-
-
-
+            summary.append(_summary)
+            source.append(_source)
+            dataset.append(_dataset)
 
-
+            # Sleep to prevent hitting rate limits too frequently
+            time.sleep(1)
 
         self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
                                          columns=["source", "summary", "dataset"])
@@ -147,26 +145,28 @@ class SummaryGenerator:
         """
         Compute the average length of non-empty summaries using SpaCy.
         """
-
-
+        total_word_count = 0
+        total_count = 0
 
         for summary in self.summaries_df['summary']:
-            if summary
+            if util.is_summary_valid(summary):
                 doc = nlp(summary)
                 words = [token.text for token in doc if token.is_alpha]
-
-
+                total_word_count += len(words)
+                total_count += 1
 
-        self.avg_length = 0 if
+        self.avg_length = 0 if total_count == 0 else total_word_count / total_count
 
     def _compute_answer_rate(self):
         """
        Compute the rate of non-empty summaries.
         """
-
-
+        valid_count = sum(1 for summary in self.summaries_df['summary']
+                          if util.is_summary_valid(summary))
+
+        total_count = len(self.summaries_df)
 
-        self.answer_rate = 0 if
+        self.answer_rate = 0 if total_count == 0 else valid_count / total_count
 
 
 class EvaluationModel:
@@ -193,7 +193,7 @@ class EvaluationModel:
 
     def evaluate_hallucination(self, summaries_df):
         """
-        Evaluate the hallucination rate in summaries.
+        Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
        of the instance with the computed scores.
 
        Args:
@@ -202,14 +202,24 @@ class EvaluationModel:
        Returns:
            list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
        """
+        hem_scores = []
        source_summary_pairs = util.create_pairs(summaries_df)
-
-
-
-
-
-
-
+
+        for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
+            if util.is_summary_valid(summary):
+                try:
+                    score = self.model.predict([doc, summary])[0]
+                    if not isinstance(score, float):
+                        logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
+                        continue
+                    hem_scores.append(score)
+                except Exception as e:
+                    logging.error(f"Error while running HEM: {e}")
+                    raise
+
+        self.scores = hem_scores
+        return hem_scores
+
 
     def compute_factual_consistency_rate(self, threshold=0.5):
         """
@@ -240,4 +250,3 @@ class EvaluationModel:
         self.hallucination_rate = 100 - self.factual_consistency_rate
 
         return self.factual_consistency_rate
-
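For context, the hallucination scores collected above come from a sentence-transformers CrossEncoder held in self.model. A minimal standalone sketch of how such a model scores a (source, summary) pair follows; the checkpoint name (Vectara's hallucination evaluation model) is an assumption, since it does not appear in this diff:

    # Minimal sketch; the model name below is an assumption, not taken from this commit.
    from sentence_transformers import CrossEncoder

    model = CrossEncoder("vectara/hallucination_evaluation_model")
    # predict() takes (source, summary) pairs and returns one consistency score per pair;
    # scores near 1 suggest the summary is supported by the source, near 0 suggest hallucination.
    scores = model.predict([["The cat sat on the mat.", "A cat was sitting on a mat."]])
    print(scores[0])

A score at or above the threshold argument of compute_factual_consistency_rate (0.5 by default) would presumably count as factually consistent, with hallucination_rate reported as 100 minus that rate.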
src/backend/util.py
CHANGED
@@ -1,3 +1,22 @@
+def is_summary_valid(summary: str) -> bool:
+    """
+    Checks if the summary is valid.
+
+    A summary is valid if it is not empty and contains at least five words.
+
+    Args:
+        summary (str): The summary to check.
+
+    Returns:
+        bool: True if the summary is valid, False otherwise.
+    """
+    if isinstance(summary, str):
+        words = summary.split()
+        if len(words) >= 5:
+            return True
+    return False
+
+
 def create_pairs(df):
     """
     Creates pairs of source and summary from the dataframe.