Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

App Files Files Community

Miaoran000 committed on Apr 17, 2024

Commit

150bb15

1 Parent(s): 8a6bfdc

minor update and extend to support different APIs

Browse files

Files changed (15) hide show

.gitignore +3 -0
generation_results/CohereForAI/c4ai-command-r-plus.csv +0 -0
generation_results/databricks/dbrx-instruct.csv +0 -0
generation_results/google/gemma-1.1-2b-it.csv +0 -0
generation_results/google/gemma-1.1-7b-it.csv +0 -0
generation_results/microsoft/WizardLM-2-8x22B.csv +0 -0
generation_results/mistralai/mixtral-8x22b.csv +0 -0
generation_results/mistralai/mixtral-8x22b_v1.csv +0 -0
generation_results/openai/GPT-4-Turbo.csv +0 -0
src/backend/evaluate_model.py +43 -3
src/backend/manage_requests.py +1 -1
src/backend/model_operations.py +175 -51
src/backend/run_eval_suite.py +23 -10
src/backend/util.py +5 -4
src/envs.py +2 -2

.gitignore CHANGED Viewed

@@ -15,3 +15,6 @@ eval-queue-bk/
 eval-results-bk/
 src/assets/model_counts.html

 eval-results-bk/
 src/assets/model_counts.html
+generated_results/
+Hallucination Leaderboard Results

generation_results/CohereForAI/c4ai-command-r-plus.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_results/databricks/dbrx-instruct.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_results/google/gemma-1.1-2b-it.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_results/google/gemma-1.1-7b-it.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_results/microsoft/WizardLM-2-8x22B.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_results/mistralai/mixtral-8x22b.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_results/mistralai/mixtral-8x22b_v1.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_results/openai/GPT-4-Turbo.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

src/backend/evaluate_model.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import logging
 import pandas as pd
 import src.envs as envs
@@ -70,13 +72,16 @@ class Evaluator:
         """
         try:
             df = pd.read_csv(envs.DATASET_PATH)
-            generated_summaries_df = self.summary_generator.generate_summaries(df)
             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
-            hallucination_scores = self.eval_model.evaluate_hallucination(
-                generated_summaries_df)
             factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
             hallucination_rate = self.eval_model.hallucination_rate
@@ -93,3 +98,38 @@ class Evaluator:
         except Exception as e:
             logging.error(f"Error during evaluation: {e}")
             raise

 import logging
 import pandas as pd
+import os
+import csv
 import src.envs as envs
         """
         try:
             df = pd.read_csv(envs.DATASET_PATH)
+            # print(envs.DATASET_PATH)
+            # print(df.shape)
+            # print(df.iloc[-1])
+            self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv")
             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
+            self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
+                self.generated_summaries_df)
             factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
             hallucination_rate = self.eval_model.hallucination_rate
         except Exception as e:
             logging.error(f"Error during evaluation: {e}")
             raise
+    def write_results(self):
+        print('Updating result files')
+        leaderboard_path = os.getcwd() # the path of leaderboard folder
+        print(leaderboard_path)
+        working_path = os.path.join(leaderboard_path, 'Hallucination Leaderboard Results')
+        if not os.path.exists(working_path):
+            logging.error(f"Need to first download the results from google drive to the learderboard folder")
+            raise
+        source_summary_df = self.generated_summaries_df[["source", "summary"]]
+        # #update leaderboard_summaries.csv
+        # #first remove previous results for the current model
+        # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8', sep="\t")
+        # mask = existing_df['model'] == self.model
+        # existing_df = existing_df[~mask]
+        # # get new result
+        leaderboard_summaries_df = source_summary_df
+        leaderboard_summaries_df.insert(2, "model", [self.model]*leaderboard_summaries_df.shape[0])
+        leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
+        print('leaderboard_summaries.csv has been updated')
+        # update leaderboard_summaries_with_scores.csv
+        # BUG: get error when opening the file
+        # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
+        #                         encoding='utf-8', sep=",", on_bad_lines='warn', quotechar='"', quoting=2)
+        # print(existing_df.shape)
+        # mask = existing_df['model'] == self.model
+        # existing_df = existing_df[~mask]
+        # get new result
+        leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
+        leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
+        leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
+        print('leaderboard_summaries_with_scores.csv has been updated')

src/backend/manage_requests.py CHANGED Viewed

@@ -12,7 +12,7 @@ class EvalRequest:
     model: str
     # private: bool
     status: str
-    json_filepath: str
     private: bool = False
     weight_type: str = "Original"
     model_type: str = ""  # pretrained, finetuned, with RL

     model: str
     # private: bool
     status: str
+    json_filepath: str = None
     private: bool = False
     weight_type: str = "Original"
     model_type: str = ""  # pretrained, finetuned, with RL

src/backend/model_operations.py CHANGED Viewed

@@ -2,17 +2,30 @@ import os
 import time
 from datetime import datetime
 import logging
 import numpy as np
 import pandas as pd
 import spacy
 from sentence_transformers import CrossEncoder
-from litellm import completion
 from tqdm import tqdm
 import src.backend.util as util
 import src.envs as envs
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
@@ -36,18 +49,6 @@ def load_evaluation_model(model_path):
     return model
-def generate_summary(model: str, system_prompt: str, user_prompt: str, api_base: str):
-    response = completion(
-        model=model,
-        messages=[{"role": "system", "content": system_prompt},
-                    {"role": "user", "content": user_prompt}],
-        temperature=0.0,
-        max_tokens=1024,
-        api_base=api_base,
-    )
-    return response['choices'][0]['message']['content']
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
@@ -82,6 +83,7 @@ class SummaryGenerator:
             model_id (str): Identifier for the model.
             revision (str): Revision of the model.
         """
         self.model = f"huggingface/{model_id}"
         self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
         self.summaries_df = pd.DataFrame()
@@ -89,8 +91,9 @@ class SummaryGenerator:
         self.avg_length = None
         self.answer_rate = None
         self.exceptions = None
-    def generate_summaries(self, df):
         """Generate summaries for a given DataFrame of source docs.
         Args:
@@ -99,47 +102,155 @@ class SummaryGenerator:
         Returns:
             summaries_df (DataFrame): Generated summaries by the model.
         """
-        source, summary, dataset = [], [], []
         exceptions = []
-        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
-            _source = row['text']
-            _dataset = row['dataset']
-            system_prompt = envs.SYSTEM_PROMPT
-            user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
-            while True:
-                try:
-                    _summary = generate_summary(self.model, system_prompt,
-                                                user_prompt, self.api_base)
-                    break
-                except Exception as e:
-                    if 'Rate limit reached' in str(e):
-                        wait_time = 3660
-                        current_time = datetime.now().strftime('%H:%M:%S')
-                        print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
-                        time.sleep(wait_time)
-                    else:
-                        print(f"Error at index {index}: {e}")
-                        _summary = ""
-                        exceptions.append(index)
                         break
-            summary.append(_summary)
-            source.append(_source)
-            dataset.append(_dataset)
-            # Sleep to prevent hitting rate limits too frequently
-            time.sleep(1)
-        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
-                                        columns=["source", "summary", "dataset"])
         self.exceptions = exceptions
         self._compute_avg_length()
         self._compute_answer_rate()
         return self.summaries_df
     def _compute_avg_length(self):
         """
@@ -203,22 +314,35 @@ class EvaluationModel:
             list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
         """
         hem_scores = []
         source_summary_pairs = util.create_pairs(summaries_df)
         for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
             if util.is_summary_valid(summary):
                 try:
-                    score = self.model.predict([doc, summary])[0]
                     if not isinstance(score, float):
-                        logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
-                        continue
                     hem_scores.append(score)
                 except Exception as e:
                     logging.error(f"Error while running HEM: {e}")
                     raise
         self.scores = hem_scores
-        return hem_scores
     def compute_factual_consistency_rate(self, threshold=0.5):

 import time
 from datetime import datetime
 import logging
+from pathlib import Path
+import requests
+import json
 import numpy as np
 import pandas as pd
 import spacy
 from sentence_transformers import CrossEncoder
+import litellm
+# from litellm import completion
 from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
+# from accelerate import PartialState
+# from accelerate.inference import prepare_pippy
+import torch
+import cohere
+from openai import OpenAI
 import src.backend.util as util
 import src.envs as envs
+litellm.set_verbose=False
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
     return model
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
             model_id (str): Identifier for the model.
             revision (str): Revision of the model.
         """
+        self.model_id = model_id
         self.model = f"huggingface/{model_id}"
         self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
         self.summaries_df = pd.DataFrame()
         self.avg_length = None
         self.answer_rate = None
         self.exceptions = None
+        self.local_model = None
+    def generate_summaries(self, df, save_path=None):
         """Generate summaries for a given DataFrame of source docs.
         Args:
         Returns:
             summaries_df (DataFrame): Generated summaries by the model.
         """
         exceptions = []
+        if (save_path is not None) and os.path.exists(save_path):
+            self.summaries_df = pd.read_csv(save_path)
+            print(f'Loaded generated summaries from {save_path}')
+        else:
+            source, summary, dataset = [], [], []
+            print(f"Total: {df.shape[0]}")
+            for index, row in tqdm(df.iterrows(), total=df.shape[0]):
+                _source = row['text']
+                _dataset = row['dataset']
+                system_prompt = envs.SYSTEM_PROMPT
+                user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
+                while True:
+                    try:
+                        _summary = self.generate_summary(system_prompt, user_prompt)
+                        # print(f"Finish index {index}")
                         break
+                    except Exception as e:
+                        if 'Rate limit reached' in str(e):
+                            wait_time = 3660
+                            current_time = datetime.now().strftime('%H:%M:%S')
+                            print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
+                            time.sleep(wait_time)
+                        elif 'is currently loading' in str(e):
+                            wait_time = 200
+                            print(f"Model is loading, wait for {wait_time}")
+                            time.sleep(wait_time)
+                        else:
+                            print(f"Error at index {index}: {e}")
+                            _summary = ""
+                            exceptions.append(index)
+                            break
+                summary.append(_summary)
+                source.append(_source)
+                dataset.append(_dataset)
+                # Sleep to prevent hitting rate limits too frequently
+                time.sleep(1)
+            self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
+                                            columns=["source", "summary", "dataset"])
+            if save_path is not None:
+                print(f'Save summaries to {save_path}')
+                fpath = Path(save_path)
+                fpath.parent.mkdir(parents=True, exist_ok=True)
+                self.summaries_df.to_csv(fpath)
         self.exceptions = exceptions
         self._compute_avg_length()
         self._compute_answer_rate()
         return self.summaries_df
+    def generate_summary(self, system_prompt: str, user_prompt: str):
+        # Using Together AI API
+        if 'mixtral' in self.model_id.lower() or 'dbrx' in self.model_id.lower() or 'wizardlm' in self.model_id.lower(): # For mixtral and dbrx models, use Together AI API
+            suffix = "completions" if ('mixtral' in self.model_id.lower() or 'base' in self.model_id.lower()) else "chat/completions"
+            url = f"https://api.together.xyz/v1/{suffix}"
+            payload = {
+                "model": self.model_id,
+                # "max_tokens": 4096,
+                'max_new_tokens': 250,
+                "temperature": 0.0,
+                'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
+            }
+            if 'mixtral' in self.model_id.lower():
+                # payload['prompt'] = user_prompt
+                # payload['prompt'] = "Write a summary of the following passage:\nPassage:\n" + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
+                payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
+                print(payload)
+            else:
+                payload['messages'] = [{"role": "system", "content": system_prompt},
+                                        {"role": "user", "content": user_prompt}]
+            headers = {
+                "accept": "application/json",
+                "content-type": "application/json",
+                "Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"
+            }
+            response = requests.post(url, json=payload, headers=headers)
+            try:
+                result = json.loads(response.text)
+                # print(result)
+                result = result["choices"][0]
+                if 'message' in result:
+                    result = result["message"]["content"].strip()
+                else:
+                    result = result["text"]
+                    result_candidates = [result_cancdidate for result_cancdidate in result.split('\n\n') if len(result_cancdidate) > 0]
+                    result = result_candidates[0]
+                print(result)
+            except:
+                print(response)
+                result = ''
+            return result
+        # Using OpenAI API
+        elif 'gpt' in self.model_id.lower():
+            response = litellm.completion(
+                model=self.model_id.replace('openai/',''),
+                messages=[{"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_prompt}],
+                temperature=0.0,
+                max_tokens=250,
+            )
+            result = response['choices'][0]['message']['content']
+            print(result)
+            return result
+        # Using HF API or download checkpoints
+        if self.local_model is None:
+            try: # try use HuggingFace API
+                response = litellm.completion(
+                    model='command-r-plus' if 'command' in self.model else self.model,
+                    messages=[{"role": "system", "content": system_prompt},
+                                {"role": "user", "content": user_prompt}],
+                    temperature=0.0,
+                    max_tokens=1024,
+                    api_base=self.api_base,
+                )
+                result = response['choices'][0]['message']['content']
+            except: # fail to call api. run it locally.
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
+                print("Tokenizer loaded")
+                self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
+                print("Local model loaded")
+        # Using local model
+        if self.local_model: # cannot call API. using local model
+            messages=[
+                {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
+                {"role": "user", "content": user_prompt}
+            ],
+            prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
+            print(prompt)
+            input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
+            with torch.no_grad():
+                outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
+            result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            result = result.replace(prompt[0], '')
+            print(result)
+        return result
     def _compute_avg_length(self):
         """
             list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
         """
         hem_scores = []
+        sources = []
+        summaries = []
         source_summary_pairs = util.create_pairs(summaries_df)
         for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
             if util.is_summary_valid(summary):
                 try:
+                    # summary_pieces = summary.split('\n')
+                    # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
+                    summary = summary.replace('<bos>','').replace('<eos>','')
+                    # print([doc, summary])
+                    # print(self.model.predict([doc, summary]))
+                    score = self.model.predict([doc, summary])# [0]
                     if not isinstance(score, float):
+                        try:
+                            score = score.item()
+                        except:
+                            logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
+                            continue
                     hem_scores.append(score)
+                    sources.append(doc)
+                    summaries.append(summary)
                 except Exception as e:
                     logging.error(f"Error while running HEM: {e}")
                     raise
         self.scores = hem_scores
+        eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
+        return hem_scores, eval_results
     def compute_factual_consistency_rate(self, threshold=0.5):

src/backend/run_eval_suite.py CHANGED Viewed

@@ -14,7 +14,8 @@ logging.getLogger("openai").setLevel(logging.WARNING)
 def run_evaluation(eval_request: EvalRequest, batch_size, device,
-                local_dir: str, results_repo: str, no_cache=True, limit=None):
     """
     Run the evaluation for a given model and upload the results.
@@ -34,11 +35,20 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
     if limit:
         logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
     try:
         evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
                             batch_size, device, no_cache, limit, write_out=True,
                             output_base_path='logs')
         results = evaluator.evaluate()
     except Exception as e:
         logging.error(f"Error during evaluation: {e}")
         raise
@@ -46,17 +56,20 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
     dumped = json.dumps(results, indent=2)
     logging.info(dumped)
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"),
-                            f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
     with open(output_path, "w") as f:
         f.write(dumped)
-    envs.API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
     return results

 def run_evaluation(eval_request: EvalRequest, batch_size, device,
+                local_dir: str, results_repo: str, no_cache=True, limit=None,
+                need_check=True, write_results=True):
     """
     Run the evaluation for a given model and upload the results.
     if limit:
         logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
+    output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
+    # if os.path.exists(output_folder):
+    #     f_name = os.listdir(output_folder)[-1]
+    #     print(f"Loading results from {os.path.join(output_folder, f_name)}")
+    #     results = json.loads(os.path.join(output_folder, f_name))
+    #     dumped = json.dumps(results, indent=2)
+    #     logging.info(dumped)
+    # else:
     try:
         evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
                             batch_size, device, no_cache, limit, write_out=True,
                             output_base_path='logs')
         results = evaluator.evaluate()
+        evaluator.write_results()
     except Exception as e:
         logging.error(f"Error during evaluation: {e}")
         raise
     dumped = json.dumps(results, indent=2)
     logging.info(dumped)
+    output_path = os.path.join(output_folder,
+                            f"results_{datetime.now()}.json") #
+    os.makedirs(output_folder, exist_ok=True)
     with open(output_path, "w") as f:
         f.write(dumped)
+    print(f"Results have been saved to{output_path}")
+    if not need_check:
+        print("Path in the repo:", f"{eval_request.model}/results_{datetime.now()}.json")
+        envs.API.upload_file(
+            path_or_fileobj=output_path,
+            path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
+            repo_id=results_repo,
+            repo_type="dataset",
+        )
     return results

src/backend/util.py CHANGED Viewed

@@ -14,6 +14,7 @@ def is_summary_valid(summary: str) -> bool:
         words = summary.split()
         if len(words) >= 5:
             return True
     return False
@@ -60,16 +61,16 @@ def format_results(model_name: str, revision: str, precision: str,
         },
         "results": {
             "hallucination_rate": {
-                "hallucination_rate": hallucination_rate
             },
             "factual_consistency_rate": {
-                "factual_consistency_rate": factual_consistency_rate
             },
             "answer_rate": {
-                "answer_rate": answer_rate
             },
             "average_summary_length": {
-                "average_summary_length": avg_summary_len
             },
         }
     }

         words = summary.split()
         if len(words) >= 5:
             return True
+    # print(summary)
     return False
         },
         "results": {
             "hallucination_rate": {
+                "hallucination_rate": round(hallucination_rate,1)
             },
             "factual_consistency_rate": {
+                "factual_consistency_rate": round(factual_consistency_rate,1)
             },
             "answer_rate": {
+                "answer_rate": round(answer_rate*100,1)
             },
             "average_summary_length": {
+                "average_summary_length": round(avg_summary_len,1)
             },
         }
     }

src/envs.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
 from huggingface_hub import HfApi
@@ -19,7 +19,7 @@ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-DEVICE = "cpu"
 API = HfApi(token=TOKEN)
 DATASET_PATH = "src/datasets/leaderboard_dataset.csv"

 import os
+import torch
 from huggingface_hub import HfApi
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
 API = HfApi(token=TOKEN)
 DATASET_PATH = "src/datasets/leaderboard_dataset.csv"