cyx96 committed
Commit 3193aca · 1 Parent(s): f10e39e

added phi4

.gitignore CHANGED
@@ -17,6 +17,7 @@ eval-results-bk_hhem21/
17
  eval-results_hhem21/
18
  hhem21_server/
19
  leaderboard_results/
 
20
 
21
  src/assets/model_counts.html
22
 
 
17
  eval-results_hhem21/
18
  hhem21_server/
19
  leaderboard_results/
20
+ leaderboard-bk/
21
 
22
  src/assets/model_counts.html
23
 
app.py CHANGED
@@ -21,6 +21,13 @@ try:
21
  )
22
  except Exception:
23
  restart_space()
 
 
 
 
 
 
 
24
  try:
25
  print(envs.EVAL_RESULTS_PATH)
26
  snapshot_download(
@@ -32,24 +39,15 @@ except Exception:
32
  raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
33
  leaderboard_df = original_df.copy()
34
 
35
- (
36
- finished_eval_queue_df,
37
- running_eval_queue_df,
38
- pending_eval_queue_df,
39
- ) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
40
-
41
-
42
  # Searching and filtering
43
  def update_table(
44
  hidden_df: pd.DataFrame,
45
  columns: list,
46
  type_query: list,
47
- precision_query: str,
48
- size_query: list,
49
- show_deleted: bool,
50
  query: str,
51
  ):
52
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
 
53
  filtered_df = filter_queries(query, filtered_df)
54
  df = select_columns(filtered_df, columns)
55
  return df
@@ -83,32 +81,23 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
83
  final_df.append(temp_filtered_df)
84
  if len(final_df) > 0:
85
  filtered_df = pd.concat(final_df)
86
- filtered_df = filtered_df.drop_duplicates(
87
- subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name]
88
- )
89
 
90
  return filtered_df
91
 
92
 
93
- def filter_models(
94
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
95
- ) -> pd.DataFrame:
96
- # Show all models
97
- # if show_deleted:
98
- # filtered_df = df
99
- # else: # Show only still on the hub models
100
- # filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]]
101
-
102
  filtered_df = df
103
 
104
  type_emoji = [t[0] for t in type_query]
105
  filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
106
- filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
107
 
108
- numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
109
- params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
110
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
111
- filtered_df = filtered_df.loc[mask]
112
 
113
  return filtered_df
114
 
@@ -148,12 +137,8 @@ with demo:
148
  elem_id="column-select",
149
  interactive=True,
150
  )
151
- # with gr.Row():
152
- # deleted_models_visibility = gr.Checkbox(
153
- # value=False, label="Show gated/private/deleted models", interactive=True
154
- # )
155
  with gr.Column(min_width=320):
156
- #with gr.Box(elem_id="box-filter"):
157
  filter_columns_type = gr.CheckboxGroup(
158
  label="Model types",
159
  choices=[t.to_str() for t in utils.ModelType],
@@ -161,20 +146,6 @@ with demo:
161
  interactive=True,
162
  elem_id="filter-columns-type",
163
  )
164
- # filter_columns_precision = gr.CheckboxGroup(
165
- # label="Precision",
166
- # choices=[i.value.name for i in utils.Precision],
167
- # value=[i.value.name for i in utils.Precision],
168
- # interactive=True,
169
- # elem_id="filter-columns-precision",
170
- # )
171
- # filter_columns_size = gr.CheckboxGroup(
172
- # label="Model sizes (in billions of parameters)",
173
- # choices=list(utils.NUMERIC_INTERVALS.keys()),
174
- # value=list(utils.NUMERIC_INTERVALS.keys()),
175
- # interactive=True,
176
- # elem_id="filter-columns-size",
177
- # )
178
 
179
  leaderboard_table = gr.components.Dataframe(
180
  value=leaderboard_df[
@@ -203,23 +174,17 @@ with demo:
203
  hidden_leaderboard_table_for_search,
204
  shown_columns,
205
  filter_columns_type,
206
- # filter_columns_precision,
207
- # filter_columns_size,
208
- # deleted_models_visibility,
209
  search_bar,
210
  ],
211
  leaderboard_table,
212
  )
213
- for selector in [shown_columns, filter_columns_type]: #, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
214
  selector.change(
215
  update_table,
216
  [
217
  hidden_leaderboard_table_for_search,
218
  shown_columns,
219
  filter_columns_type,
220
- # filter_columns_precision,
221
- # filter_columns_size,
222
- # deleted_models_visibility,
223
  search_bar,
224
  ],
225
  leaderboard_table,
 
21
  )
22
  except Exception:
23
  restart_space()
24
+
25
+ (
26
+ finished_eval_queue_df,
27
+ running_eval_queue_df,
28
+ pending_eval_queue_df,
29
+ ) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
30
+
31
  try:
32
  print(envs.EVAL_RESULTS_PATH)
33
  snapshot_download(
 
39
  raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
40
  leaderboard_df = original_df.copy()
41
 
 
 
 
 
 
 
 
42
  # Searching and filtering
43
  def update_table(
44
  hidden_df: pd.DataFrame,
45
  columns: list,
46
  type_query: list,
 
 
 
47
  query: str,
48
  ):
49
+ print(f"filter: columns={columns}, type_query={type_query}, query={query}")
50
+ filtered_df = filter_models(hidden_df, type_query)
51
  filtered_df = filter_queries(query, filtered_df)
52
  df = select_columns(filtered_df, columns)
53
  return df
 
81
  final_df.append(temp_filtered_df)
82
  if len(final_df) > 0:
83
  filtered_df = pd.concat(final_df)
84
+ #filtered_df = filtered_df.drop_duplicates(subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name])
85
+ filtered_df = filtered_df.drop_duplicates(subset=[utils.AutoEvalColumn.model.name])
 
86
 
87
  return filtered_df
88
 
89
 
90
+ def filter_models(df: pd.DataFrame, type_query: list) -> pd.DataFrame:
 
 
 
 
 
 
 
 
91
  filtered_df = df
92
 
93
  type_emoji = [t[0] for t in type_query]
94
  filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
95
+ # filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
96
 
97
+ # numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
98
+ # params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
99
+ # mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
100
+ # filtered_df = filtered_df.loc[mask]
101
 
102
  return filtered_df
103
 
 
137
  elem_id="column-select",
138
  interactive=True,
139
  )
140
+
 
 
 
141
  with gr.Column(min_width=320):
 
142
  filter_columns_type = gr.CheckboxGroup(
143
  label="Model types",
144
  choices=[t.to_str() for t in utils.ModelType],
 
146
  interactive=True,
147
  elem_id="filter-columns-type",
148
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  leaderboard_table = gr.components.Dataframe(
151
  value=leaderboard_df[
 
174
  hidden_leaderboard_table_for_search,
175
  shown_columns,
176
  filter_columns_type,
 
 
 
177
  search_bar,
178
  ],
179
  leaderboard_table,
180
  )
181
+ for selector in [shown_columns, filter_columns_type]:
182
  selector.change(
183
  update_table,
184
  [
185
  hidden_leaderboard_table_for_search,
186
  shown_columns,
187
  filter_columns_type,
 
 
 
188
  search_bar,
189
  ],
190
  leaderboard_table,
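
With the precision, size, and deleted-model filters removed, the filtering path above reduces to a model-type filter plus the free-text search. A minimal standalone sketch of that flow, with made-up column names and choice strings (the real names come from utils.AutoEvalColumn and utils.ModelType):

    import pandas as pd

    # Toy leaderboard frame; "T" stands in for the model-type-symbol column.
    df = pd.DataFrame({
        "T": ["🟢", "🔶", "🟢"],
        "Model": ["org/base-a", "org/chat-b", "org/base-c"],
        "Hallucination Rate (%)": [1.2, 3.4, 2.0],
    })

    def filter_models(df: pd.DataFrame, type_query: list) -> pd.DataFrame:
        # Keep rows whose symbol matches the first character of each selected
        # choice, mirroring `type_emoji = [t[0] for t in type_query]` above.
        type_emoji = [t[0] for t in type_query]
        return df.loc[df["T"].isin(type_emoji)]

    print(filter_models(df, ["🟢 : pretrained"]))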
main_backend.py CHANGED
@@ -20,10 +20,8 @@ RUNNING_STATUS = "RUNNING"
20
  FINISHED_STATUS = "FINISHED"
21
  FAILED_STATUS = "FAILED"
22
 
23
- snapshot_download(repo_id=envs.RESULTS_REPO, revision="main",
24
- local_dir=envs.EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
25
- snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
26
- local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
27
 
28
 
29
  def run_auto_eval(args):
@@ -41,9 +39,9 @@ def run_auto_eval(args):
41
  local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
42
  )
43
  logging.info("Checked completed evals")
44
- eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
45
- hf_repo=envs.QUEUE_REPO,
46
- local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
47
  logging.info("Got eval requests")
48
  eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
49
  logging.info("Sorted eval requests")
@@ -65,20 +63,11 @@ def run_auto_eval(args):
65
  eval_request = eval_requests[0]
66
  pp.pprint(eval_request)
67
 
68
- # manage_requests.set_eval_request(
69
- # api=envs.API,
70
- # eval_request=eval_request,
71
- # new_status=RUNNING_STATUS,
72
- # hf_repo=envs.QUEUE_REPO,
73
- # local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
74
- # )
75
- # logging.info("Set eval request to running, now running eval")
76
-
77
  run_eval_suite.run_evaluation(
78
  eval_request=eval_request,
79
  local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
80
  results_repo=envs.RESULTS_REPO,
81
- batch_size=1,
82
  device=envs.DEVICE,
83
  no_cache=True,
84
  need_check=not args.publish,
@@ -88,6 +77,7 @@ def run_auto_eval(args):
88
  else:
89
  eval_request = manage_requests.EvalRequest(
90
  model=args.model,
 
91
  status=PENDING_STATUS,
92
  precision=args.precision
93
  )
@@ -98,10 +88,13 @@ def run_auto_eval(args):
98
  eval_request=eval_request,
99
  local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
100
  results_repo=envs.RESULTS_REPO,
101
- batch_size=1,
102
  device=envs.DEVICE,
103
  need_check=not args.publish,
104
- write_results=args.update
 
 
 
105
  )
106
  logging.info("Reproducibility eval finished")
107
 
@@ -112,9 +105,14 @@ def main():
112
  # Optional arguments
113
  parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
114
  parser.add_argument("--model", type=str, default=None, help="Your Model ID")
 
115
  parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
116
  parser.add_argument("--publish", type=bool, default=False, help="whether directly publish the evaluation results on HF")
117
  parser.add_argument("--update", type=bool, default=False, help="whether to update google drive files")
 
 
 
 
118
 
119
  args = parser.parse_args()
120
 
 
20
  FINISHED_STATUS = "FINISHED"
21
  FAILED_STATUS = "FAILED"
22
 
23
+ snapshot_download(repo_id=envs.RESULTS_REPO, revision="main", local_dir=envs.EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
24
+ snapshot_download(repo_id=envs.QUEUE_REPO, revision="main", local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
25
 
26
 
27
  def run_auto_eval(args):
 
39
  local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
40
  )
41
  logging.info("Checked completed evals")
42
+ eval_requests = manage_requests.get_eval_requests(
43
+ job_status=current_pending_status, hf_repo=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
44
+ )
45
  logging.info("Got eval requests")
46
  eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
47
  logging.info("Sorted eval requests")
 
63
  eval_request = eval_requests[0]
64
  pp.pprint(eval_request)
65
 
 
 
 
 
 
 
 
 
 
66
  run_eval_suite.run_evaluation(
67
  eval_request=eval_request,
68
  local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
69
  results_repo=envs.RESULTS_REPO,
70
+ batch_size=args.batch_size,
71
  device=envs.DEVICE,
72
  no_cache=True,
73
  need_check=not args.publish,
 
77
  else:
78
  eval_request = manage_requests.EvalRequest(
79
  model=args.model,
80
+ model_path=args.model_path,
81
  status=PENDING_STATUS,
82
  precision=args.precision
83
  )
 
88
  eval_request=eval_request,
89
  local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
90
  results_repo=envs.RESULTS_REPO,
91
+ batch_size=args.batch_size,
92
  device=envs.DEVICE,
93
  need_check=not args.publish,
94
+ write_results=args.update,
95
+ limit=args.limit,
96
+ use_vllm=args.use_vllm,
97
+ tensor_parallel_size=args.tensor_parallel_size,
98
  )
99
  logging.info("Reproducibility eval finished")
100
 
 
105
  # Optional arguments
106
  parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
107
  parser.add_argument("--model", type=str, default=None, help="Your Model ID")
108
+ parser.add_argument("--model_path", type=str, default=None, help="Full path of model")
109
  parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
110
  parser.add_argument("--publish", type=bool, default=False, help="whether directly publish the evaluation results on HF")
111
  parser.add_argument("--update", type=bool, default=False, help="whether to update google drive files")
112
+ parser.add_argument("--limit", type=int, default=None, help="Limit on the number of items to process")
113
+ parser.add_argument("--use_vllm", type=bool, default=False, help="Whether to infer with vllm or not")
114
+ parser.add_argument("--tensor_parallel_size", type=int, default=1)
115
+ parser.add_argument("--batch_size", type=int, default=1)
116
 
117
  args = parser.parse_args()
118
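
A side note on the new flags: --use_vllm and the other boolean options are declared with type=bool, and argparse passes the raw string to bool(), so any non-empty value (including "False") enables them. A small self-contained sketch of the added arguments, with illustrative values:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default=None)
    parser.add_argument("--model_path", type=str, default=None)  # new: local checkpoint path
    parser.add_argument("--limit", type=int, default=None)       # new: cap on items to process
    parser.add_argument("--use_vllm", type=bool, default=False)  # new: infer with vLLM
    parser.add_argument("--tensor_parallel_size", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=1)

    # bool("False") is True, so any explicit value turns the flag on.
    args = parser.parse_args(["--model", "microsoft/phi-4", "--use_vllm", "False"])
    print(args.use_vllm)  # True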
 
requirements.txt CHANGED
@@ -27,6 +27,7 @@ google-cloud-aiplatform>=1.38
27
  qwen-vl-utils
28
  vertexai
29
  # git+https://github.com/huggingface/transformers
30
- transformers==4.45.2
31
  together==1.3.0
32
- spacy
 
 
27
  qwen-vl-utils
28
  vertexai
29
  # git+https://github.com/huggingface/transformers
30
+ transformers==4.51.3
31
  together==1.3.0
32
+ spacy
33
+ vllm==0.8.5
src/backend/evaluate_model.py CHANGED
@@ -29,9 +29,14 @@ class Evaluator:
29
  summary_generator (SummaryGenerator): Instance for generating summaries.
30
  eval_model (EvaluationModel): Instance for evaluating summaries.
31
  """
32
- def __init__(self, model, revision, precision, batch_size,
33
- device, no_cache, limit, write_out=True,
34
- output_base_path='logs'):
 
 
 
 
 
35
  """Initializes the Evaluator with the given model and settings.
36
 
37
  Args:
@@ -47,6 +52,7 @@ class Evaluator:
47
  output_base_path (str): Base path for output files.
48
  """
49
  self.model = model
 
50
  self.revision = revision
51
  self.precision = precision
52
  self.batch_size = batch_size
@@ -56,7 +62,7 @@ class Evaluator:
56
  self.write_out = write_out
57
  self.output_base_path = output_base_path
58
  try:
59
- self.summary_generator = SummaryGenerator(model, revision, self.device)
60
  self.eval_model = EvaluationModel(envs.HEM_PATH, self.device)
61
  except Exception as e:
62
  logging.error(f"Error initializing Evaluator: {e}")
@@ -71,26 +77,35 @@ class Evaluator:
71
  dict: A dictionary containing evaluation results.
72
  """
73
  try:
 
 
 
 
 
74
  df = pd.read_csv(envs.DATASET_PATH)
75
- self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv")
 
 
76
 
77
  avg_summary_len = self.summary_generator.avg_length
78
  answer_rate = self.summary_generator.answer_rate
79
 
80
- self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
81
- self.generated_summaries_df)
82
  factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
83
  hallucination_rate = self.eval_model.hallucination_rate
84
 
85
- results = util.format_results(model_name=self.model, revision=self.revision,
86
- precision=self.precision,
87
- factual_consistency_rate=factual_consistency_rate,
88
- hallucination_rate=hallucination_rate,
89
- answer_rate=answer_rate,
90
- avg_summary_len=avg_summary_len)
 
 
 
91
  return results
92
  except FileNotFoundError:
93
- logging.error(f"File not found: {envs.DATASET_PATH}")
94
  raise
95
  except Exception as e:
96
  logging.error(f"Error during evaluation: {e}")
 
29
  summary_generator (SummaryGenerator): Instance for generating summaries.
30
  eval_model (EvaluationModel): Instance for evaluating summaries.
31
  """
32
+ def __init__(
33
+ self, model, revision, precision, batch_size,
34
+ device, no_cache, limit, write_out=True,
35
+ output_base_path='logs',
36
+ model_path=None,
37
+ use_vllm=False,
38
+ tensor_parallel_size=1
39
+ ):
40
  """Initializes the Evaluator with the given model and settings.
41
 
42
  Args:
 
52
  output_base_path (str): Base path for output files.
53
  """
54
  self.model = model
55
+ self.model_path = model_path
56
  self.revision = revision
57
  self.precision = precision
58
  self.batch_size = batch_size
 
62
  self.write_out = write_out
63
  self.output_base_path = output_base_path
64
  try:
65
+ self.summary_generator = SummaryGenerator(model, revision, self.device, model_path=self.model_path, use_vllm=use_vllm, tensor_parallel_size=tensor_parallel_size)
66
  self.eval_model = EvaluationModel(envs.HEM_PATH, self.device)
67
  except Exception as e:
68
  logging.error(f"Error initializing Evaluator: {e}")
 
77
  dict: A dictionary containing evaluation results.
78
  """
79
  try:
80
+ # print(envs.DATA_LEADERBOARD_REPO)
81
+ # snapshot_download(
82
+ # repo_id=envs.DATA_LEADERBOARD_REPO, local_dir=envs.DATA_LEADERBOARD_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
83
+ # )
84
+
85
  df = pd.read_csv(envs.DATASET_PATH)
86
+ if self.limit is not None and self.limit > 0:
87
+ df = df.head(self.limit)
88
+ self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv", batch_size=self.batch_size)
89
 
90
  avg_summary_len = self.summary_generator.avg_length
91
  answer_rate = self.summary_generator.answer_rate
92
 
93
+ self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(self.generated_summaries_df)
 
94
  factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
95
  hallucination_rate = self.eval_model.hallucination_rate
96
 
97
+ results = util.format_results(
98
+ model_name=self.model,
99
+ revision=self.revision,
100
+ precision=self.precision,
101
+ factual_consistency_rate=factual_consistency_rate,
102
+ hallucination_rate=hallucination_rate,
103
+ answer_rate=answer_rate,
104
+ avg_summary_len=avg_summary_len
105
+ )
106
  return results
107
  except FileNotFoundError:
108
+ logging.error(f"File not found: {envs.DATA_LEADERBOARD_NAME}")
109
  raise
110
  except Exception as e:
111
  logging.error(f"Error during evaluation: {e}")
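
Worth noting: the new limit handling only truncates when limit is a positive integer; None or 0 keeps the full dataset. The same guard in isolation, with a made-up DataFrame:

    import pandas as pd

    df = pd.DataFrame({"text": [f"doc {i}" for i in range(100)]})
    limit = 10  # illustrative; wired through from --limit in main_backend.py

    if limit is not None and limit > 0:
        df = df.head(limit)

    print(len(df))  # 10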
src/backend/manage_requests.py CHANGED
@@ -24,6 +24,7 @@ class EvalRequest:
24
  likes: Optional[int] = 0
25
  params: Optional[int] = None
26
  license: Optional[str] = ""
 
27
 
28
  def get_model_args(self):
29
  model_args = f"pretrained={self.model},revision={self.revision}"
@@ -36,8 +37,7 @@ class EvalRequest:
36
  return model_args
37
 
38
 
39
- def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str,
40
- hf_repo: str, local_dir: str):
41
  """Updates a given eval request with its new status on the hub (running, completed, failed,)"""
42
  json_filepath = eval_request.json_filepath
43
 
@@ -65,8 +65,7 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
65
  Returns:
66
  list[EvalRequest]: a list of model info dicts.
67
  """
68
- snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
69
- repo_type="dataset", max_workers=60)
70
  json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
71
 
72
  eval_requests = []
 
24
  likes: Optional[int] = 0
25
  params: Optional[int] = None
26
  license: Optional[str] = ""
27
+ model_path: Optional[str] = None
28
 
29
  def get_model_args(self):
30
  model_args = f"pretrained={self.model},revision={self.revision}"
 
37
  return model_args
38
 
39
 
40
+ def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str, hf_repo: str, local_dir: str):
 
41
  """Updates a given eval request with its new status on the hub (running, completed, failed,)"""
42
  json_filepath = eval_request.json_filepath
43
 
 
65
  Returns:
66
  list[EvalRequest]: a list of model info dicts.
67
  """
68
+ snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
 
69
  json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
70
 
71
  eval_requests = []
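
The only substantive change here is the optional model_path field, which lets the backend load weights from a local directory instead of resolving the Hub model ID. A standalone mirror of just the fields relevant to this commit (the real dataclass carries more metadata; the path below is hypothetical):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class EvalRequest:
        model: str
        status: str = "PENDING"
        precision: str = "float16"
        model_path: Optional[str] = None  # new in this commit

    req = EvalRequest(model="microsoft/phi-4", model_path="/data/checkpoints/phi-4")
    print(req)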
src/backend/model_operations.py CHANGED
@@ -11,7 +11,7 @@ import pandas as pd
11
  import spacy
12
  import litellm
13
  from tqdm import tqdm
14
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForTokenClassification, AutoConfig, Qwen2VLForConditionalGeneration, AutoProcessor
15
  from peft import PeftModel
16
  import torch
17
  import cohere
@@ -19,11 +19,12 @@ from openai import OpenAI
19
  from together import Together
20
  import anthropic
21
  import replicate
22
- # import google.generativeai as genai
23
  import vertexai
24
  from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason
25
  from mistralai import Mistral
26
  from qwen_vl_utils import process_vision_info
 
 
27
 
28
 
29
  import src.backend.util as util
@@ -32,8 +33,7 @@ import src.envs as envs
32
  litellm.set_verbose=True
33
 
34
  # Set up basic configuration for logging
35
- logging.basicConfig(level=logging.INFO,
36
- format='%(asctime)s - %(levelname)s - %(message)s')
37
 
38
  # Load spacy model for word tokenization
39
  nlp = spacy.load("en_core_web_sm")
@@ -66,7 +66,7 @@ class SummaryGenerator:
66
  answer_rate (float): Rate of non-empty summaries.
67
  """
68
 
69
- def __init__(self, model_id, revision, device):
70
  """
71
  Initializes the SummaryGenerator with a model.
72
 
@@ -76,6 +76,7 @@ class SummaryGenerator:
76
  """
77
  self.model_id = model_id
78
  self.model = f"huggingface/{model_id}"
 
79
  self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
80
  self.summaries_df = pd.DataFrame()
81
  self.revision = revision
@@ -86,7 +87,10 @@ class SummaryGenerator:
86
  self.local_model = None
87
  self.local_pipeline = None
88
 
89
- def generate_summaries(self, df, save_path=None):
 
 
 
90
  """Generate summaries for a given DataFrame of source docs.
91
 
92
  Args:
@@ -113,7 +117,6 @@ class SummaryGenerator:
113
  while not _summary:
114
  try:
115
  _summary = self.generate_summary(system_prompt, user_prompt)
116
- # print(f"Finish index {index}")
117
  break
118
  except Exception as e:
119
  if 'Rate limit reached' in str(e):
@@ -142,8 +145,7 @@ class SummaryGenerator:
142
  # Sleep to prevent hitting rate limits too frequently
143
  time.sleep(1)
144
 
145
- self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
146
- columns=["source", "summary", "dataset"])
147
 
148
  if save_path is not None:
149
  print(f'Save summaries to {save_path}')
@@ -164,7 +166,7 @@ class SummaryGenerator:
164
  using_replicate_api = False
165
  replicate_api_models = ['snowflake', 'llama-3.1-405b']
166
  using_pipeline = False
167
- pipeline_models = ['llama-3.1', 'phi-3-mini','falcon-7b', 'phi-3.5', 'mistral-nemo', 'llama-3.3']
168
 
169
  for replicate_api_model in replicate_api_models:
170
  if replicate_api_model in self.model_id.lower():
@@ -405,40 +407,32 @@ class SummaryGenerator:
405
  trust_remote_code=True
406
  )
407
  else:
408
- if 'ragamuffin' in self.model_id.lower():
409
- self.tokenizer = AutoTokenizer.from_pretrained(os.path.join('/home/miaoran', self.model_id))
410
-
411
- else:
412
- self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
413
- print("Tokenizer loaded")
414
- if 'jamba' in self.model_id.lower():
415
- self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id,
416
- torch_dtype=torch.bfloat16,
417
- attn_implementation="flash_attention_2",
418
- device_map="auto",
419
- use_mamba_kernels=False)
420
 
 
 
 
 
 
 
 
 
 
421
  elif 'qwen2-vl' in self.model_id.lower():
422
  self.local_model = Qwen2VLForConditionalGeneration.from_pretrained(
423
  self.model_id, torch_dtype="auto", device_map="auto"
424
  )
425
  self.processor = AutoProcessor.from_pretrained(self.model_id)
426
-
427
- # elif 'ragamuffin' in self.model_id.lower():
428
- # print('Using ragamuffin')
429
- # self.local_model = AutoModelForCausalLM.from_pretrained(os.path.join('/home/miaoran', self.model_id),
430
- # torch_dtype=torch.bfloat16, # forcing bfloat16 for now
431
- # attn_implementation="flash_attention_2")
432
  elif 'olmo' in self.model_id.lower():
433
- self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id)#torch_dtype="auto"
434
-
435
  elif 'qwq-' in self.model_id.lower():
436
  self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype="auto", device_map="auto")
437
-
438
  else:
439
- self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto")#torch_dtype="auto"
440
- # print(self.local_model.device)
441
- print("Local model loaded")
 
442
 
443
  # Using local model/pipeline
444
  if self.local_pipeline:
@@ -502,7 +496,10 @@ class SummaryGenerator:
502
  elif 'qwq' in self.model_id.lower():
503
  input_ids = self.tokenizer([prompt], return_tensors="pt").to(self.device)
504
  else:
505
- input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
 
 
 
506
 
507
  # Generate outputs
508
  if 'granite' in self.model_id.lower():
@@ -513,14 +510,18 @@ class SummaryGenerator:
513
  elif 'qwq' in self.model_id.lower():
514
  outputs = self.local_model.generate(**input_ids, max_new_tokens=512, do_sample=True, temperature=0.01)
515
  else:
516
- with torch.no_grad():
517
- outputs = self.local_model.generate(**input_ids, do_sample=True, max_new_tokens=250, temperature=0.01)#, pad_token_id=self.tokenizer.eos_token_id
 
 
 
 
 
518
  if 'glm' in self.model_id.lower() or 'ragamuffin' in self.model_id.lower() or 'granite' in self.model_id.lower():
519
  outputs = outputs[:, input_ids['input_ids'].shape[1]:]
520
  elif 'qwen2-vl' in self.model_id.lower() or 'qwen2.5' in self.model_id.lower() or 'qwq-' in self.model_id.lower():
521
- outputs = [
522
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(input_ids.input_ids, outputs)
523
- ]
524
 
525
  # Decode outputs
526
  if 'qwen2-vl' in self.model_id.lower():
@@ -530,7 +531,10 @@ class SummaryGenerator:
530
  elif 'olmo' in self.model_id.lower() or 'qwq' in self.model_id.lower():
531
  result = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
532
  else:
533
- result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
534
 
535
  if 'gemma-2' in self.model_id.lower():
536
  result = result.split(user_prompt + '\nmodel')[-1].strip()
@@ -593,13 +597,14 @@ class EvaluationModel:
593
  Args:
594
  model_path (str): Path to the CrossEncoder model.
595
  """
596
- config = AutoConfig.from_pretrained('google/flan-t5-large')
597
- self.model = AutoModelForTokenClassification.from_pretrained(model_path, config=config)
598
  self.device = device
599
  self.model.to(self.device)
600
  self.scores = []
601
  self.factual_consistency_rate = None
602
  self.hallucination_rate = None
 
 
603
 
604
  def predict(self, text_pairs):
605
  """Load LoRA adapters of HHEM and make predictions
@@ -609,20 +614,9 @@ class EvaluationModel:
609
  checkpoint: model ID on Hugging Face
610
  """
611
 
612
- prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
613
-
614
- tokenizer = AutoTokenizer.from_pretrained('t5-base')
615
- inputs = tokenizer(
616
- [prompt.format(text1=pair[0], text2=pair[1]) for pair in text_pairs],
617
- return_tensors='pt', padding='longest').to(self.device)
618
-
619
- self.model.eval()
620
  with torch.no_grad():
621
- output = self.model(**inputs)
622
- logits = output.logits
623
- logits = logits[:,0,:] # get the logits on the first token
624
- logits = torch.softmax(logits, dim=-1)
625
- scores = [round(x, 5) for x in logits[:, 1].tolist()] # list of float
626
  return scores
627
 
628
  def evaluate_hallucination(self, summaries_df):
 
11
  import spacy
12
  import litellm
13
  from tqdm import tqdm
14
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSequenceClassification, AutoConfig, Qwen2VLForConditionalGeneration, AutoProcessor
15
  from peft import PeftModel
16
  import torch
17
  import cohere
 
19
  from together import Together
20
  import anthropic
21
  import replicate
 
22
  import vertexai
23
  from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason
24
  from mistralai import Mistral
25
  from qwen_vl_utils import process_vision_info
26
+ from vllm import LLM
27
+ from vllm import SamplingParams
28
 
29
 
30
  import src.backend.util as util
 
33
  litellm.set_verbose=True
34
 
35
  # Set up basic configuration for logging
36
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
37
 
38
  # Load spacy model for word tokenization
39
  nlp = spacy.load("en_core_web_sm")
 
66
  answer_rate (float): Rate of non-empty summaries.
67
  """
68
 
69
+ def __init__(self, model_id, revision, device, model_path=None, use_vllm=False, tensor_parallel_size=1):
70
  """
71
  Initializes the SummaryGenerator with a model.
72
 
 
76
  """
77
  self.model_id = model_id
78
  self.model = f"huggingface/{model_id}"
79
+ self.model_path = model_path
80
  self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
81
  self.summaries_df = pd.DataFrame()
82
  self.revision = revision
 
87
  self.local_model = None
88
  self.local_pipeline = None
89
 
90
+ self.use_vllm = use_vllm
91
+ self.tensor_parallel_size = tensor_parallel_size
92
+
93
+ def generate_summaries(self, df, save_path=None, batch_size=1):
94
  """Generate summaries for a given DataFrame of source docs.
95
 
96
  Args:
 
117
  while not _summary:
118
  try:
119
  _summary = self.generate_summary(system_prompt, user_prompt)
 
120
  break
121
  except Exception as e:
122
  if 'Rate limit reached' in str(e):
 
145
  # Sleep to prevent hitting rate limits too frequently
146
  time.sleep(1)
147
 
148
+ self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)), columns=["source", "summary", "dataset"])
 
149
 
150
  if save_path is not None:
151
  print(f'Save summaries to {save_path}')
 
166
  using_replicate_api = False
167
  replicate_api_models = ['snowflake', 'llama-3.1-405b']
168
  using_pipeline = False
169
+ pipeline_models = ['llama-3.1', 'phi-3-mini','falcon-7b', 'phi-3.5', 'mistral-nemo', 'llama-3.3', 'phi-4']
170
 
171
  for replicate_api_model in replicate_api_models:
172
  if replicate_api_model in self.model_id.lower():
 
407
  trust_remote_code=True
408
  )
409
  else:
410
+ print(f"loading tokenizer from {self.model_path or self.model_id}")
411
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_path or self.model_id, trust_remote_code=True)
 
 
 
 
 
 
 
 
 
 
412
 
413
+ print(f"loading model from {self.model_path or self.model_id}")
414
+ if 'jamba' in self.model_id.lower():
415
+ self.local_model = AutoModelForCausalLM.from_pretrained(
416
+ self.model_id,
417
+ torch_dtype=torch.bfloat16,
418
+ attn_implementation="flash_attention_2",
419
+ device_map="auto",
420
+ use_mamba_kernels=False
421
+ )
422
  elif 'qwen2-vl' in self.model_id.lower():
423
  self.local_model = Qwen2VLForConditionalGeneration.from_pretrained(
424
  self.model_id, torch_dtype="auto", device_map="auto"
425
  )
426
  self.processor = AutoProcessor.from_pretrained(self.model_id)
 
 
 
 
 
 
427
  elif 'olmo' in self.model_id.lower():
428
+ self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id)
 
429
  elif 'qwq-' in self.model_id.lower():
430
  self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype="auto", device_map="auto")
 
431
  else:
432
+ if self.use_vllm:
433
+ self.local_model = LLM(self.model_path or self.model_id, tensor_parallel_size=self.tensor_parallel_size, trust_remote_code=True)
434
+ else:
435
+ self.local_model = AutoModelForCausalLM.from_pretrained(self.model_path or self.model_id, trust_remote_code=True, device_map="auto")
436
 
437
  # Using local model/pipeline
438
  if self.local_pipeline:
 
496
  elif 'qwq' in self.model_id.lower():
497
  input_ids = self.tokenizer([prompt], return_tensors="pt").to(self.device)
498
  else:
499
+ if self.use_vllm:
500
+ input_ids = [prompt]
501
+ else:
502
+ input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
503
 
504
  # Generate outputs
505
  if 'granite' in self.model_id.lower():
 
510
  elif 'qwq' in self.model_id.lower():
511
  outputs = self.local_model.generate(**input_ids, max_new_tokens=512, do_sample=True, temperature=0.01)
512
  else:
513
+ if self.use_vllm:
514
+ sampling_params = SamplingParams(temperature=0.01, max_tokens=250)
515
+ outputs = self.local_model.generate(input_ids, sampling_params, use_tqdm=False)
516
+ else:
517
+ with torch.no_grad():
518
+ outputs = self.local_model.generate(**input_ids, do_sample=True, max_new_tokens=250, temperature=0.01)#, pad_token_id=self.tokenizer.eos_token_id
519
+
520
  if 'glm' in self.model_id.lower() or 'ragamuffin' in self.model_id.lower() or 'granite' in self.model_id.lower():
521
  outputs = outputs[:, input_ids['input_ids'].shape[1]:]
522
  elif 'qwen2-vl' in self.model_id.lower() or 'qwen2.5' in self.model_id.lower() or 'qwq-' in self.model_id.lower():
523
+ if not self.use_vllm:
524
+ outputs = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(input_ids.input_ids, outputs)]
 
525
 
526
  # Decode outputs
527
  if 'qwen2-vl' in self.model_id.lower():
 
531
  elif 'olmo' in self.model_id.lower() or 'qwq' in self.model_id.lower():
532
  result = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
533
  else:
534
+ if self.use_vllm:
535
+ result = outputs[0].outputs[0].text
536
+ else:
537
+ result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
538
 
539
  if 'gemma-2' in self.model_id.lower():
540
  result = result.split(user_prompt + '\nmodel')[-1].strip()
 
597
  Args:
598
  model_path (str): Path to the CrossEncoder model.
599
  """
600
+ self.model = AutoModelForSequenceClassification.from_pretrained(model_path, trust_remote_code=True)
 
601
  self.device = device
602
  self.model.to(self.device)
603
  self.scores = []
604
  self.factual_consistency_rate = None
605
  self.hallucination_rate = None
606
+
607
+ self.model.eval()
608
 
609
  def predict(self, text_pairs):
610
  """Load LoRA adapters of HHEM and make predictions
 
614
  checkpoint: model ID on Hugging Face
615
  """
616
 
 
 
 
 
 
 
 
 
617
  with torch.no_grad():
618
+ output = self.model.predict(text_pairs)
619
+ scores = output.tolist()
 
 
 
620
  return scores
621
 
622
  def evaluate_hallucination(self, summaries_df):
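
The core of this commit is the optional vLLM path in SummaryGenerator (alongside switching the HHEM scorer to vectara/hallucination_evaluation_model loaded via AutoModelForSequenceClassification with trust_remote_code and its predict() helper). Stripped of the surrounding branching, the vLLM usage reduces to roughly the sketch below; the model name is illustrative, and actually running it requires a GPU plus vllm==0.8.5 from requirements.txt:

    from vllm import LLM, SamplingParams

    llm = LLM("microsoft/phi-4", tensor_parallel_size=1, trust_remote_code=True)
    sampling_params = SamplingParams(temperature=0.01, max_tokens=250)

    prompts = ["Provide a concise summary of the following passage: ..."]
    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)

    # Each output carries the generations for one prompt.
    print(outputs[0].outputs[0].text)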
src/backend/run_eval_suite.py CHANGED
@@ -8,14 +8,15 @@ from src.backend.manage_requests import EvalRequest
8
  from src.backend.evaluate_model import Evaluator
9
 
10
  # Configure logging
11
- logging.basicConfig(level=logging.INFO,
12
- format='%(asctime)s - %(levelname)s - %(message)s')
13
  logging.getLogger("openai").setLevel(logging.WARNING)
14
 
15
 
16
- def run_evaluation(eval_request: EvalRequest, batch_size, device,
17
- local_dir: str, results_repo: str, no_cache=True, limit=None,
18
- need_check=True, write_results=False):
 
 
19
  """
20
  Run the evaluation for a given model and upload the results.
21
 
@@ -32,21 +33,20 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
32
  Returns:
33
  dict: A dictionary containing evaluation results.
34
  """
35
- if limit:
36
  logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
37
 
38
  output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
39
- # if os.path.exists(output_folder):
40
- # f_name = os.listdir(output_folder)[-1]
41
- # print(f"Loading results from {os.path.join(output_folder, f_name)}")
42
- # results = json.loads(os.path.join(output_folder, f_name))
43
- # dumped = json.dumps(results, indent=2)
44
- # logging.info(dumped)
45
- # else:
46
  try:
47
- evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
48
- batch_size, device, no_cache, limit, write_out=True,
49
- output_base_path='logs')
 
 
 
 
 
 
50
  results = evaluator.evaluate()
51
  if write_results:
52
  evaluator.write_results()
@@ -67,8 +67,7 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
67
  dumped = json.dumps(results, indent=2)
68
  logging.info(dumped)
69
 
70
- output_path = os.path.join(output_folder,
71
- f"results_{datetime.now()}.json") #
72
  os.makedirs(output_folder, exist_ok=True)
73
  with open(output_path, "w") as f:
74
  f.write(dumped)
 
8
  from src.backend.evaluate_model import Evaluator
9
 
10
  # Configure logging
11
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
12
  logging.getLogger("openai").setLevel(logging.WARNING)
13
 
14
 
15
+ def run_evaluation(
16
+ eval_request: EvalRequest, batch_size, device,
17
+ local_dir: str, results_repo: str, no_cache=True, limit=None,
18
+ need_check=True, write_results=False, use_vllm=False, tensor_parallel_size=1,
19
+ ):
20
  """
21
  Run the evaluation for a given model and upload the results.
22
 
 
33
  Returns:
34
  dict: A dictionary containing evaluation results.
35
  """
36
+ if limit is not None and limit > 0:
37
  logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
38
 
39
  output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
 
 
 
 
 
 
 
40
  try:
41
+ evaluator = Evaluator(
42
+ eval_request.model, eval_request.revision, eval_request.precision,
43
+ batch_size, device, no_cache, limit, write_out=True,
44
+ output_base_path='logs',
45
+ model_path=eval_request.model_path,
46
+ use_vllm=use_vllm,
47
+ tensor_parallel_size=tensor_parallel_size
48
+ )
49
+
50
  results = evaluator.evaluate()
51
  if write_results:
52
  evaluator.write_results()
 
67
  dumped = json.dumps(results, indent=2)
68
  logging.info(dumped)
69
 
70
+ output_path = os.path.join(output_folder, f"results_{datetime.now()}.json") #
 
71
  os.makedirs(output_folder, exist_ok=True)
72
  with open(output_path, "w") as f:
73
  f.write(dumped)
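
One practical note on the output naming kept above: str(datetime.now()) contains spaces and colons, so the resulting results_<timestamp>.json file name needs quoting when handled in a shell. A minimal sketch of the naming (folder path illustrative):

    import os
    from datetime import datetime

    output_folder = os.path.join("eval-results-bk", "microsoft", "phi-4")
    os.makedirs(output_folder, exist_ok=True)

    output_path = os.path.join(output_folder, f"results_{datetime.now()}.json")
    print(output_path)  # e.g. eval-results-bk/microsoft/phi-4/results_2025-01-01 12:00:00.000000.json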
src/display/about.py CHANGED
@@ -10,12 +10,10 @@ class Task:
10
 
11
  class Tasks(Enum):
12
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
13
- hallucination_rate = Task("hallucination_rate",
14
- "hallucination_rate", "Hallucination Rate (%)")
15
  factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
16
  answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
17
- average_summary_length = Task("average_summary_length",
18
- "average_summary_length", "Average Summary Length")
19
 
20
 
21
  # Your leaderboard name
 
10
 
11
  class Tasks(Enum):
12
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
13
+ hallucination_rate = Task("hallucination_rate", "hallucination_rate", "Hallucination Rate (%)")
 
14
  factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
15
  answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
16
+ average_summary_length = Task("average_summary_length", "average_summary_length", "Average Summary Length")
 
17
 
18
 
19
  # Your leaderboard name
src/display/utils.py CHANGED
@@ -1,4 +1,4 @@
1
- from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
 
4
  import pandas as pd
@@ -21,13 +21,19 @@ class ColumnContent:
21
  never_hidden: bool = False
22
  dummy: bool = False
23
 
 
 
 
 
 
 
 
 
24
  ## Leaderboard columns
25
  auto_eval_column_dict = []
26
  # Init
27
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
28
- ColumnContent("T", "str", True, never_hidden=True)])
29
- auto_eval_column_dict.append(["model", ColumnContent,
30
- ColumnContent("Model", "markdown", True, never_hidden=True)])
31
  for task in Tasks:
32
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
33
 
 
1
+ from dataclasses import dataclass, make_dataclass, field
2
  from enum import Enum
3
 
4
  import pandas as pd
 
21
  never_hidden: bool = False
22
  dummy: bool = False
23
 
24
+ def __hash__(self) -> int:
25
+ import time
26
+ import random
27
+ seed = hash(self.name) + hash(self.type) + hash(self.displayed_by_default) + \
28
+ hash(self.hidden) + hash(self.never_hidden) + hash(self.dummy) + \
29
+ hash(time.time()) + random.randint(0, 10000)
30
+ return seed
31
+
32
  ## Leaderboard columns
33
  auto_eval_column_dict = []
34
  # Init
35
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
36
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 
 
37
  for task in Tasks:
38
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
39
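
Why the custom __hash__ is needed at all: a plain @dataclass with eq=True (the default) and frozen=False sets __hash__ to None, so ColumnContent instances become unhashable, and the dataclass machinery rejects unhashable values used as field defaults (as in the auto_eval_column_dict entries above, which presumably feed make_dataclass). Defining __hash__ explicitly restores hashability; the time/random seed makes it non-deterministic, and a deterministic field-based hash would pass the same check. A minimal illustration:

    from dataclasses import dataclass

    @dataclass
    class Plain:
        name: str

    @dataclass
    class Hashable:
        name: str
        def __hash__(self) -> int:  # deterministic alternative to the time/random seed
            return hash(self.name)

    print(Plain.__hash__)       # None: eq=True without frozen=True drops __hash__
    print(hash(Hashable("T")))  # works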
 
src/envs.py CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
6
  # replace this with our token
7
  TOKEN = os.environ.get("HF_TOKEN", None)
8
 
9
- OWNER = "vectara"
10
  REPO_ID = f"{OWNER}/leaderboard"
11
  QUEUE_REPO = f"{OWNER}/requests"
12
  RESULTS_REPO = f"{OWNER}/results"
@@ -20,13 +20,18 @@ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
20
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
21
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
22
 
 
 
 
 
23
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
24
  API = HfApi(token=TOKEN)
25
 
26
  LEADERBOARD_DATASET_PATH = "leaderboard_results/leaderboard_summaries.csv"
27
  DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
28
  SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
29
- HEM_PATH = 'vectara/HHEM-2.1'
 
30
 
31
  SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
32
  USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
 
6
  # replace this with our token
7
  TOKEN = os.environ.get("HF_TOKEN", None)
8
 
9
+ OWNER = "airlsyn"
10
  REPO_ID = f"{OWNER}/leaderboard"
11
  QUEUE_REPO = f"{OWNER}/requests"
12
  RESULTS_REPO = f"{OWNER}/results"
 
20
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
21
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
22
 
23
+ DATA_LEADERBOARD_REPO = f"{OWNER}/leaderboard_dataset"
24
+ DATA_LEADERBOARD_PATH = os.path.join(CACHE_PATH, "leaderboard-bk")
25
+ DATA_LEADERBOARD_NAME = os.path.join(DATA_LEADERBOARD_PATH, "leaderboard_dataset_16k.csv")
26
+
27
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
28
  API = HfApi(token=TOKEN)
29
 
30
  LEADERBOARD_DATASET_PATH = "leaderboard_results/leaderboard_summaries.csv"
31
  DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
32
  SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
33
+ # HEM_PATH = 'vectara/HHEM-2.1'
34
+ HEM_PATH = 'vectara/hallucination_evaluation_model'
35
 
36
  SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
37
  USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
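
The new DATA_LEADERBOARD_* constants point at a dataset repo under the new OWNER and a local cache directory (the leaderboard-bk/ folder also added to .gitignore above). How the three values compose, with an illustrative CACHE_PATH:

    import os

    CACHE_PATH = os.getcwd()  # illustrative; src/envs.py derives its own cache path
    OWNER = "airlsyn"

    DATA_LEADERBOARD_REPO = f"{OWNER}/leaderboard_dataset"
    DATA_LEADERBOARD_PATH = os.path.join(CACHE_PATH, "leaderboard-bk")
    DATA_LEADERBOARD_NAME = os.path.join(DATA_LEADERBOARD_PATH, "leaderboard_dataset_16k.csv")

    print(DATA_LEADERBOARD_REPO)
    print(DATA_LEADERBOARD_NAME)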
src/leaderboard/read_evals.py CHANGED
@@ -104,7 +104,7 @@ class EvalResult:
104
 
105
  data_dict = {
106
  "eval_name": self.eval_name, # not a column, just a save name,
107
- # utils.AutoEvalColumn.precision.name: self.precision.value.name,
108
  utils.AutoEvalColumn.model_type.name: self.model_type.value.name,
109
  utils.AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
110
  utils.AutoEvalColumn.weight_type.name: self.weight_type.value.name,
@@ -114,7 +114,7 @@ class EvalResult:
114
  # utils.AutoEvalColumn.revision.name: self.revision,
115
  # utils.AutoEvalColumn.license.name: self.license,
116
  # utils.AutoEvalColumn.likes.name: self.likes,
117
- # utils.AutoEvalColumn.params.name: self.num_params,
118
  # utils.AutoEvalColumn.still_on_hub.name: self.still_on_hub,
119
  }
120
 
@@ -172,8 +172,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
172
  # Store results of same eval together
173
  eval_name = eval_result.eval_name
174
  if eval_name in eval_results.keys():
175
- eval_results[eval_name].results.update({k: v for k, v in
176
- eval_result.results.items() if v is not None})
177
  else:
178
  eval_results[eval_name] = eval_result
179
 
 
104
 
105
  data_dict = {
106
  "eval_name": self.eval_name, # not a column, just a save name,
107
+ # utils.AutoEvalColumn.precision.name: self.precision.value.name,
108
  utils.AutoEvalColumn.model_type.name: self.model_type.value.name,
109
  utils.AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
110
  utils.AutoEvalColumn.weight_type.name: self.weight_type.value.name,
 
114
  # utils.AutoEvalColumn.revision.name: self.revision,
115
  # utils.AutoEvalColumn.license.name: self.license,
116
  # utils.AutoEvalColumn.likes.name: self.likes,
117
+ # utils.AutoEvalColumn.params.name: self.num_params,
118
  # utils.AutoEvalColumn.still_on_hub.name: self.still_on_hub,
119
  }
120
 
 
172
  # Store results of same eval together
173
  eval_name = eval_result.eval_name
174
  if eval_name in eval_results.keys():
175
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
 
176
  else:
177
  eval_results[eval_name] = eval_result
178