PeterKruger committed on
Commit 1a66bbf · verified · 1 Parent(s): 9d9a69a

Update app.py

Files changed (1)
  1. app.py +16 -21
app.py CHANGED
@@ -64,7 +64,7 @@ def retry_api_request(max_retries=3, wait_time=10):
 
 # --- Single model request function for Hugging Face ---
 @retry_api_request()
-def make_hf_request(model_name, messages, temperature, max_tokens):
+def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
     """
     Send request to a Hugging Face model using InferenceClient
 
@@ -73,11 +73,12 @@ def make_hf_request(model_name, messages, temperature, max_tokens):
         messages: Messages in the format [{"role": "user", "content": "..."}]
         temperature: Temperature parameter for generation
         max_tokens: Maximum tokens to generate
+        token: Hugging Face API token
 
     Returns:
         Generated text or None if request fails
     """
-    client = InferenceClient(model=model_name)
+    client = InferenceClient(model=model_name, token=token)
 
     # Convert messages to a prompt string
     prompt = ""
@@ -304,7 +305,6 @@ def rank_answer_prompt(question, answer, topic):
 3: Good answer - clear, relevant to the topic, well-formulated, with correct statements. For creative writing, this includes demonstrating good originality, imagination, and adherence to the prompt, including the 3000-character limit.
 4: Very good answer - very clear, very relevant to the topic, expertly formulated, with highly correct statements. For creative writing, shows strong originality, a compelling narrative or poetic voice, and excellent adherence to the prompt, including the 3000-character limit.
 5: Exceptionally good answer - only appliable to exceptional answers that match all the criteria of the previous "4: Very good answer", but also bring additional unique insights, perfectly sound original arguments, or other exceptional unexpected contributions to the topic. For creative writing, this indicates a truly outstanding piece of writing with exceptional creativity, emotional resonance, and masterful execution, while adhering to the 3000-character limit.
-
 Consider these criteria in your ranking:
 - Clarity: Is the answer easy to understand? Is it ambiguous or confusing?
 - Relevance: Is the answer relevant to the specified topic?
@@ -323,7 +323,6 @@ Consider these criteria in your ranking:
 
     prompt += f"""
 Just return a single number (the rank from 1 to 5), do not add any other text.
-
 Question: {question}
 Answer: {answer}
 Rank:"""
@@ -345,7 +344,6 @@ def rank_question_prompt(question, topic, difficulty):
 3: Good question - clear, relevant to the topic, generally appropriate for the difficulty level, and reasonably well-formulated. For creative writing, the prompt is clear, provides a reasonable starting point for creative work, and sets a clear 3000-character limit.
 4: Very good question - clear, highly relevant to the topic, appropriate for the difficulty level, and well-formulated. For creative writing, the prompt is engaging, sparks imagination, and offers a good balance of direction and freedom, with a clear 3000-character limit.
 5: Excellent question - exceptionally clear, insightful, highly relevant to the topic, perfectly matched to the difficulty level, and expertly formulated. For creative writing, the prompt is exceptionally creative, thought-provoking, and likely to inspire high-quality writing, with a clear 3000-character limit.
-
 Consider these criteria in your ranking:
 - Clarity: Is the question easy to understand? Is it ambiguous or confusing?
 - Relevance: Is the question relevant to the specified topic ({topic})?
@@ -361,7 +359,6 @@ Consider these criteria in your ranking:
 """
     prompt += f"""
 Just return a single number (the rank from 1 to 5), do not add any other text.
-
 Question: {question}
 Rank:"""
     return prompt
@@ -385,14 +382,14 @@ def parse_rank_string(rank_str, ranking_model_id):
     return None
 
 # --- Helper Function for Parallel Ranking ---
-def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, timeout=60):
+def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, token=None, timeout=60):
     start_time = time.time()
     rank = None # Initialize rank to None, indicating potential failure
 
     rank_prompt = rank_answer_prompt(question, answer, topic)
 
     try:
-        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5)
+        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5, token=token)
         if response:
             try:
                 rank_str = response.strip()
@@ -416,14 +413,14 @@ def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures
     return ranking_model_id, rank
 
 # --- Helper Function for Parallel Ranking of questions ---
-def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures, failure_threshold, unresponsive_models, model_config, timeout=60):
+def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures, failure_threshold, unresponsive_models, model_config, token=None, timeout=60):
     start_time = time.time()
     rank = None # Initialize rank to None, indicating potential failure
 
     rank_prompt = rank_question_prompt(question, topic, difficulty) # Use question rank prompt
 
     try:
-        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5)
+        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5, token=token)
         if response:
             try:
                 rank_str = response.strip()
@@ -447,7 +444,7 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty,
     return ranking_model_id, rank
 
 # --- Helper Function for Parallel Answering ---
-def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, timeout=60):
+def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, token=None, timeout=60):
     start_time = time.time() # Start timer
     answer_prompt = answer_question_prompt(question)
     answer = "Error answering" # Default answer
@@ -459,7 +456,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
         max_tok = long_max_tokens
 
     try:
-        response = make_hf_request(model_config[model_id]["name"], [{"role": "user", "content": answer_prompt}], temp, max_tok)
+        response = make_hf_request(model_config[model_id]["name"], [{"role": "user", "content": answer_prompt}], temp, max_tok, token=token)
         if response:
             answer = response.strip()
     except Exception as e:
@@ -475,7 +472,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
     return answer, duration # Return answer and duration
 
 # --- Core Logic ---
-def run_benchmark(hf_models, topics, difficulties, t, model_config):
+def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     results = {
         "model_name": [],
         "topic": [],
@@ -563,7 +560,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
             response = make_hf_request(model_config[question_generator_model_id]["name"],
                                        [{"role": "user", "content": question_prompt}],
                                        question_temp,
-                                       question_max_tokens)
+                                       question_max_tokens,
+                                       token=token)
 
             if response:
                 question = response.strip()
@@ -603,6 +601,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                     failure_threshold,
                     unresponsive_models,
                     model_config,
+                    token,
                     timeout=60
                 )
                 question_ranking_futures.append(future)
@@ -667,6 +666,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                     unresponsive_models,
                     model_config,
                     topic,
+                    token,
                     timeout=60
                 )
                 answer_futures.append(future)
@@ -726,6 +726,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                     unresponsive_models,
                     model_config,
                     topic,
+                    token,
                     timeout=60
                 )
                 ranking_futures.append(future)
@@ -870,12 +871,6 @@ if st.sidebar.button("Start Benchmark"):
     if 'results_df' not in st.session_state:
        st.session_state.results_df = pd.DataFrame()
 
-    # Modify make_hf_request to use the token
-    def make_hf_request_with_token(model_name, messages, temperature, max_tokens):
-        client = InferenceClient(model=model_name, token=hf_token)
-        # Rest of the function is the same...
-        # Return response
-
     # Run the benchmark
     try:
         # Update status
@@ -885,7 +880,7 @@ if st.sidebar.button("Start Benchmark"):
         results, cumulative_avg_rank, total_successful = run_benchmark(
             selected_models, selected_topics,
             ["a very simple", "a simple", "a", "a difficult", "a very difficult"],
-            num_iterations, model_config
+            num_iterations, model_config, hf_token
         )
 
         # Update progress to complete
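
For context, this commit threads the Hugging Face API token from the Streamlit app through run_benchmark and the parallel helper functions down to make_hf_request, replacing the abandoned make_hf_request_with_token stub. Below is a minimal, self-contained sketch of the resulting call path. Only the make_hf_request signature and the InferenceClient(model=..., token=...) construction come from the diff above; the prompt flattening detail and the text_generation call are assumptions about the rest of the function body, and the model name and token value are placeholders.

# Minimal sketch (not the full app.py): how the threaded token could reach the
# Hugging Face client. The text_generation call is an assumption; the signature
# and InferenceClient(model=..., token=...) are what the diff shows.
from huggingface_hub import InferenceClient

def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
    """Send a request to a Hugging Face model using InferenceClient."""
    client = InferenceClient(model=model_name, token=token)

    # Convert messages to a single prompt string (as the diff's comment describes).
    prompt = ""
    for message in messages:
        prompt += f"{message['role']}: {message['content']}\n"

    # Assumed generation call; the real app.py may use a different client method.
    return client.text_generation(
        prompt,
        temperature=temperature,
        max_new_tokens=max_tokens,
    )

if __name__ == "__main__":
    # Hypothetical usage mirroring run_benchmark(..., hf_token).
    hf_token = "hf_xxx"  # placeholder; supply a real token
    reply = make_hf_request(
        "mistralai/Mistral-7B-Instruct-v0.2",  # illustrative model name
        [{"role": "user", "content": "Just return a single number from 1 to 5."}],
        temperature=0.7,
        max_tokens=5,
        token=hf_token,
    )
    print(reply)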