Update app.py
app.py CHANGED
@@ -64,7 +64,7 @@ def retry_api_request(max_retries=3, wait_time=10):
 
 # --- Single model request function for Hugging Face ---
 @retry_api_request()
-def make_hf_request(model_name, messages, temperature, max_tokens):
+def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
     """
     Send request to a Hugging Face model using InferenceClient
 
@@ -73,11 +73,12 @@ def make_hf_request(model_name, messages, temperature, max_tokens):
         messages: Messages in the format [{"role": "user", "content": "..."}]
         temperature: Temperature parameter for generation
         max_tokens: Maximum tokens to generate
+        token: Hugging Face API token
 
     Returns:
         Generated text or None if request fails
     """
-    client = InferenceClient(model=model_name)
+    client = InferenceClient(model=model_name, token=token)
 
     # Convert messages to a prompt string
     prompt = ""
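The two hunks above only show the new signature and the client construction; the generation call itself sits outside the diff. Purely as a hedged sketch of how the token now reaches the Hugging Face client (assuming the body flattens the chat messages into a prompt and calls InferenceClient.text_generation, which this diff does not confirm):

from huggingface_hub import InferenceClient

def make_hf_request_sketch(model_name, messages, temperature, max_tokens, token=None):
    # token=None keeps the anonymous / cached-credential behaviour; a real token authenticates the call.
    client = InferenceClient(model=model_name, token=token)

    # Flatten chat messages into a single prompt string, as the "# Convert messages..." comment suggests.
    prompt = ""
    for message in messages:
        prompt += f"{message['role']}: {message['content']}\n"

    try:
        # Assumed generation call; the hunk only shows the client being created.
        return client.text_generation(prompt, temperature=temperature, max_new_tokens=max_tokens)
    except Exception:
        return None  # Matches the docstring: None when the request fails (retries come from @retry_api_request).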
@@ -304,7 +305,6 @@ def rank_answer_prompt(question, answer, topic):
 3: Good answer - clear, relevant to the topic, well-formulated, with correct statements. For creative writing, this includes demonstrating good originality, imagination, and adherence to the prompt, including the 3000-character limit.
 4: Very good answer - very clear, very relevant to the topic, expertly formulated, with highly correct statements. For creative writing, shows strong originality, a compelling narrative or poetic voice, and excellent adherence to the prompt, including the 3000-character limit.
 5: Exceptionally good answer - only appliable to exceptional answers that match all the criteria of the previous "4: Very good answer", but also bring additional unique insights, perfectly sound original arguments, or other exceptional unexpected contributions to the topic. For creative writing, this indicates a truly outstanding piece of writing with exceptional creativity, emotional resonance, and masterful execution, while adhering to the 3000-character limit.
-
 Consider these criteria in your ranking:
 - Clarity: Is the answer easy to understand? Is it ambiguous or confusing?
 - Relevance: Is the answer relevant to the specified topic?
@@ -323,7 +323,6 @@ Consider these criteria in your ranking:
 
     prompt += f"""
 Just return a single number (the rank from 1 to 5), do not add any other text.
-
 Question: {question}
 Answer: {answer}
 Rank:"""
@@ -345,7 +344,6 @@ def rank_question_prompt(question, topic, difficulty):
 3: Good question - clear, relevant to the topic, generally appropriate for the difficulty level, and reasonably well-formulated. For creative writing, the prompt is clear, provides a reasonable starting point for creative work, and sets a clear 3000-character limit.
 4: Very good question - clear, highly relevant to the topic, appropriate for the difficulty level, and well-formulated. For creative writing, the prompt is engaging, sparks imagination, and offers a good balance of direction and freedom, with a clear 3000-character limit.
 5: Excellent question - exceptionally clear, insightful, highly relevant to the topic, perfectly matched to the difficulty level, and expertly formulated. For creative writing, the prompt is exceptionally creative, thought-provoking, and likely to inspire high-quality writing, with a clear 3000-character limit.
-
 Consider these criteria in your ranking:
 - Clarity: Is the question easy to understand? Is it ambiguous or confusing?
 - Relevance: Is the question relevant to the specified topic ({topic})?
@@ -361,7 +359,6 @@ Consider these criteria in your ranking:
 """
     prompt += f"""
 Just return a single number (the rank from 1 to 5), do not add any other text.
-
 Question: {question}
 Rank:"""
     return prompt
@@ -385,14 +382,14 @@ def parse_rank_string(rank_str, ranking_model_id):
     return None
 
 # --- Helper Function for Parallel Ranking ---
-def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, timeout=60):
+def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, token=None, timeout=60):
     start_time = time.time()
     rank = None # Initialize rank to None, indicating potential failure
 
     rank_prompt = rank_answer_prompt(question, answer, topic)
 
     try:
-        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5)
+        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5, token=token)
         if response:
             try:
                 rank_str = response.strip()
@@ -416,14 +413,14 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures
     return ranking_model_id, rank
 
 # --- Helper Function for Parallel Ranking of questions ---
-def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures, failure_threshold, unresponsive_models, model_config, timeout=60):
+def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures, failure_threshold, unresponsive_models, model_config, token=None, timeout=60):
     start_time = time.time()
     rank = None # Initialize rank to None, indicating potential failure
 
     rank_prompt = rank_question_prompt(question, topic, difficulty) # Use question rank prompt
 
     try:
-        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5)
+        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5, token=token)
         if response:
             try:
                 rank_str = response.strip()
@@ -447,7 +444,7 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures
     return ranking_model_id, rank
 
 # --- Helper Function for Parallel Answering ---
-def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, timeout=60):
+def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, token=None, timeout=60):
     start_time = time.time() # Start timer
     answer_prompt = answer_question_prompt(question)
     answer = "Error answering" # Default answer
@@ -459,7 +456,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
         max_tok = long_max_tokens
 
     try:
-        response = make_hf_request(model_config[model_id]["name"], [{"role": "user", "content": answer_prompt}], temp, max_tok)
+        response = make_hf_request(model_config[model_id]["name"], [{"role": "user", "content": answer_prompt}], temp, max_tok, token=token)
         if response:
             answer = response.strip()
     except Exception as e:
@@ -475,7 +472,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
     return answer, duration # Return answer and duration
 
 # --- Core Logic ---
-def run_benchmark(hf_models, topics, difficulties, t, model_config):
+def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     results = {
         "model_name": [],
         "topic": [],
@@ -563,7 +560,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
         response = make_hf_request(model_config[question_generator_model_id]["name"],
                                    [{"role": "user", "content": question_prompt}],
                                    question_temp,
-                                   question_max_tokens)
+                                   question_max_tokens,
+                                   token=token)
 
         if response:
             question = response.strip()
@@ -603,6 +601,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                 failure_threshold,
                 unresponsive_models,
                 model_config,
+                token,
                 timeout=60
             )
             question_ranking_futures.append(future)
@@ -667,6 +666,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                 unresponsive_models,
                 model_config,
                 topic,
+                token,
                 timeout=60
             )
             answer_futures.append(future)
@@ -726,6 +726,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                 unresponsive_models,
                 model_config,
                 topic,
+                token,
                 timeout=60
             )
             ranking_futures.append(future)
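In the three hunks above, token is inserted positionally just before timeout=60, which lines up with the updated helper signatures (..., model_config, topic, token=None, timeout=60). The surrounding executor.submit calls are not part of the diff, so the sketch below is a hypothetical reconstruction of one submit site (the executor, the loop, and max_workers are assumptions):

from concurrent.futures import ThreadPoolExecutor

# Hypothetical reconstruction of the submit site inside run_benchmark; only the
# trailing arguments (token, timeout=60) are visible in the hunks above.
with ThreadPoolExecutor(max_workers=8) as executor:      # max_workers is an assumption
    answer_futures = []
    for model_id in hf_models:                            # hf_models comes from run_benchmark's arguments
        future = executor.submit(
            get_answer_from_model,
            model_id,
            question,
            consecutive_failures,
            failure_threshold,
            unresponsive_models,
            model_config,
            topic,
            token,        # new positional argument, forwarded down to make_hf_request
            timeout=60,   # keyword argument delivered to get_answer_from_model via executor.submit
        )
        answer_futures.append(future)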
@@ -870,12 +871,6 @@ if st.sidebar.button("Start Benchmark"):
     if 'results_df' not in st.session_state:
         st.session_state.results_df = pd.DataFrame()
 
-    # Modify make_hf_request to use the token
-    def make_hf_request_with_token(model_name, messages, temperature, max_tokens):
-        client = InferenceClient(model=model_name, token=hf_token)
-        # Rest of the function is the same...
-        # Return response
-
     # Run the benchmark
     try:
         # Update status
@@ -885,7 +880,7 @@ if st.sidebar.button("Start Benchmark"):
         results, cumulative_avg_rank, total_successful = run_benchmark(
             selected_models, selected_topics,
             ["a very simple", "a simple", "a", "a difficult", "a very difficult"],
-            num_iterations, model_config
+            num_iterations, model_config, hf_token
         )
 
         # Update progress to complete
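The last two hunks delete the unfinished make_hf_request_with_token stub and instead hand hf_token straight to run_benchmark, which threads it down to every make_hf_request call. Where hf_token comes from is not shown in this diff; a typical Streamlit pattern, given only as an illustration, would be:

import streamlit as st

# Hypothetical source of hf_token; this sidebar widget is an assumption, not part of this commit.
hf_token = st.sidebar.text_input("Hugging Face API token", type="password") or None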
|