Spaces: Running on Zero

Commit: 665e5a3
Parent(s): 8151596

first pass at async handling

Files changed:
- app.py (+55, -3)
- utils/models.py (+6, -4)
app.py CHANGED

@@ -41,7 +41,7 @@ def load_context():
         show_full
     ]
 
-def generate_model_summaries_with_timeout(example, timeout=30):
+def generate_model_summaries_with_timeout(example, timeout=60):
     """Run model inference in a separate thread with timeout for interruptibility"""
     import threading
     import time
@@ -75,6 +75,7 @@ def generate_model_summaries_with_timeout(example, timeout=30):
     generation_thread.daemon = True
     generation_thread.start()
 
+    # Uncomment this critical waiting code
     start_time = time.time()
     while time.time() - start_time < timeout:
         if generation_interrupt.is_set() or not generation_thread.is_alive() or result["completed"]:
@@ -83,6 +84,50 @@ def generate_model_summaries_with_timeout(example, timeout=30):
 
     return result
 
+async def generate_model_summaries_with_timeout_async(example, timeout=30):
+    """Async version that properly waits for the thread"""
+    import asyncio
+    import threading
+    import time
+
+    result = {
+        "model_a": "",
+        "model_b": "",
+        "summary_a": "",
+        "summary_b": "",
+        "completed": False
+    }
+
+    if generation_interrupt.is_set():
+        return result
+
+    def run_generation():
+        try:
+            m_a_name, m_b_name = random.sample(model_names, 2)
+            s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
+
+            if not generation_interrupt.is_set():
+                result["model_a"] = m_a_name
+                result["model_b"] = m_b_name
+                result["summary_a"] = s_a
+                result["summary_b"] = s_b
+                result["completed"] = True
+        except Exception as e:
+            print(f"Error in generation thread: {e}")
+
+    generation_thread = threading.Thread(target=run_generation)
+    generation_thread.daemon = True
+    generation_thread.start()
+
+    # Use asyncio.sleep instead of time.sleep for async waiting
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        if generation_interrupt.is_set() or not generation_thread.is_alive() or result["completed"]:
+            break
+        await asyncio.sleep(0.1)  # Non-blocking sleep
+
+    return result
+
 def process_generation_result(result):
     """Process the results from the threaded generation function"""
     if not result["completed"]:
@@ -122,6 +167,13 @@ def process_generation_result(result):
         gr.update(interactive=True),
         gr.update(elem_classes=[])
     ]
+async def process_example_async(example):
+    result = await generate_model_summaries_with_timeout_async(example)
+    return process_generation_result(result)
+
+def process_example_sync(example):
+    result = generate_model_summaries_with_timeout(example)
+    return process_generation_result(result)
 
 def select_vote_improved(winner_choice):
     """Updates UI based on vote selection"""
@@ -346,7 +398,7 @@ with gr.Blocks(theme=gr.themes.Default(
         outputs=[current_example, query_display, context_description, context_display,
                  context_toggle_btn, show_full_context]
     ).then(
-        fn=
+        fn=process_example_async,
         inputs=[current_example],
         outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                  selected_winner, feedback_list, show_results_state, results_agg,
@@ -367,7 +419,7 @@ with gr.Blocks(theme=gr.themes.Default(
         outputs=[query_display, context_description, context_display,
                  context_toggle_btn, show_full_context]
     ).then(
-        fn=
+        fn=process_example_sync,
         inputs=[current_example],
         outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
                  selected_winner, feedback_list, show_results_state, results_agg,
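Note on the pattern introduced in app.py: Gradio accepts coroutine functions as event handlers, so `.then(fn=process_example_async, ...)` is awaited by the framework while the blocking model inference runs in a daemon thread and is polled with a non-blocking `asyncio.sleep`. The sketch below isolates that thread-plus-async-polling idea outside the app; `slow_task` and `run_with_timeout` are illustrative names, not part of this Space's code.

import asyncio
import threading
import time

def slow_task(result):
    # Stand-in for blocking model inference running in a worker thread.
    time.sleep(2)
    result["value"] = 42
    result["completed"] = True

async def run_with_timeout(timeout=5.0, poll=0.1):
    result = {"value": None, "completed": False}
    worker = threading.Thread(target=slow_task, args=(result,), daemon=True)
    worker.start()

    start = time.time()
    while time.time() - start < timeout:
        if result["completed"] or not worker.is_alive():
            break
        await asyncio.sleep(poll)  # yields to the event loop instead of blocking it
    return result

if __name__ == "__main__":
    print(asyncio.run(run_with_timeout()))
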
utils/models.py CHANGED

@@ -14,7 +14,10 @@ from .prompts import format_rag_prompt
 models = {
     "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
     "Qwen2.5-3b-Instruct": "qwen/qwen2.5-3b-instruct",  # remove gated for now
-
+    "Llama-3.2-3b-Instruct": "meta-llama/llama-3.2-3b-instruct",
+    "Llama-3.2-1b-Instruct": "meta-llama/llama-3.2-1b-instruct",
+    "Gemma-3-1b-it": "google/gemma-3-1b-it",
+    #"Bitnet-b1.58-2B-4T": "microsoft/bitnet-b1.58-2B-4T",
     #TODO add more models
 }
 
@@ -47,7 +50,6 @@ def generate_summaries(example, model_a_name, model_b_name):
     summary_b = run_inference(models[model_b_name], context_text, question)
     return summary_a, summary_b
 
-
 def run_inference(model_name, context, question):
     """
     Run inference using the specified model.
@@ -55,7 +57,7 @@ def run_inference(model_name, context, question):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     # Load the model and tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
+    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", token=True)
     accepts_sys = (
         "System role not supported" not in tokenizer.chat_template
     )  # Workaround for Gemma
@@ -65,7 +67,7 @@ def run_inference(model_name, context, question):
     tokenizer.pad_token = tokenizer.eos_token
 
     model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.bfloat16, attn_implementation="eager"
+        model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
     ).to(device)
 
     text_input = format_rag_prompt(question, context, accepts_sys)
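Note on the token=True arguments added to run_inference: in transformers, passing token=True to from_pretrained makes the Hub client authenticate with the locally stored access token (from `huggingface-cli login` or the HF_TOKEN environment variable), which is likely needed here because the newly listed Llama 3.2 and Gemma checkpoints are gated repositories. A minimal loading sketch under that assumption, reusing one of the repo ids from the diff above:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/llama-3.2-1b-instruct"  # gated repo id taken from the diff above

# token=True tells the Hub client to use the saved login token for the download.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left", token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, token=True
)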