Update app.py
app.py CHANGED
@@ -6,10 +6,10 @@ from transformers import AutoTokenizer, AutoModel
 # ✅ Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# ✅ Load tokenizer from local files
+# ✅ Load tokenizer from the local files in the same repo
 tokenizer = AutoTokenizer.from_pretrained(".")
 
-# ✅ Define model
+# ✅ Define the model architecture
 class ScoringModel(nn.Module):
     def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
         super().__init__()
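Note: the diff collapses the body of `ScoringModel` (lines 16-26 are unchanged and hidden), but line 27's averaged `self.classifier(self.dropoutN(hidden))` expression implies a multi-sample-dropout head on top of the DeBERTa encoder. A minimal sketch of what the hidden part plausibly looks like, assuming first-token pooling and a single-logit linear head — everything outside `__init__`'s signature and line 27 is an assumption, not the committed code:

# Hedged reconstruction: only __init__'s signature and the three-way average
# on line 27 come from the diff; attribute names and pooling are assumed.
import torch
import torch.nn as nn
from transformers import AutoModel

class ScoringModel(nn.Module):
    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
        super().__init__()
        self.base = AutoModel.from_pretrained(base_model_name)  # assumed attribute name
        hidden_size = self.base.config.hidden_size              # 768 for deberta-v3-small
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_size, 1)             # one quality logit

    def forward(self, input_ids, attention_mask):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden = out.last_hidden_state[:, 0]                    # assumed first-token pooling
        # Multi-sample dropout: average three dropout views through one shared head
        logits = (self.classifier(self.dropout1(hidden)) +
                  self.classifier(self.dropout2(hidden)) +
                  self.classifier(self.dropout3(hidden))) / 3
        return logits

The assumed `forward` accepts only `input_ids` and `attention_mask`, which is consistent with the `token_type_ids` fix later in this same commit.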
@@ -27,31 +27,35 @@ class ScoringModel(nn.Module):
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
 
-# ✅ Load model
+# ✅ Load the fine-tuned model weights
 model = ScoringModel()
 model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
 model = model.to(device)
 model.eval()
 
-# ✅ Streamlit UI
+# ✅ Streamlit UI setup
 st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📊", layout="wide")
 st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
 st.markdown("---")
 
+# ✅ Sidebar info
 with st.sidebar:
     st.header("ℹ️ About")
     st.markdown("""
     This app evaluates *which AI response is better* given a prompt.
 
-    - Enter a **prompt** and two **responses
-    - The model predicts **which
+    - Enter a **prompt** and two **responses**
+    - The model predicts **which one is higher quality**
 
-    Powered by
+    Powered by a fine-tuned **DeBERTa-v3-small** model 🚀
     """)
 
+# ✅ Main input form
 col1, col2 = st.columns(2)
+
 with col1:
     prompt = st.text_area("📝 Enter the Prompt", height=150)
+
 with col2:
     st.markdown("<br>", unsafe_allow_html=True)
     st.markdown("📝 Provide two possible responses below:")
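One caveat on the loading step above: `load_state_dict` with the default `strict=True` raises as soon as the class definition in app.py drifts from the architecture that produced `scoring_model.pt`. A small diagnostic sketch, not part of the commit, for debugging such mismatches:

# Not in the commit: inspect which parameter names fail to match when the
# checkpoint and the class definition disagree.
state = torch.load("scoring_model.pt", map_location=device)
result = model.load_state_dict(state, strict=False)
print("missing from checkpoint:", result.missing_keys)
print("unexpected in checkpoint:", result.unexpected_keys)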
@@ -59,30 +63,44 @@ with col2:
     response_a = st.text_area("✏️ Response A", height=100)
     response_b = st.text_area("✏️ Response B", height=100)
 
+# ✅ Button logic
 if st.button("🚀 Evaluate Responses"):
     if prompt and response_a and response_b:
+        # Create formatted inputs
         text_a = f"Prompt: {prompt} [SEP] {response_a}"
         text_b = f"Prompt: {prompt} [SEP] {response_b}"
 
+        # Tokenize
         encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
         encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
 
-
-
-
+        # Only keep required inputs (fixes token_type_ids error)
+        encoded_a = {
+            "input_ids": encoded_a["input_ids"].to(device),
+            "attention_mask": encoded_a["attention_mask"].to(device)
+        }
+        encoded_b = {
+            "input_ids": encoded_b["input_ids"].to(device),
+            "attention_mask": encoded_b["attention_mask"].to(device)
+        }
+
+        # Run prediction
         with torch.no_grad():
             score_a = model(**encoded_a).squeeze()
             score_b = model(**encoded_b).squeeze()
 
+        # Convert to probability
         prob_a = torch.sigmoid(score_a).item()
        prob_b = torch.sigmoid(score_b).item()
 
+        # Display result
         st.subheader("🔮 Prediction Result")
         if prob_b > prob_a:
             st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
         else:
             st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")
 
+        # Metrics and bar chart
         mcol1, mcol2 = st.columns(2)
         mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
         mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
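The added dictionary block is the functional core of this commit. DeBERTa tokenizers return `token_type_ids` alongside `input_ids` and `attention_mask`, and a custom `forward` that does not declare that parameter makes `model(**encoded_a)` fail with a `TypeError` about an unexpected keyword argument — the "token_type_ids error" the new comment refers to. Rebuilding the dicts with only the two required tensors (moving them to `device` in the same step) resolves it. A more generic filtering sketch, equivalent to the committed code under that assumption:

# Sketch only; the commit spells the dicts out explicitly, which works just as well.
ALLOWED_KEYS = ("input_ids", "attention_mask")

def to_model_inputs(encoded, device):
    # Drop extras such as token_type_ids and move the rest to the target device
    return {k: v.to(device) for k, v in encoded.items() if k in ALLOWED_KEYS}

encoded_a = to_model_inputs(encoded_a, device)
encoded_b = to_model_inputs(encoded_b, device)

A related, equally hedged alternative for the formatting step: calling `tokenizer(prompt, response, ...)` as a text pair would insert the model's actual separator tokens, instead of relying on the literal string `[SEP]` inside the f-string surviving tokenization.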
|