Spaces:

naa142
/

llmfinetune

Sleeping

App Files Files Community

naa142 committed on Apr 30

Commit

8aba9cf

verified ·

1 Parent(s): 1df92de

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -29

app.py CHANGED Viewed

@@ -1,19 +1,18 @@
-# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModel
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# ✅ 1. Load tokenizer from current directory
-tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
-# ✅ 2. Define the model
 class ScoringModel(nn.Module):
-    def _init_(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
-        super()._init_()
         self.base = AutoModel.from_pretrained(base_model_name)
         self.base.gradient_checkpointing_enable()
         self.dropout1 = nn.Dropout(dropout_rate)
@@ -28,37 +27,31 @@ class ScoringModel(nn.Module):
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
-# ✅ 3. Initialize and load weights
 model = ScoringModel()
 model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
 model = model.to(device)
 model.eval()
-# ✅ 4. Setup Streamlit page
 st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
 st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
 st.markdown("---")
-# ✅ Sidebar Info
 with st.sidebar:
     st.header("ℹ️ About")
     st.markdown("""
     This app evaluates *which AI response is better* given a prompt.
-    *How it works:*
-    - You enter a *prompt* and two *responses*.
-    - The model predicts *which response* is of *higher quality*.
-    Powered by a *fine-tuned DeBERTa-v3-small* model 🚀
     """)
-# ✅ Main input section
 col1, col2 = st.columns(2)
 with col1:
     prompt = st.text_area("📝 Enter the Prompt", height=150)
 with col2:
     st.markdown("<br>", unsafe_allow_html=True)
     st.markdown("👉 Provide two possible responses below:")
@@ -66,7 +59,6 @@ with col2:
 response_a = st.text_area("✏️ Response A", height=100)
 response_b = st.text_area("✏️ Response B", height=100)
-# ✅ Evaluation
 if st.button("🔍 Evaluate Responses"):
     if prompt and response_a and response_b:
         text_a = f"Prompt: {prompt} [SEP] {response_a}"
@@ -75,8 +67,8 @@ if st.button("🔍 Evaluate Responses"):
         encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
         encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
-        encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
-        encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
         with torch.no_grad():
             score_a = model(**encoded_a).squeeze()
@@ -85,24 +77,18 @@ if st.button("🔍 Evaluate Responses"):
         prob_a = torch.sigmoid(score_a).item()
         prob_b = torch.sigmoid(score_b).item()
-        # ✅ Nice result display
         st.subheader("🔮 Prediction Result")
         if prob_b > prob_a:
             st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
         else:
             st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")
-        # ✅ Probability metrics in 2 columns
         mcol1, mcol2 = st.columns(2)
         mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
         mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
-        # ✅ Bar chart comparison
         st.markdown("---")
         st.subheader("📊 Confidence Comparison")
         st.bar_chart({"Confidence": [prob_a, prob_b]})
     else:
         st.warning("⚠️ Please fill in *all fields* before evaluating!")

 import streamlit as st
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModel
+# ✅ Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# ✅ Load tokenizer from local files (same repo)
+tokenizer = AutoTokenizer.from_pretrained(".")
+# ✅ Define model class
 class ScoringModel(nn.Module):
+    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
+        super().__init__()
         self.base = AutoModel.from_pretrained(base_model_name)
         self.base.gradient_checkpointing_enable()
         self.dropout1 = nn.Dropout(dropout_rate)
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
+# ✅ Load model
 model = ScoringModel()
 model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
 model = model.to(device)
 model.eval()
+# ✅ Streamlit UI
 st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
 st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
 st.markdown("---")
 with st.sidebar:
     st.header("ℹ️ About")
     st.markdown("""
     This app evaluates *which AI response is better* given a prompt.
+    - Enter a **prompt** and two **responses**.
+    - The model predicts **which response** is better.
+    Powered by *fine-tuned DeBERTa-v3-small* 🚀
     """)
 col1, col2 = st.columns(2)
 with col1:
     prompt = st.text_area("📝 Enter the Prompt", height=150)
 with col2:
     st.markdown("<br>", unsafe_allow_html=True)
     st.markdown("👉 Provide two possible responses below:")
 response_a = st.text_area("✏️ Response A", height=100)
 response_b = st.text_area("✏️ Response B", height=100)
 if st.button("🔍 Evaluate Responses"):
     if prompt and response_a and response_b:
         text_a = f"Prompt: {prompt} [SEP] {response_a}"
         encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
         encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
+        encoded_a = {k: v.to(device) for k, v in encoded_a.items()}
+        encoded_b = {k: v.to(device) for k, v in encoded_b.items()}
         with torch.no_grad():
             score_a = model(**encoded_a).squeeze()
         prob_a = torch.sigmoid(score_a).item()
         prob_b = torch.sigmoid(score_b).item()
         st.subheader("🔮 Prediction Result")
         if prob_b > prob_a:
             st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
         else:
             st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")
         mcol1, mcol2 = st.columns(2)
         mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
         mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
         st.markdown("---")
         st.subheader("📊 Confidence Comparison")
         st.bar_chart({"Confidence": [prob_a, prob_b]})
     else:
         st.warning("⚠️ Please fill in *all fields* before evaluating!")