naa142 committed on
Commit
cc65161
·
verified ·
1 Parent(s): 8aba9cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -10
app.py CHANGED
@@ -6,10 +6,10 @@ from transformers import AutoTokenizer, AutoModel
6
  # ✅ Device setup
7
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8
 
9
- # ✅ Load tokenizer from local files (same repo)
10
  tokenizer = AutoTokenizer.from_pretrained(".")
11
 
12
- # ✅ Define model class
13
  class ScoringModel(nn.Module):
14
  def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
15
  super().__init__()
@@ -27,31 +27,35 @@ class ScoringModel(nn.Module):
27
  self.classifier(self.dropout3(hidden))) / 3
28
  return logits
29
 
30
- # ✅ Load model
31
  model = ScoringModel()
32
  model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
33
  model = model.to(device)
34
  model.eval()
35
 
36
- # ✅ Streamlit UI
37
  st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
38
  st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
39
  st.markdown("---")
40
 
 
41
  with st.sidebar:
42
  st.header("ℹ️ About")
43
  st.markdown("""
44
  This app evaluates *which AI response is better* given a prompt.
45
 
46
- - Enter a **prompt** and two **responses**.
47
- - The model predicts **which response** is better.
48
 
49
- Powered by *fine-tuned DeBERTa-v3-small* 🚀
50
  """)
51
 
 
52
  col1, col2 = st.columns(2)
 
53
  with col1:
54
  prompt = st.text_area("📝 Enter the Prompt", height=150)
 
55
  with col2:
56
  st.markdown("<br>", unsafe_allow_html=True)
57
  st.markdown("👉 Provide two possible responses below:")
@@ -59,30 +63,44 @@ with col2:
59
  response_a = st.text_area("โœ๏ธ Response A", height=100)
60
  response_b = st.text_area("โœ๏ธ Response B", height=100)
61
 
 
62
  if st.button("🔍 Evaluate Responses"):
63
  if prompt and response_a and response_b:
 
64
  text_a = f"Prompt: {prompt} [SEP] {response_a}"
65
  text_b = f"Prompt: {prompt} [SEP] {response_b}"
66
 
 
67
  encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
68
  encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
69
 
70
- encoded_a = {k: v.to(device) for k, v in encoded_a.items()}
71
- encoded_b = {k: v.to(device) for k, v in encoded_b.items()}
72
-
 
 
 
 
 
 
 
 
73
  with torch.no_grad():
74
  score_a = model(**encoded_a).squeeze()
75
  score_b = model(**encoded_b).squeeze()
76
 
 
77
  prob_a = torch.sigmoid(score_a).item()
78
  prob_b = torch.sigmoid(score_b).item()
79
 
 
80
  st.subheader("🔮 Prediction Result")
81
  if prob_b > prob_a:
82
  st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
83
  else:
84
  st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")
85
 
 
86
  mcol1, mcol2 = st.columns(2)
87
  mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
88
  mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
 
6
  # ✅ Device setup
7
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8
 
9
+ # ✅ Load tokenizer from the local files in the same repo
10
  tokenizer = AutoTokenizer.from_pretrained(".")
11
 
12
+ # ✅ Define the model architecture
13
  class ScoringModel(nn.Module):
14
  def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
15
  super().__init__()
 
27
  self.classifier(self.dropout3(hidden))) / 3
28
  return logits
29
 
30
+ # ✅ Load the fine-tuned model weights
31
  model = ScoringModel()
32
  model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
33
  model = model.to(device)
34
  model.eval()
35
 
36
+ # ✅ Streamlit UI setup
37
  st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
38
  st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
39
  st.markdown("---")
40
 
41
+ # ✅ Sidebar info
42
  with st.sidebar:
43
  st.header("ℹ️ About")
44
  st.markdown("""
45
  This app evaluates *which AI response is better* given a prompt.
46
 
47
+ - Enter a **prompt** and two **responses**
48
+ - The model predicts **which one is higher quality**
49
 
50
+ Powered by a fine-tuned **DeBERTa-v3-small** model 🚀
51
  """)
52
 
53
+ # ✅ Main input form
54
  col1, col2 = st.columns(2)
55
+
56
  with col1:
57
  prompt = st.text_area("📝 Enter the Prompt", height=150)
58
+
59
  with col2:
60
  st.markdown("<br>", unsafe_allow_html=True)
61
  st.markdown("👉 Provide two possible responses below:")
 
63
  response_a = st.text_area("โœ๏ธ Response A", height=100)
64
  response_b = st.text_area("โœ๏ธ Response B", height=100)
65
 
66
+ # ✅ Button logic
67
  if st.button("🔍 Evaluate Responses"):
68
  if prompt and response_a and response_b:
69
+ # Create formatted inputs
70
  text_a = f"Prompt: {prompt} [SEP] {response_a}"
71
  text_b = f"Prompt: {prompt} [SEP] {response_b}"
72
 
73
+ # Tokenize
74
  encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
75
  encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
76
 
77
+ # Only keep required inputs (fixes token_type_ids error)
78
+ encoded_a = {
79
+ "input_ids": encoded_a["input_ids"].to(device),
80
+ "attention_mask": encoded_a["attention_mask"].to(device)
81
+ }
82
+ encoded_b = {
83
+ "input_ids": encoded_b["input_ids"].to(device),
84
+ "attention_mask": encoded_b["attention_mask"].to(device)
85
+ }
86
+
87
+ # Run prediction
88
  with torch.no_grad():
89
  score_a = model(**encoded_a).squeeze()
90
  score_b = model(**encoded_b).squeeze()
91
 
92
+ # Convert to probability
93
  prob_a = torch.sigmoid(score_a).item()
94
  prob_b = torch.sigmoid(score_b).item()
95
 
96
+ # Display result
97
  st.subheader("🔮 Prediction Result")
98
  if prob_b > prob_a:
99
  st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
100
  else:
101
  st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")
102
 
103
+ # Metrics and bar chart
104
  mcol1, mcol2 = st.columns(2)
105
  mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
106
  mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")