naa142 committed
Commit 8aba9cf · verified · 1 Parent(s): 1df92de

Update app.py

Files changed (1)
  1. app.py +15 -29
app.py CHANGED
@@ -1,19 +1,18 @@
- # app.py
-
  import streamlit as st
  import torch
  import torch.nn as nn
  from transformers import AutoTokenizer, AutoModel
 
+ # ✅ Device setup
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
- # ✅ 1. Load tokenizer from current directory
- tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
+ # ✅ Load tokenizer from local files (same repo)
+ tokenizer = AutoTokenizer.from_pretrained(".")
 
- # ✅ 2. Define the model
+ # ✅ Define model class
  class ScoringModel(nn.Module):
-     def _init_(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
-         super()._init_()
+     def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
+         super().__init__()
          self.base = AutoModel.from_pretrained(base_model_name)
          self.base.gradient_checkpointing_enable()
          self.dropout1 = nn.Dropout(dropout_rate)
@@ -28,37 +27,31 @@ class ScoringModel(nn.Module):
                    self.classifier(self.dropout3(hidden))) / 3
          return logits
 
- # ✅ 3. Initialize and load weights
+ # ✅ Load model
  model = ScoringModel()
  model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
  model = model.to(device)
  model.eval()
 
- # ✅ 4. Setup Streamlit page
+ # ✅ Streamlit UI
  st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
-
  st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
  st.markdown("---")
 
- # ✅ Sidebar Info
  with st.sidebar:
      st.header("ℹ️ About")
      st.markdown("""
      This app evaluates *which AI response is better* given a prompt.
-
-     *How it works:*
-     - You enter a *prompt* and two *responses*.
-     - The model predicts *which response* is of *higher quality*.
-
-     Powered by a *fine-tuned DeBERTa-v3-small* model 🚀
+
+     - Enter a **prompt** and two **responses**.
+     - The model predicts **which response** is better.
+
+     Powered by *fine-tuned DeBERTa-v3-small* 🚀
      """)
 
- # ✅ Main input section
  col1, col2 = st.columns(2)
-
  with col1:
      prompt = st.text_area("📝 Enter the Prompt", height=150)
-
  with col2:
      st.markdown("<br>", unsafe_allow_html=True)
      st.markdown("👉 Provide two possible responses below:")
@@ -66,7 +59,6 @@ with col2:
      response_a = st.text_area("✍️ Response A", height=100)
      response_b = st.text_area("✍️ Response B", height=100)
 
- # ✅ Evaluation
  if st.button("🔍 Evaluate Responses"):
      if prompt and response_a and response_b:
          text_a = f"Prompt: {prompt} [SEP] {response_a}"
@@ -75,8 +67,8 @@ if st.button("🔍 Evaluate Responses"):
          encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
          encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
 
-         encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
-         encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
+         encoded_a = {k: v.to(device) for k, v in encoded_a.items()}
+         encoded_b = {k: v.to(device) for k, v in encoded_b.items()}
 
          with torch.no_grad():
              score_a = model(**encoded_a).squeeze()
@@ -85,24 +77,18 @@ if st.button("🔍 Evaluate Responses"):
          prob_a = torch.sigmoid(score_a).item()
          prob_b = torch.sigmoid(score_b).item()
 
-         # ✅ Nice result display
          st.subheader("🔮 Prediction Result")
-
          if prob_b > prob_a:
              st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
          else:
              st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")
 
-         # ✅ Probability metrics in 2 columns
          mcol1, mcol2 = st.columns(2)
          mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
          mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
 
-         # ✅ Bar chart comparison
          st.markdown("---")
          st.subheader("📊 Confidence Comparison")
-
          st.bar_chart({"Confidence": [prob_a, prob_b]})
      else:
          st.warning("⚠️ Please fill in *all fields* before evaluating!")
-
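
For reference, below is a minimal standalone sketch of the inference path the updated app.py wires into Streamlit. The middle of ScoringModel (its remaining dropout layers, classifier head, and forward-pass pooling) is not visible in this diff, so the mean-pooled hidden state and the single-logit nn.Linear head are illustrative assumptions, and the example prompt/response strings are placeholders; only the parts shown in the hunks above (the averaged three-dropout classifier output, the tokenizer settings, and the sigmoid over the returned logit) come from the file itself.

# Hedged sketch: score one prompt/response pair outside Streamlit.
# Assumes the Space's tokenizer files and scoring_model.pt sit in the working directory.
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ScoringModel(nn.Module):
    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
        super().__init__()
        self.base = AutoModel.from_pretrained(base_model_name)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)
        # Assumption: a single-logit head; the diff only shows it being averaged over three dropouts.
        self.classifier = nn.Linear(self.base.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        # Assumption: mean-pool the last hidden state over tokens.
        hidden = out.last_hidden_state.mean(dim=1)
        # Matches the "... / 3" line in the diff: average three dropout passes through one classifier.
        logits = (self.classifier(self.dropout1(hidden)) +
                  self.classifier(self.dropout2(hidden)) +
                  self.classifier(self.dropout3(hidden))) / 3
        return logits

tokenizer = AutoTokenizer.from_pretrained(".")
model = ScoringModel()
model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
model = model.to(device)
model.eval()

prompt, response = "Explain overfitting in one sentence.", "Overfitting means memorizing the training data."  # placeholders
text = f"Prompt: {prompt} [SEP] {response}"
encoded = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=186)
# Keep only the tensors the sketched forward() accepts (DeBERTa tokenizers may also emit token_type_ids).
encoded = {k: v.to(device) for k, v in encoded.items() if k in ("input_ids", "attention_mask")}
with torch.no_grad():
    prob = torch.sigmoid(model(**encoded).squeeze()).item()
print(f"Confidence: {prob:.4f}")

Averaging the classifier over several dropout masks (multi-sample dropout) is a common trick for stabilizing a fine-tuned head's logits, which is consistent with the "/ 3" term the hunk preserves.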