re: two models
app.py CHANGED
@@ -12,19 +12,20 @@ from keras.layers import Layer
 from transformers import DebertaV2Tokenizer, TFAutoModel
 import streamlit as st
 from google import genai
-import pandas as pd
-
 torch.classes.__path__ = []
 # Download tokenizer data once
 nltk.download('punkt', quiet=True)
 
 # === Cleaning Function ===
 def clean_response(text: str) -> str:
+    # Simple markdown cleaner
     text = re.sub(r"[*_`#>\-\[\]()]", "", text)
     text = re.sub(r"\s+", " ", text)
     return text.strip()
 
-# ===
+# === Gemini API ===
+
+
 def get_response_from_gemini(prompt: str, key) -> str:
     gemini_client = genai.Client(api_key=key)
     response = gemini_client.models.generate_content(
@@ -33,6 +34,7 @@ def get_response_from_gemini(prompt: str, key) -> str:
     )
     return response.text.strip()
 
+# === DeepSeek API ===
 def get_response_from_deepseek(prompt: str, key) -> str:
     response = requests.post(
         url="https://openrouter.ai/api/v1/chat/completions",
@@ -44,36 +46,6 @@ def get_response_from_deepseek(prompt: str, key) -> str:
     )
     return response.json()["choices"][0]["message"]["content"]
 
-def get_response_from_llamafourscout(prompt: str, key) -> str:
-    response = requests.post(
-        url="https://openrouter.ai/api/v1/chat/completions",
-        headers={"Authorization": f"Bearer {key}"},
-        data=json.dumps({
-            "model": "meta-llama/llama-4-scout:free",
-            "messages": [{"role": "user", "content": prompt}]
-        })
-    )
-    return response.json()["choices"][0]["message"]["content"]
-
-def get_response_from_mistralsmall(prompt: str, key) -> str:
-    response = requests.post(
-        url="https://openrouter.ai/api/v1/chat/completions",
-        headers={"Authorization": f"Bearer {key}"},
-        data=json.dumps({
-            "model": "mistralai/mistral-small-3.1-24b-instruct:free",
-            "messages": [{"role": "user", "content": prompt}]
-        })
-    )
-    return response.json()["choices"][0]["message"]["content"]
-
-# === Model Function Mapping ===
-MODEL_MAP = {
-    "Gemini": get_response_from_gemini,
-    "DeepSeek": get_response_from_deepseek,
-    "LLaMA 4 Scout": get_response_from_llamafourscout,
-    "Mistral Small": get_response_from_mistralsmall,
-}
-
 # === Metrics ===
 def calculate_entropy(text: str) -> float:
     try:
@@ -90,7 +62,8 @@ def calculate_entropy(text: str) -> float:
 def calculate_ttr(text: str) -> float:
     try:
         tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
-        return len(set(tokens
+        return len(set(tokens
+        )) / len(tokens) if tokens else -999999
     except:
         return -999999
 
@@ -146,13 +119,10 @@ def preprocess_inputs(prompt: str, response_a: str, response_b: str, tokenizer,
     ]).reshape(1, -1).astype(np.float32)
     return encoded["input_ids"], encoded["attention_mask"], metrics
 
-# === History Buffer ===
-if "history" not in st.session_state:
-    st.session_state.history = []
-
 # === Streamlit UI ===
 st.set_page_config(page_title="LMSYS Demo", layout="wide")
 
+# Optional styling (vintage theme)
 st.markdown(
     """
     <style>
@@ -182,45 +152,42 @@ st.markdown(
 st.title("Predicting Human Preference : Gemini vs DeepSeek")
 st.write("As part of this demo, we make use of two SOTA LLMs : [Gemini 2.5 Pro](https://deepmind.google/technologies/gemini/pro/) and [DeepSeek R1](https://api-docs.deepseek.com/news/news250120) and make them compete against each other on a given prompt (to be entered through the sidebar)")
 st.write("Using our proposed hybrid model, we predict which response is more suited to be preferred by a human user.")
-
 st.sidebar.title("Ask a Question!")
-model_choices = list(MODEL_MAP.keys())
-model_a_name = st.sidebar.selectbox("Choose Model A", model_choices, index=0)
-model_b_name = st.sidebar.selectbox("Choose Model B", model_choices, index=1)
 question = st.sidebar.text_area("Enter your question:", key="prompt_input")
 
+# Init session state
 if "generated" not in st.session_state:
     st.session_state["generated"] = False
 
+# Generate responses
 if st.sidebar.button("Generate Responses") and question:
     with st.spinner("Generating LLM responses..."):
-        raw_a =
-        raw_b =
-
-        st.session_state
-
-
-
-            "response_b_clean": clean_response(raw_b),
-            "generated": True,
-            "prediction": None,
-            "model_a_name": model_a_name,
-            "model_b_name": model_b_name
-        })
+        raw_a = get_response_from_gemini(question, st.secrets["GEMINI_API_KEY"])
+        raw_b = get_response_from_deepseek(question, st.secrets["OPENROUTER_API_KEY"])
+
+        st.session_state["response_a_raw"] = raw_a
+        st.session_state["response_b_raw"] = raw_b
+        st.session_state["response_a_clean"] = clean_response(raw_a)
+        st.session_state["response_b_clean"] = clean_response(raw_b)
 
+        st.session_state["generated"] = True
+        st.session_state["prediction"] = None
+
+# Display and interact
 if st.session_state["generated"]:
-    tab1, tab2, tab3
+    tab1, tab2, tab3 = st.tabs(["Predictions","Model Architecture", "📈 Metric Curves"])
 
     with tab1:
         st.subheader("Model Responses")
         col1, col2 = st.columns(2)
         with col1:
-            st.markdown(
+            st.markdown("#### Gemini")
             st.markdown(st.session_state["response_a_raw"])
         with col2:
-            st.markdown(
+            st.markdown("#### DeepSeek")
            st.markdown(st.session_state["response_b_raw"])
 
+
         if st.button("Predict Winner"):
             with st.spinner("Running model..."):
                 input_ids, attention_mask, num_features = preprocess_inputs(
@@ -231,23 +198,9 @@ if st.session_state["generated"]:
                 )
                 predictions = model.predict([input_ids, attention_mask, num_features], verbose=0)
                 predicted_class = np.argmax(predictions, axis=-1)[0]
-                label_map = {0:
+                label_map = {0: "Gemini!", 1: "DeepSeek!", 2: "Tie!"}
                 st.session_state["prediction"] = label_map[predicted_class]
 
-                # Add to history
-                st.session_state.history.append({
-                    "Prompt": question,
-                    "Model A": st.session_state['model_a_name'],
-                    "Model B": st.session_state['model_b_name'],
-                    "Response A": st.session_state["response_a_raw"],
-                    "Response B": st.session_state["response_b_raw"],
-                    "Prediction": label_map[predicted_class],
-                    "FRES_A": num_features[0][0], "FRES_B": num_features[0][1],
-                    "DC_A": num_features[0][2], "DC_B": num_features[0][3],
-                    "TTR_A": num_features[0][4], "TTR_B": num_features[0][5],
-                    "Entropy_A": num_features[0][6], "Entropy_B": num_features[0][7]
-                })
-
         if st.session_state.get("prediction"):
             st.success(f"🤖 Model Prediction: {st.session_state['prediction']}")
 
@@ -261,54 +214,34 @@
         st.markdown("### RNN")
         col1, col2 = st.columns(2)
         with col1:
-            st.image("images/plots/rnn_baseline_acc.png", caption="Accuracy - RNN",
+            st.image("images/plots/rnn_baseline_acc.png", caption="Accuracy - RNN", use_container_width=True)
         with col2:
-            st.image("images/plots/rnn_baseline_loss.png", caption="Log Loss - RNN",
+            st.image("images/plots/rnn_baseline_loss.png", caption="Log Loss - RNN", use_container_width=True)
 
         st.markdown("### LSTM")
         col1, col2 = st.columns(2)
         with col1:
-            st.image("images/plots/lstm_baseline_acc.png", caption="Accuracy - LSTM",
+            st.image("images/plots/lstm_baseline_acc.png", caption="Accuracy - LSTM", use_container_width=True)
         with col2:
-            st.image("images/plots/lstm_baseline_loss.png", caption="Log Loss - LSTM",
+            st.image("images/plots/lstm_baseline_loss.png", caption="Log Loss - LSTM", use_container_width=True)
 
         st.markdown("### Bi-LSTM")
         col1, col2 = st.columns(2)
         with col1:
-            st.image("images/plots/bilstm_baseline_acc.png", caption="Accuracy - Bi-LSTM",
+            st.image("images/plots/bilstm_baseline_acc.png", caption="Accuracy - Bi-LSTM", use_container_width=True)
         with col2:
-            st.image("images/plots/bilstm_baseline_loss.png", caption="Log Loss - Bi-LSTM",
+            st.image("images/plots/bilstm_baseline_loss.png", caption="Log Loss - Bi-LSTM", use_container_width=True)
 
         st.markdown("### Hybrid (Dual-LSTM)")
         col1, col2 = st.columns(2)
        with col1:
-            st.image("images/plots/duallstm_hybrid_acc.png", caption="Accuracy - Hybrid (Dual-LSTM)",
+            st.image("images/plots/duallstm_hybrid_acc.png", caption="Accuracy - Hybrid (Dual-LSTM)", use_container_width=True)
         with col2:
-            st.image("images/plots/duallstm_hybrid_loss.png", caption="Log Loss - Hybrid (Dual-LSTM)",
+            st.image("images/plots/duallstm_hybrid_loss.png", caption="Log Loss - Hybrid (Dual-LSTM)", use_container_width=True)
 
         st.markdown("### Hybrid (Bi-LSTM)")
         col1, col2 = st.columns(2)
         with col1:
-            st.image("images/plots/bilstm_hybrid_acc.png", caption="Accuracy - Hybrid (Bi-LSTM)",
+            st.image("images/plots/bilstm_hybrid_acc.png", caption="Accuracy - Hybrid (Bi-LSTM)", use_container_width=True)
         with col2:
-            st.image("images/plots/bilstm_hybrid_loss.png", caption="Log Loss - Hybrid (Bi-LSTM)",
-
-    with tab4:
-        st.subheader("History of Predictions")
-        if st.session_state.history:
-            for i, item in enumerate(reversed(st.session_state.history), 1):
-                with st.expander(f"Prediction #{len(st.session_state.history) - i + 1}: {item['Prediction']}"):
-                    st.markdown(f"**Prompt:** {item['Prompt']}")
-                    st.markdown(f"**Model A ({item['Model A']}):**")
-                    st.markdown(item['Response A'])
-                    st.markdown(f"**Model B ({item['Model B']}):**")
-                    st.markdown(item['Response B'])
-                    st.markdown("**Numerical Features:**")
-                    st.json({
-                        "FRES_A": item['FRES_A'], "FRES_B": item['FRES_B'],
-                        "DC_A": item['DC_A'], "DC_B": item['DC_B'],
-                        "TTR_A": item['TTR_A'], "TTR_B": item['TTR_B'],
-                        "Entropy_A": item['Entropy_A'], "Entropy_B": item['Entropy_B']
-                    })
-        else:
-            st.info("No history yet. Run a prediction first.")
+            st.image("images/plots/bilstm_hybrid_loss.png", caption="Log Loss - Hybrid (Bi-LSTM)", use_container_width=True)
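Note on the calculate_ttr change above: joined onto one line, the new return reads "return len(set(tokens)) / len(tokens) if tokens else -999999", so an empty token list now falls back to the same sentinel the except branch uses. A standalone sketch of that type-token-ratio logic, separate from app.py (the sample sentence and expected value are illustrative only):

    # Standalone sketch of the fixed type-token ratio (TTR) from the diff above;
    # mirrors the app's logic, not the full app.py. Needs the same 'punkt' data.
    import nltk
    from nltk.tokenize import word_tokenize

    nltk.download('punkt', quiet=True)

    def calculate_ttr(text: str) -> float:
        try:
            tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
            return len(set(tokens)) / len(tokens) if tokens else -999999
        except:
            return -999999

    print(calculate_ttr("the quick brown fox jumps over the lazy dog the fox"))  # 8 unique / 11 tokens, about 0.73
    print(calculate_ttr(""))  # -999999 sentinel for empty input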
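The rewritten generate block reads both API keys from st.secrets under the names GEMINI_API_KEY and OPENROUTER_API_KEY. Assuming the standard Streamlit secrets location (this file is not part of the commit, and the values are placeholders), the expected entries would look like:

    # .streamlit/secrets.toml (or the Space's secret settings) -- placeholders only
    GEMINI_API_KEY = "your-gemini-api-key"
    OPENROUTER_API_KEY = "your-openrouter-api-key"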
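With the model selectors and MODEL_MAP removed, the prediction labels are hard-coded to the two remaining models plus a tie class. For reference, how the argmax step maps the classifier's three-way output onto those labels (the probability vector below is made up, not a real model output):

    import numpy as np

    # label_map exactly as hard-coded in the diff; predictions is a fabricated
    # softmax output for a batch of one, just to show the argmax -> label step.
    label_map = {0: "Gemini!", 1: "DeepSeek!", 2: "Tie!"}
    predictions = np.array([[0.21, 0.64, 0.15]])
    predicted_class = np.argmax(predictions, axis=-1)[0]
    print(label_map[predicted_class])  # DeepSeek!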