added multi-models again
app.py CHANGED
@@ -7,25 +7,26 @@ from nltk.tokenize import word_tokenize
 import textstat
 import json
 import requests
+import concurrent.futures
+
 import tensorflow as tf
 from keras.layers import Layer
 from transformers import DebertaV2Tokenizer, TFAutoModel
 import streamlit as st
 from google import genai
+import pandas as pd
+
 torch.classes.__path__ = []
 # Download tokenizer data once
 nltk.download('punkt', quiet=True)
 
 # === Cleaning Function ===
 def clean_response(text: str) -> str:
-    # Simple markdown cleaner
     text = re.sub(r"[*_`#>\-\[\]()]", "", text)
     text = re.sub(r"\s+", " ", text)
     return text.strip()
 
-# ===
-
-
+# === Model APIs ===
 def get_response_from_gemini(prompt: str, key) -> str:
     gemini_client = genai.Client(api_key=key)
     response = gemini_client.models.generate_content(
@@ -34,7 +35,6 @@ def get_response_from_gemini(prompt: str, key) -> str:
     )
     return response.text.strip()
 
-# === DeepSeek API ===
 def get_response_from_deepseek(prompt: str, key) -> str:
     response = requests.post(
         url="https://openrouter.ai/api/v1/chat/completions",
@@ -46,6 +46,36 @@ def get_response_from_deepseek(prompt: str, key) -> str:
     )
     return response.json()["choices"][0]["message"]["content"]
 
+def get_response_from_llamafourscout(prompt: str, key) -> str:
+    response = requests.post(
+        url="https://openrouter.ai/api/v1/chat/completions",
+        headers={"Authorization": f"Bearer {key}"},
+        data=json.dumps({
+            "model": "meta-llama/llama-4-scout:free",
+            "messages": [{"role": "user", "content": prompt}]
+        })
+    )
+    return response.json()["choices"][0]["message"]["content"]
+
+def get_response_from_mistralsmall(prompt: str, key) -> str:
+    response = requests.post(
+        url="https://openrouter.ai/api/v1/chat/completions",
+        headers={"Authorization": f"Bearer {key}"},
+        data=json.dumps({
+            "model": "mistralai/mistral-small-3.1-24b-instruct:free",
+            "messages": [{"role": "user", "content": prompt}]
+        })
+    )
+    return response.json()["choices"][0]["message"]["content"]
+
+# === Model Function Mapping ===
+MODEL_MAP = {
+    "Gemini": get_response_from_gemini,
+    "DeepSeek": get_response_from_deepseek,
+    "LLaMA 4 Scout": get_response_from_llamafourscout,
+    "Mistral Small": get_response_from_mistralsmall,
+}
+
 # === Metrics ===
 def calculate_entropy(text: str) -> float:
     try:
@@ -62,8 +92,7 @@ def calculate_entropy(text: str) -> float:
 def calculate_ttr(text: str) -> float:
     try:
         tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
-        return len(set(tokens
-        )) / len(tokens) if tokens else -999999
+        return len(set(tokens)) / len(tokens) if tokens else -999999
     except:
         return -999999
 
@@ -119,10 +148,13 @@ def preprocess_inputs(prompt: str, response_a: str, response_b: str, tokenizer,
     ]).reshape(1, -1).astype(np.float32)
     return encoded["input_ids"], encoded["attention_mask"], metrics
 
+# === History Buffer ===
+if "history" not in st.session_state:
+    st.session_state.history = []
+
 # === Streamlit UI ===
 st.set_page_config(page_title="LMSYS Demo", layout="wide")
 
-# Optional styling (vintage theme)
 st.markdown(
     """
     <style>
@@ -152,28 +184,42 @@ st.markdown(
 st.title("Predicting Human Preference : Gemini vs DeepSeek")
 st.write("As part of this demo, we make use of two SOTA LLMs : [Gemini 2.5 Pro](https://deepmind.google/technologies/gemini/pro/) and [DeepSeek R1](https://api-docs.deepseek.com/news/news250120) and make them compete against each other on a given prompt (to be entered through the sidebar)")
 st.write("Using our proposed hybrid model, we predict which response is more suited to be preferred by a human user.")
+
 st.sidebar.title("Ask a Question!")
+model_choices = list(MODEL_MAP.keys())
+model_a_name = st.sidebar.selectbox("Choose Model A", model_choices, index=0)
+model_b_name = st.sidebar.selectbox("Choose Model B", model_choices, index=1)
 question = st.sidebar.text_area("Enter your question:", key="prompt_input")
 
-# Init session state
 if "generated" not in st.session_state:
     st.session_state["generated"] = False
 
-
-if st.sidebar.button("Generate Responses") and question:
-    with st.spinner("Generating LLM responses..."):
-        raw_a = get_response_from_gemini(question, st.secrets["GEMINI_API_KEY"])
-        raw_b = get_response_from_deepseek(question, st.secrets["OPENROUTER_API_KEY"])
-
-        st.session_state["response_a_raw"] = raw_a
-        st.session_state["response_b_raw"] = raw_b
-        st.session_state["response_a_clean"] = clean_response(raw_a)
-        st.session_state["response_b_clean"] = clean_response(raw_b)
+import concurrent.futures
 
-
-
+if st.sidebar.button("Generate Responses") and question:
+    with st.spinner("Generating LLM responses"):
+
+        def fetch_model_response(model_name):
+            api_key = st.secrets["GEMINI_API_KEY"] if model_name == "Gemini" else st.secrets["OPENROUTER_API_KEY"]
+            return MODEL_MAP[model_name](question, api_key)
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future_a = executor.submit(fetch_model_response, model_a_name)
+            future_b = executor.submit(fetch_model_response, model_b_name)
+            raw_a = future_a.result()
+            raw_b = future_b.result()
+
+        st.session_state.update({
+            "response_a_raw": raw_a,
+            "response_b_raw": raw_b,
+            "response_a_clean": clean_response(raw_a),
+            "response_b_clean": clean_response(raw_b),
+            "generated": True,
+            "prediction": None,
+            "model_a_name": model_a_name,
+            "model_b_name": model_b_name
+        })
 
-# Display and interact
 if st.session_state["generated"]:
     tab1, tab2, tab3 = st.tabs(["Predictions","Model Architecture", "📈 Metric Curves"])
 
@@ -181,13 +227,12 @@ if st.session_state["generated"]:
         st.subheader("Model Responses")
         col1, col2 = st.columns(2)
         with col1:
-            st.markdown("####
+            st.markdown(f"#### {st.session_state['model_a_name']}")
             st.markdown(st.session_state["response_a_raw"])
         with col2:
-            st.markdown("####
+            st.markdown(f"#### {st.session_state['model_b_name']}")
            st.markdown(st.session_state["response_b_raw"])
 
-
         if st.button("Predict Winner"):
             with st.spinner("Running model..."):
                 input_ids, attention_mask, num_features = preprocess_inputs(
@@ -198,9 +243,11 @@ if st.session_state["generated"]:
                 )
                 predictions = model.predict([input_ids, attention_mask, num_features], verbose=0)
                 predicted_class = np.argmax(predictions, axis=-1)[0]
-                label_map = {0: "
+                label_map = {0: f"{st.session_state['model_a_name']}!", 1: f"{st.session_state['model_b_name']}!", 2: "Tie!"}
                 st.session_state["prediction"] = label_map[predicted_class]
 
+
+
         if st.session_state.get("prediction"):
             st.success(f"🤖 Model Prediction: {st.session_state['prediction']}")
 
@@ -245,3 +292,4 @@ if st.session_state["generated"]:
             st.image("images/plots/bilstm_hybrid_acc.png", caption="Accuracy - Hybrid (Bi-LSTM)", use_container_width=True)
         with col2:
             st.image("images/plots/bilstm_hybrid_loss.png", caption="Log Loss - Hybrid (Bi-LSTM)", use_container_width=True)
+
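Note on the pattern this commit introduces: every non-Gemini helper issues the same OpenRouter chat-completions request with only the model slug changing, and the two models picked in the sidebar are fetched through MODEL_MAP with a two-thread executor. Below is a minimal standalone sketch of that pattern, not code from app.py: it assumes a plain OPENROUTER_API_KEY environment variable instead of st.secrets, a generic helper name, a lambda-based registry, and it adds a timeout and raise_for_status as defensive extras that the committed helpers omit.

# Minimal sketch of the MODEL_MAP + concurrent-fetch pattern (illustrative only).
import concurrent.futures
import json
import os

import requests


def get_response_from_openrouter(prompt: str, key: str, model_slug: str) -> str:
    # Same request shape as the new LLaMA/Mistral helpers, parameterised by slug.
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {key}"},
        data=json.dumps({
            "model": model_slug,
            "messages": [{"role": "user", "content": prompt}],
        }),
        timeout=60,  # defensive extra; the committed helpers have no timeout
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


# Registry in the style of MODEL_MAP: display name -> callable(prompt, key).
MODEL_MAP = {
    "LLaMA 4 Scout": lambda p, k: get_response_from_openrouter(p, k, "meta-llama/llama-4-scout:free"),
    "Mistral Small": lambda p, k: get_response_from_openrouter(p, k, "mistralai/mistral-small-3.1-24b-instruct:free"),
}

if __name__ == "__main__":
    key = os.environ["OPENROUTER_API_KEY"]  # assumed env var; app.py reads st.secrets instead
    question = "Explain the difference between precision and recall in two sentences."
    model_a, model_b = "LLaMA 4 Scout", "Mistral Small"

    # Fetch both candidate responses in parallel, as the Streamlit handler does.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_a = executor.submit(MODEL_MAP[model_a], question, key)
        future_b = executor.submit(MODEL_MAP[model_b], question, key)
        raw_a, raw_b = future_a.result(), future_b.result()

    print(f"--- {model_a} ---\n{raw_a}\n\n--- {model_b} ---\n{raw_b}")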