Files changed (1) hide show
  1. app.py +436 -318
app.py CHANGED
@@ -5,22 +5,33 @@ from uuid import uuid4
5
  from datasets import load_dataset
6
  from collections import Counter
7
  import numpy as np
8
- from configs import configs
9
- from clients import backend, logger
10
- from backend.helpers import get_random_session_samples
11
 
12
- dataset = load_dataset("iyosha-huji/stressEval", token=configs.HF_API_TOKEN)["test"]
 
 
13
 
14
- INSTRUCTIONS = """<div align='center'>You are given an audio sample and a question with 2 answer options.\n\nListen to the audio and select the correct answer from the options below.\n\n<b>Note:</b> The question is the same for all samples, but the audio and the corresponding answers change.</div>"""
15
 
16
 
17
- with open(Path(__file__).parent / "data/stage_indices.json") as f:
 
 
 
 
 
 
 
 
 
18
  STAGE_SPLITS = json.load(f)
19
 
20
 
21
  def human_eval_tab():
22
  with gr.Tab(label="Evaluation"):
23
- # ==== State =====
24
  i = gr.State(-1)
25
  selected_answer = gr.State(None)
26
  answers_dict = gr.State({})
@@ -28,6 +39,7 @@ def human_eval_tab():
28
  session_id = gr.State(None)
29
  user_name = gr.State(None)
30
  session_sample_indices = gr.State([])
 
31
 
32
  # === Login UI ===
33
  with gr.Group(visible=True) as login_group:
@@ -50,6 +62,7 @@ def human_eval_tab():
50
  backend, dataset, STAGE_SPLITS, usr, num_samples=15
51
  )
52
  logger.info(f"Session ID: {new_session_id}, Stage: {stage}")
 
53
  return (
54
  True,
55
  gr.update(visible=False),
@@ -57,6 +70,7 @@ def human_eval_tab():
57
  new_session_id,
58
  sample_indices,
59
  usr,
 
60
  )
61
  else:
62
  return (
@@ -66,6 +80,7 @@ def human_eval_tab():
66
  None,
67
  [],
68
  None,
 
69
  )
70
 
71
  # === Login Button ===
@@ -79,6 +94,7 @@ def human_eval_tab():
79
  session_id,
80
  session_sample_indices,
81
  user_name,
 
82
  ],
83
  )
84
 
@@ -99,7 +115,14 @@ def human_eval_tab():
99
  with gr.Row(show_progress=True):
100
  with gr.Column(variant="compact"):
101
  sample_info = gr.Markdown()
102
- gr.Markdown("**Question:**")
 
 
 
 
 
 
 
103
  question_md = gr.Markdown()
104
  radio = gr.Radio(label="Answer:", interactive=True)
105
  with gr.Column(variant="compact"):
@@ -122,82 +145,189 @@ def human_eval_tab():
122
  """
123
  )
124
 
125
- # === Logic ===
126
def update_ui(i, answers, session_sample_indices):
    """Refresh the question widgets for sample position ``i``.

    Returns one value per output wired to ``i.change``, in this order:
    question group, sample-info Markdown, question Markdown, audio player,
    answer radio, selected-answer state.

    Args:
        i: Position in ``session_sample_indices``; ``-1`` means that no
            sample is being shown.
        answers: Mapping from position to the answer chosen earlier.
        session_sample_indices: Dataset row indices for this session.
    """
    if i == -1:  # nothing to show: hide the whole question area
        return (
            gr.update(visible=False),  # question group
            "",  # sample info
            "",  # question text
            gr.update(visible=False),  # audio
            gr.update(visible=False),  # radio
            None,  # selected_answer
        )

    true_index = session_sample_indices[i]
    sample = dataset[true_index]
    audio_data = (sample["audio"]["sampling_rate"], sample["audio"]["array"])
    previous_answer = answers.get(i, None)

    return (
        gr.update(visible=True),
        f"<div align='center'>Sample <b>{i+1}</b> out of <b>{len(session_sample_indices)}</b></div>",
        "Out of the following answers, according to the speaker's stressed words, what is most likely the underlying intention of the speaker?",
        # The i == -1 branch hides the audio and the radio individually, so
        # they have to be made visible again explicitly; showing only the
        # enclosing group would leave them hidden.
        gr.update(value=audio_data, visible=True),
        gr.update(
            choices=sample["possible_answers"],
            value=previous_answer,
            visible=True,
        ),
        previous_answer,
    )
152
 
 
153
  def update_next_index(
154
- i, answer, answers, session_id, session_sample_indices, user_name
 
 
 
 
 
 
 
155
  ):
156
- if answer is None and i != -1: # if no answer is selected
157
- # show warning message
158
  return (
159
- gr.update(),
160
- gr.update(visible=True),
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  gr.update(),
162
  answers,
163
  gr.update(visible=False),
164
  gr.update(visible=True),
 
 
165
  )
166
 
167
- if answer: # if an answer is selected
168
- # save the answer to the backend
169
- answers[i] = answer
170
- true_index = session_sample_indices[i]
171
- sample = dataset[true_index]
172
- interp_id = sample["interpretation_id"]
173
- trans_id = sample["transcription_id"]
174
- user_id = session_id
175
- user_name_str = user_name or "anonymous"
176
- logger.info(
177
- "saving answer to backend",
178
- context={
179
- "i": true_index,
180
- "interp_id": interp_id,
181
- "answer": answer,
182
- "user_id": user_id,
183
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  )
185
- if not backend.update_row(true_index, interp_id, user_id, answer):
186
- backend.add_row(
187
- true_index, interp_id, trans_id, user_id, answer, user_name_str
188
- )
189
 
190
- if i + 1 == len(session_sample_indices): # Last question just answered
 
191
  return (
192
- -1, # reset i to stop showing question
193
  gr.update(visible=False),
194
  gr.update(visible=False),
195
  answers,
196
  gr.update(visible=True), # show final page
197
- gr.update(visible=False), # hide previous button
 
 
198
  )
199
- # go to the next question
200
- new_i = i + 1 if i + 1 < len(session_sample_indices) else 0
 
201
  return (
202
  new_i,
203
  gr.update(visible=False),
@@ -205,303 +335,291 @@ def human_eval_tab():
205
  answers,
206
  gr.update(visible=False),
207
  gr.update(visible=True),
 
 
208
  )
209
 
210
def update_prev_index(i):
    """Step back one sample and hide the warning message.

    The index is returned unchanged on the first question (``0``) and on the
    start page (``-1``), so the user can never go back past the first sample.
    """
    previous_i = i - 1 if i > 0 else i
    return previous_i, gr.update(visible=False)
217
-
218
def answer_change_callback(answer, i, answers):
    """Store the radio selection for sample position ``i``.

    Returns the selection itself together with the (mutated) answers mapping.
    """
    answers.update({i: answer})
    return answer, answers
221
-
222
def login_callback(logged_in):
    """Toggle the main widgets when the login state changes.

    The four returned updates target, in order: the app group, the next
    button, the previous button and the warning message.
    """
    if not logged_in:
        # Logged out: everything stays hidden.
        return tuple(gr.update(visible=False) for _ in range(4))
    return (
        gr.update(visible=True),  # app group
        gr.update(visible=True),  # next button
        gr.update(visible=False),  # previous button
        gr.update(visible=False),  # warning message
    )
238
-
239
- # === Events ===
240
  next_btn.click(
241
  update_next_index,
242
  [
243
  i,
244
- selected_answer,
 
245
  answers_dict,
246
  session_id,
247
  session_sample_indices,
248
  user_name,
 
 
 
 
 
 
 
 
 
 
 
249
  ],
250
- [i, warning_msg, next_btn, answers_dict, final_group, prev_btn],
251
  )
 
252
  prev_btn.click(update_prev_index, i, [i, warning_msg])
 
253
  i.change(
254
  update_ui,
255
- [i, answers_dict, session_sample_indices],
256
  [
257
  question_group,
258
  sample_info,
259
- question_md,
260
  audio_output,
 
 
261
  radio,
262
  selected_answer,
263
  ],
264
  )
265
- radio.change(
 
266
  answer_change_callback,
267
  [radio, i, answers_dict],
268
  [selected_answer, answers_dict],
269
  )
270
- logged_in.change(
271
- login_callback, logged_in, [app_group, next_btn, prev_btn, warning_msg]
272
- )
273
-
274
 
275
def compute_random_sampled_accuracy(df, dataset, n_rounds=100, seed=42):
    """Estimate accuracy by repeatedly sampling one answer per item.

    Only ``interpretation_id`` values answered by at least three distinct
    ``user_id`` values take part. In every round one answer row is drawn at
    random for each such item and compared with the ground truth
    ``possible_answers[label]`` of ``dataset[index_in_dataset]``.

    Args:
        df: DataFrame with the columns ``interpretation_id``, ``user_id``,
            ``answer`` and ``index_in_dataset``.
        dataset: Indexable collection whose items expose
            ``possible_answers`` and ``label``.
        n_rounds: Number of sampling rounds.
        seed: Seed of the NumPy generator that drives the sampling.

    Returns:
        ``(mean_accuracy, std, items_per_round, ci_95)``, or
        ``(None, None, 0, None)`` when there is nothing to sample. ``std`` is
        the sample standard deviation over rounds and ``ci_95`` is
        ``1.96 * std / sqrt(rounds)``; both are ``0.0`` when only a single
        round was run, because a sample standard deviation is undefined there.
    """
    rng = np.random.default_rng(seed)

    # Keep only items that were answered by at least three distinct users.
    users_per_item = df.groupby("interpretation_id")["user_id"].nunique()
    eligible_ids = set(users_per_item[users_per_item >= 3].index)
    eligible = df[df["interpretation_id"].isin(eligible_ids)]
    if eligible.empty:
        return None, None, 0, None

    # Resolve each row's correctness once, before the sampling loop. The
    # ground truth does not depend on the round, so looking it up inside the
    # loop would repeat the same dataset accesses n_rounds times.
    truth_by_index = {}

    def _ground_truth(idx):
        if idx not in truth_by_index:
            sample = dataset[idx]
            truth_by_index[idx] = sample["possible_answers"][sample["label"]]
        return truth_by_index[idx]

    eligible = eligible.assign(
        _is_correct=[
            answer == _ground_truth(int(idx))
            for answer, idx in zip(eligible["answer"], eligible["index_in_dataset"])
        ]
    )
    correctness_by_item = eligible.groupby("interpretation_id")["_is_correct"]
    items_per_round = correctness_by_item.ngroups

    round_scores = []
    for _ in range(n_rounds):
        hits = 0
        for _, flags in correctness_by_item:
            # One seed is drawn per item and round, in group order, so the
            # result is reproducible for a given ``seed``.
            pick_seed = int(rng.integers(1_000_000))
            hits += int(flags.sample(1, random_state=pick_seed).iloc[0])
        round_scores.append(hits / items_per_round)

    if not round_scores:
        return None, None, 0, None

    scores = np.asarray(round_scores)
    mean_acc = scores.mean()
    std_acc = scores.std(ddof=1) if scores.size > 1 else 0.0
    ci_95 = 1.96 * std_acc / np.sqrt(scores.size)
    return mean_acc, std_acc, items_per_round, ci_95
320
-
321
-
322
def get_admin_tab():
    """Build the password-protected "Admin Console" tab.

    The tab holds a password box and a button. With the correct password it
    shows a Markdown report: overall accuracy, majority-vote accuracy for the
    completed stages, random-sampled accuracy, answer progress and a
    per-stage breakdown. Otherwise only an error message is shown.
    """
    import hmac  # local import: only the password check needs it

    stages = ("stage1", "stage2", "stage3")
    required_answers = 3  # distinct users needed before an item is complete

    with gr.Tab("Admin Console"):
        admin_password = gr.Text(label="Enter Admin Password", type="password")
        check_btn = gr.Button("Enter")
        error_box = gr.Markdown("", visible=False)
        output_box = gr.Markdown("", visible=False)

        def calculate_majority_vote_accuracy(pw):
            """Return updates for ``(error_box, output_box)`` given password ``pw``."""
            expected = configs.ADMIN_PASSWORD
            # Constant-time comparison; a non-string on either side is
            # treated as a failed login rather than raising.
            password_ok = (
                isinstance(pw, str)
                and isinstance(expected, str)
                and hmac.compare_digest(pw.encode("utf-8"), expected.encode("utf-8"))
            )
            if not password_ok:
                return (
                    gr.update(visible=True, value="❌ Incorrect password."),
                    gr.update(visible=False),
                )

            df = backend.get_all_rows()
            if df.empty:
                return (
                    gr.update(visible=True, value="No data available."),
                    gr.update(visible=False),
                )

            # Majority vote per interpretation_id
            majority_answers = {}
            for interp_id, group in df.groupby("interpretation_id"):
                answer_counts = Counter(group["answer"])
                if answer_counts:
                    majority_answers[interp_id] = answer_counts.most_common(1)[0][0]

            counts = df.groupby("interpretation_id")["user_id"].nunique().to_dict()
            total_answers = len(df)
            users_count = df["user_id"].nunique()

            # (interpretation_id, ground-truth answer) per dataset row, fetched
            # at most once: the same rows are needed for the stage statistics
            # and again for the overall accuracy.
            sample_info = {}

            def _info(idx):
                if idx not in sample_info:
                    sample = dataset[idx]
                    sample_info[idx] = (
                        sample["interpretation_id"],
                        sample["possible_answers"][sample["label"]],
                    )
                return sample_info[idx]

            stage_acc = {}
            stage_completes = {}
            stage_counts = {}
            stage_remaining = {}

            for stage in stages:
                indices = STAGE_SPLITS[stage]
                correct = total = complete = answered = 0
                for idx in indices:
                    interp_id, gt = _info(idx)
                    n = counts.get(interp_id, 0)
                    answered += n
                    # The correct/total tally is only reported once every item
                    # of the stage is complete, so it is kept for complete
                    # items only.
                    if n >= required_answers:
                        complete += 1
                        if interp_id in majority_answers:
                            total += 1
                            if majority_answers[interp_id] == gt:
                                correct += 1

                stage_counts[stage] = len(indices)
                stage_completes[stage] = complete
                stage_remaining[stage] = required_answers * len(indices) - answered

                if complete == len(indices):
                    acc = correct / total if total > 0 else 0
                    stage_acc[stage] = (acc, correct, total)
                else:
                    stage_acc[stage] = None  # not shown yet

            # Determine active stage
            if stage_completes["stage1"] < stage_counts["stage1"]:
                current_stage = "Stage 1"
            elif stage_completes["stage2"] < stage_counts["stage2"]:
                current_stage = "Stage 2"
            else:
                current_stage = "Stage 3"

            # Majority-vote accuracy, cumulative over the leading completed
            # stages. Stop at the first incomplete stage so a later line never
            # depends on totals that were not computed, and guard the
            # division for stages without items.
            agg_lines = []
            cum_correct = cum_total = 0
            for stage, label in zip(stages, ("Stage 1", "Stage 1+2", "All Stages")):
                if stage_acc[stage] is None:
                    break
                _, stage_correct, stage_total = stage_acc[stage]
                cum_correct += stage_correct
                cum_total += stage_total
                ratio = cum_correct / cum_total if cum_total > 0 else 0
                agg_lines.append(
                    f"- **{label}:** {ratio:.2%} ({cum_correct}/{cum_total})"
                )
            agg_msg = "\n".join(agg_lines) if agg_lines else "No completed stages yet."

            # Random-sampled accuracy
            n_rounds = 100
            rand_acc, rand_std, rand_total, rand_ci = compute_random_sampled_accuracy(
                df, dataset, n_rounds=n_rounds
            )
            if rand_acc is not None:
                rand_acc_msg = (
                    f"**Accuracy:** {rand_acc:.2%} ± {rand_ci:.2%} (95% CI)\n\n"
                    f"Standard deviation: {rand_std:.2%}\n\n"
                    f"Samples used: {rand_total} × {n_rounds} rounds"
                )
            else:
                rand_acc_msg = "Random sampling failed (no data)."

            # Overall accuracy over every stored answer row
            correct = total = 0
            for raw_idx, answer in zip(df["index_in_dataset"], df["answer"]):
                idx = int(raw_idx)
                if idx >= len(dataset):
                    continue  # skip out-of-range
                _, gt_answer = _info(idx)
                if answer == gt_answer:
                    correct += 1
                total += 1

            if total > 0:
                overall_acc_msg = (
                    f"Overall Accuracy: {correct / total:.2%} ({correct}/{total})"
                )
            else:
                overall_acc_msg = "No data available."

            stage_rows = [
                f"| {number} | {stage_completes[stage]} / {stage_counts[stage]} "
                f"| {stage_counts[stage]} | {stage_remaining[stage]} |"
                for number, stage in enumerate(stages, start=1)
            ]

            # The report is assembled line by line so that no source
            # indentation leaks into the Markdown.
            msg = "\n".join(
                [
                    "",
                    "## ✅ Accuracy Summary",
                    "",
                    "### Overall Accuracy",
                    overall_acc_msg,
                    "",
                    "---",
                    "",
                    "### Majority Vote",
                    agg_msg,
                    "",
                    "---",
                    "",
                    "### Random-Sampled Accuracy",
                    rand_acc_msg,
                    "",
                    "---",
                    "",
                    "## 📊 Answer Progress",
                    "",
                    f"- **Total answers submitted:** {total_answers}",
                    f"- **Answers to go (global):** {3 * len(dataset) - total_answers}",
                    f"- **Unique users:** {users_count}",
                    "",
                    "---",
                    "",
                    "## 🧱 Stage Breakdown",
                    "",
                    "| Stage | Completed | Total | Remaining Answers |",
                    "|-------|-----------|--------|-------------------|",
                    *stage_rows,
                    "",
                    f"**➡️ Current Active Stage:** {current_stage}",
                    "",
                ]
            )

            return gr.update(visible=False), gr.update(visible=True, value=msg)

        check_btn.click(
            fn=calculate_majority_vote_accuracy,
            inputs=admin_password,
            outputs=[error_box, output_box],
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
 
500
  # App UI
501
  with gr.Blocks() as demo:
502
  human_eval_tab()
503
- get_admin_tab()
504
-
505
- # Launch app
506
- demo.launch()
507
-
 
5
  from datasets import load_dataset
6
  from collections import Counter
7
  import numpy as np
8
+ from .configs import configs
9
+ from .clients_rebuttal import backend, logger
10
+ from .backend.helpers_rebuttal import get_random_session_samples
11
 
12
+ # dataset = load_dataset("iyosha-huji/stressBench", token=configs.HF_API_TOKEN)["test"]
13
+ # dataset = load_dataset("iyosha-huji/stressEval", token=configs.HF_API_TOKEN)["test"]
14
+ stage_to_split_map = {1: "train_fine", 2: "train_full"}
15
 
16
+ dataset = load_dataset("slprl/Stress-17K-raw")
17
 
18
 
19
+ INSTRUCTIONS = """<div align='center'>You are given an audio sample and 2 questions.\n\nListen to the audio and select the correct answer from the provided options below.\n\n<b>Note:</b> The questions are the same for all samples, but the audio and the corresponding answers change.</div>"""
20
+
21
+
22
+ def _stringify(value):
23
+ if isinstance(value, list):
24
+ return "[" + ", ".join(map(str, value)) + "]"
25
+ return value if value is not None else ""
26
+
27
+
28
+ with open(Path(__file__).parent / "data/rebuttal/stage_indices.json") as f:
29
  STAGE_SPLITS = json.load(f)
30
 
31
 
32
  def human_eval_tab():
33
  with gr.Tab(label="Evaluation"):
34
+ # ==== State ====
35
  i = gr.State(-1)
36
  selected_answer = gr.State(None)
37
  answers_dict = gr.State({})
 
39
  session_id = gr.State(None)
40
  user_name = gr.State(None)
41
  session_sample_indices = gr.State([])
42
+ current_split = gr.State(None)
43
 
44
  # === Login UI ===
45
  with gr.Group(visible=True) as login_group:
 
62
  backend, dataset, STAGE_SPLITS, usr, num_samples=15
63
  )
64
  logger.info(f"Session ID: {new_session_id}, Stage: {stage}")
65
+ current_split = stage_to_split_map[stage]
66
  return (
67
  True,
68
  gr.update(visible=False),
 
70
  new_session_id,
71
  sample_indices,
72
  usr,
73
+ current_split,
74
  )
75
  else:
76
  return (
 
80
  None,
81
  [],
82
  None,
83
+ None,
84
  )
85
 
86
  # === Login Button ===
 
94
  session_id,
95
  session_sample_indices,
96
  user_name,
97
+ current_split,
98
  ],
99
  )
100
 
 
115
  with gr.Row(show_progress=True):
116
  with gr.Column(variant="compact"):
117
  sample_info = gr.Markdown()
118
+ # ✅ NEW: SSD question first
119
+ stress_question_md = gr.Markdown()
120
+ stress_checkbox = gr.CheckboxGroup(
121
+ label="Stressed words:", interactive=True
122
+ ) # ✅ NEW
123
+
124
+ # ✅ SSR question after SSD
125
+ # gr.Markdown("**Question:**")
126
  question_md = gr.Markdown()
127
  radio = gr.Radio(label="Answer:", interactive=True)
128
  with gr.Column(variant="compact"):
 
145
  """
146
  )
147
 
148
+ # === Logic === -------------------------------------------------------
149
+
150
+ # ── 1. UI refresh when i changes ────────────────────────────────────
151
def update_ui(i, answers, session_sample_indices, current_split):
    """Refresh the question widgets for sample position ``i``.

    Returns one value per output wired to ``i.change``, in this order:
    question group, sample-info Markdown, SSD question text, audio player,
    SSD checkbox group, SSR question text, SSR radio, selected-answer state.

    Args:
        i: Position in ``session_sample_indices``; ``-1`` means that no
            sample is being shown.
        answers: Mapping from position to a dict with the keys ``"ssd"``
            (list of selected words) and ``"ssr"`` (selected answer).
        session_sample_indices: Dataset row indices for this session.
        current_split: Key of the split in ``dataset`` the indices refer to.
    """
    if i == -1:  # nothing to show: hide the whole question area
        return (
            gr.update(visible=False),  # question group
            "",  # sample info
            "",  # SSD question text
            gr.update(visible=False),  # audio
            gr.update(visible=False),  # SSD checkbox
            "",  # SSR question text
            gr.update(visible=False),  # SSR radio
            None,  # selected_answer
        )

    # show current sample
    true_index = session_sample_indices[i]
    sample = dataset[current_split][true_index]
    audio_data = (sample["audio"]["sampling_rate"], sample["audio"]["array"])
    previous = answers.get(i, {"ssd": [], "ssr": None})
    previous_ssr = previous.get("ssr")

    return (
        gr.update(visible=True),
        f"<div align='center'>Sample <b>{i+1}</b> / <b>{len(session_sample_indices)}</b></div>",
        f"Given that the speaker said: \"**{sample['transcription']}**\"\nWhat word(s) did the speaker stress?\nYou can select multiple words if you think more than one word is stressed.",
        # The i == -1 branch hides the audio, the checkbox group and the
        # radio individually, so they have to be made visible again
        # explicitly; showing only the enclosing group would leave them hidden.
        gr.update(value=audio_data, visible=True),
        gr.update(
            choices=sample["transcription"].split(),
            value=previous.get("ssd", []),
            visible=True,
        ),
        "Out of the following answers, according to the stressed words, what is most likely the underlying intention of the speaker?",
        gr.update(
            choices=sample["possible_answers"],
            value=previous_ssr,
            visible=True,
        ),
        previous_ssr,
    )
184
+
185
+ # ── 2. SSD (checkbox) live-update – optional ------------------------
186
def ssd_change_callback(ssd_answer, i, answers):
    """Store the current checkbox selection for sample position ``i``.

    The entry for ``i`` is created when missing; any ``"ssr"`` value already
    stored there is kept. Returns the (mutated) answers mapping.
    """
    entry = answers.setdefault(i, {})
    entry["ssd"] = ssd_answer
    return answers
190
+
191
+ # ── 3. SSR (radio) live-update – keeps selected_answer state --------
192
def answer_change_callback(answer, i, answers):
    """Store the current radio selection for sample position ``i``.

    The entry for ``i`` is created when missing; any ``"ssd"`` value already
    stored there is kept. Returns the selection together with the (mutated)
    answers mapping.
    """
    entry = answers.setdefault(i, {})
    entry["ssr"] = answer
    return answer, answers
196
+
197
+ # ── 4. navigate back one sample -------------------------------------
198
def update_prev_index(i):
    """Move one sample back and hide the warning message.

    Positions ``0`` and below are returned unchanged, so the index never
    moves before the first sample.
    """
    step = 1 if i > 0 else 0
    return i - step, gr.update(visible=False)
202
+
203
+ # ── 5. login toggle --------------------------------------------------
204
def login_callback(logged_in):
    """Toggle the main widgets when the login state changes.

    The four returned updates target, in order: the app group, the next
    button, the previous button and the warning message. The first two follow
    the login state; the last two are always hidden.
    """
    shown = bool(logged_in)
    return (
        gr.update(visible=shown),  # app_group
        gr.update(visible=shown),  # next_btn
        gr.update(visible=False),  # prev_btn
        gr.update(visible=False),  # warning_msg
    )
218
 
219
+ # ── 6. main “Next / Submit” handler ---------------------------------
220
  def update_next_index(
221
+ i,
222
+ ssd_answer,
223
+ ssr_answer,
224
+ answers,
225
+ session_id,
226
+ session_sample_indices,
227
+ user_name,
228
+ current_split,
229
  ):
230
+ # ── 0. first click on "Start" ──────────────────────────────────────────
231
+ if i == -1:
232
  return (
233
+ 0, # show first sample
234
+ gr.update(visible=False), # hide warning
235
+ gr.update(value="Submit answer and go to Next"),
236
+ answers,
237
+ gr.update(visible=False), # keep final page hidden
238
+ gr.update(visible=False), # prev_btn stays hidden
239
+ gr.update(value=[]), # clear SSD checkbox
240
+ gr.update(value=None), # clear SSR radio
241
+ )
242
+
243
+ # ── 1. block if either answer missing (for real samples) ──────────────
244
+ if not ssd_answer or not ssr_answer:
245
+ return (
246
+ i,
247
+ gr.update(visible=True), # show warning
248
  gr.update(),
249
  answers,
250
  gr.update(visible=False),
251
  gr.update(visible=True),
252
+ gr.update(), # keep SSD
253
+ gr.update(), # keep SSR
254
  )
255
 
256
+ # store answers
257
+ answers[i] = {"ssd": ssd_answer, "ssr": ssr_answer}
258
+
259
+ true_index = session_sample_indices[i]
260
+ sample = dataset[current_split][true_index]
261
+ audio_id = sample["audio_id"]
262
+ trans_id = sample["transcription_id"]
263
+ interpretation = sample["intonation"]
264
+ user_id = session_id
265
+ user_name_str = user_name or "anonymous"
266
+
267
+ # ----- SSR values -----
268
+ user_ssr_answer = sample["possible_answers"].index(ssr_answer)
269
+ ssr_label = sample["label"]
270
+
271
+ # ----- SSD values -----
272
+ ssd_answer_str = ",".join(map(str, ssd_answer))
273
+ ssd_words_list = ssd_answer_str.split(",")
274
+ transcription_words = sample["transcription"].split()
275
+ user_ssd_answer = _stringify(
276
+ [
277
+ 1 if word in ssd_words_list else 0
278
+ for _, word in enumerate(transcription_words)
279
+ ]
280
+ )
281
+ ssd_label = _stringify(
282
+ [
283
+ 1 if idx in sample["gt_stress_indices"] else 0
284
+ for idx, _ in enumerate(sample["transcription"].split())
285
+ ]
286
+ )
287
+
288
+ # write to backend
289
+ updated = backend.update_row(
290
+ true_index,
291
+ audio_id,
292
+ user_id,
293
+ new_ssr_answer=ssr_answer,
294
+ new_user_ssr_answer=user_ssr_answer,
295
+ new_ssr_label=ssr_label,
296
+ new_ssd_answer=ssd_answer_str,
297
+ new_user_ssd_answer=user_ssd_answer,
298
+ new_ssd_label=ssd_label,
299
+ )
300
+ if not updated:
301
+ backend.add_row(
302
+ true_index,
303
+ audio_id,
304
+ interpretation,
305
+ trans_id,
306
+ user_id,
307
+ user_name_str,
308
+ ssr_answer,
309
+ user_ssr_answer,
310
+ ssr_label,
311
+ ssd_answer_str,
312
+ user_ssd_answer,
313
+ ssd_label,
314
  )
 
 
 
 
315
 
316
+ # finished?
317
+ if i + 1 == len(session_sample_indices):
318
  return (
319
+ -1,
320
  gr.update(visible=False),
321
  gr.update(visible=False),
322
  answers,
323
  gr.update(visible=True), # show final page
324
+ gr.update(visible=False), # hide prev
325
+ gr.update(value=[]), # clear SSD
326
+ gr.update(value=None), # clear SSR
327
  )
328
+
329
+ # otherwise advance
330
+ new_i = i + 1
331
  return (
332
  new_i,
333
  gr.update(visible=False),
 
335
  answers,
336
  gr.update(visible=False),
337
  gr.update(visible=True),
338
+ gr.update(value=[]), # reset SSD
339
+ gr.update(value=None), # reset SSR
340
  )
341
 
342
+ # === Event wiring ===================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  next_btn.click(
344
  update_next_index,
345
  [
346
  i,
347
+ stress_checkbox,
348
+ radio,
349
  answers_dict,
350
  session_id,
351
  session_sample_indices,
352
  user_name,
353
+ current_split,
354
+ ],
355
+ [
356
+ i,
357
+ warning_msg,
358
+ next_btn,
359
+ answers_dict,
360
+ final_group,
361
+ prev_btn,
362
+ stress_checkbox,
363
+ radio,
364
  ],
 
365
  )
366
+
367
  prev_btn.click(update_prev_index, i, [i, warning_msg])
368
+
369
  i.change(
370
  update_ui,
371
+ [i, answers_dict, session_sample_indices, current_split],
372
  [
373
  question_group,
374
  sample_info,
375
+ stress_question_md,
376
  audio_output,
377
+ stress_checkbox,
378
+ question_md,
379
  radio,
380
  selected_answer,
381
  ],
382
  )
383
+
384
+ radio.change( # SSR radio
385
  answer_change_callback,
386
  [radio, i, answers_dict],
387
  [selected_answer, answers_dict],
388
  )
 
 
 
 
389
 
390
+ stress_checkbox.change( # SSD checkbox
391
+ ssd_change_callback, [stress_checkbox, i, answers_dict], [answers_dict]
392
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
 
394
+ logged_in.change(
395
+ login_callback,
396
+ logged_in,
397
+ [app_group, next_btn, prev_btn, warning_msg],
398
+ )
399
 
 
400
 
401
+ # def compute_random_sampled_accuracy(df, dataset, n_rounds=100, seed=42):
402
+ # rng = np.random.default_rng(seed)
403
+
404
+ # # Filter to interpretation_ids with at least 3 user answers
405
+ # counts = df.groupby("interpretation_id")["user_id"].nunique()
406
+ # eligible_ids = set(counts[counts >= 3].index)
407
+
408
+ # # Group answers by interpretation_id
409
+ # grouped = df[df["interpretation_id"].isin(eligible_ids)].groupby(
410
+ # "interpretation_id"
411
+ # )
412
+
413
+ # all_scores = []
414
+ # total_answered_per_round = []
415
+
416
+ # for _ in range(n_rounds):
417
+ # correct = 0
418
+ # total = 0
419
+
420
+ # for interp_id, group in grouped:
421
+ # if group.empty:
422
+ # continue
423
+
424
+ # # Randomly pick one row
425
+ # row = group.sample(1, random_state=rng.integers(1e6)).iloc[0]
426
+ # answer = row["answer"]
427
+ # idx = int(row["index_in_dataset"])
428
+ # sample = dataset[idx]
429
+ # gt = sample["possible_answers"][sample["label"]]
430
+ # total += 1
431
+ # if answer == gt:
432
+ # correct += 1
433
+
434
+ # if total > 0:
435
+ # all_scores.append(correct / total)
436
+ # total_answered_per_round.append(total)
437
+
438
+ # if all_scores:
439
+ # mean_acc = np.mean(all_scores)
440
+ # mean_total = int(np.mean(total_answered_per_round))
441
+ # std_acc = np.std(all_scores, ddof=1) # sample std
442
+ # ci_95 = 1.96 * std_acc / np.sqrt(n_rounds)
443
+ # return mean_acc, std_acc, mean_total, ci_95
444
+
445
+ # return None, None, 0, None
446
+
447
+
448
+ # def get_admin_tab():
449
+ # with gr.Tab("Admin Console"):
450
+ # admin_password = gr.Text(label="Enter Admin Password", type="password")
451
+ # check_btn = gr.Button("Enter")
452
+ # error_box = gr.Markdown("", visible=False)
453
+ # output_box = gr.Markdown("", visible=False)
454
+
455
+ # def calculate_majority_vote_accuracy(pw):
456
+ # if pw != configs.ADMIN_PASSWORD:
457
+ # return gr.update(
458
+ # visible=True, value="❌ Incorrect password."
459
+ # ), gr.update(visible=False)
460
+
461
+ # df = backend.get_all_rows()
462
+ # if df.empty:
463
+ # return gr.update(visible=True, value="No data available."), gr.update(
464
+ # visible=False
465
+ # )
466
+
467
+ # # Majority vote per interpretation_id
468
+ # majority_answers = {}
469
+ # for interp_id, group in df.groupby("interpretation_id"):
470
+ # answer_counts = Counter(group["answer"])
471
+ # if answer_counts:
472
+ # majority_answers[interp_id] = answer_counts.most_common(1)[0][0]
473
+
474
+ # counts = df.groupby("interpretation_id")["user_id"].nunique().to_dict()
475
+ # total_answers = len(df)
476
+ # users_count = df["user_id"].nunique()
477
+
478
+ # stage_acc = {}
479
+ # stage_completes = {}
480
+ # stage_counts = {}
481
+ # stage_remaining = {}
482
+
483
+ # # global_correct = 0
484
+ # # global_total = 0
485
+
486
+ # for stage in ["stage1", "stage2", "stage3"]:
487
+ # correct, total = 0, 0
488
+ # complete = 0
489
+ # for i in STAGE_SPLITS[stage]:
490
+ # sample = dataset[i]
491
+ # interp_id = sample["interpretation_id"]
492
+ # label = sample["label"]
493
+ # gt = sample["possible_answers"][label]
494
+
495
+ # n = counts.get(interp_id, 0)
496
+ # if n >= 3:
497
+ # complete += 1
498
+ # if interp_id in majority_answers:
499
+ # pred = majority_answers[interp_id]
500
+ # total += 1
501
+ # if pred == gt:
502
+ # correct += 1
503
+
504
+ # stage_counts[stage] = len(STAGE_SPLITS[stage])
505
+ # stage_completes[stage] = complete
506
+ # stage_remaining[stage] = 3 * len(STAGE_SPLITS[stage]) - sum(
507
+ # counts.get(dataset[i]["interpretation_id"], 0)
508
+ # for i in STAGE_SPLITS[stage]
509
+ # )
510
+
511
+ # if complete == len(STAGE_SPLITS[stage]):
512
+ # acc = correct / total if total > 0 else 0
513
+ # stage_acc[stage] = (acc, correct, total)
514
+ # else:
515
+ # stage_acc[stage] = None # not shown yet
516
+
517
+ # # Determine active stage
518
+ # if stage_completes["stage1"] < stage_counts["stage1"]:
519
+ # current_stage = "Stage 1"
520
+ # elif stage_completes["stage2"] < stage_counts["stage2"]:
521
+ # current_stage = "Stage 2"
522
+ # else:
523
+ # current_stage = "Stage 3"
524
+
525
+ # # Majority Vote Accuracy Section
526
+ # agg_lines = []
527
+ # if stage_acc["stage1"]:
528
+ # acc1, c1, t1 = stage_acc["stage1"]
529
+ # agg_lines.append(f"- **Stage 1:** {acc1:.2%} ({c1}/{t1})")
530
+ # if stage_acc["stage2"]:
531
+ # acc2, c2, t2 = stage_acc["stage2"]
532
+ # agg_lines.append(
533
+ # f"- **Stage 1+2:** {(c1 + c2) / (t1 + t2):.2%} ({c1 + c2}/{t1 + t2})"
534
+ # )
535
+ # if stage_acc["stage3"]:
536
+ # acc3, c3, t3 = stage_acc["stage3"]
537
+ # agg_lines.append(
538
+ # f"- **All Stages:** {(c1 + c2 + c3) / (t1 + t2 + t3):.2%} ({c1 + c2 + c3}/{t1 + t2 + t3})"
539
+ # )
540
+ # agg_msg = "\n".join(agg_lines) if agg_lines else "No completed stages yet."
541
+ # # Compute random-sampled accuracy
542
+ # n_rounds = 100
543
+ # rand_acc, rand_std, rand_total, rand_ci = compute_random_sampled_accuracy(
544
+ # df, dataset, n_rounds=n_rounds
545
+ # )
546
+
547
+ # # Random-sampled Accuracy
548
+ # if rand_acc is not None:
549
+ # rand_acc_msg = (
550
+ # f"**Accuracy:** {rand_acc:.2%} ± {rand_ci:.2%} (95% CI)\n\n"
551
+ # f"Standard deviation: {rand_std:.2%}\n\n"
552
+ # f"Samples used: {rand_total} × {n_rounds} rounds"
553
+ # )
554
+ # else:
555
+ # rand_acc_msg = "Random sampling failed (no data)."
556
+
557
+ # correct = 0
558
+ # total = 0
559
+
560
+ # for _, row in df.iterrows():
561
+ # idx = int(row["index_in_dataset"])
562
+ # if idx >= len(dataset):
563
+ # continue # skip out-of-range
564
+ # sample = dataset[idx]
565
+ # gt_answer = sample["possible_answers"][sample["label"]]
566
+ # if row["answer"] == gt_answer:
567
+ # correct += 1
568
+ # total += 1
569
+
570
+ # overall_acc = correct / total if total > 0 else None
571
+ # if overall_acc is not None:
572
+ # overall_acc_msg = (
573
+ # f"Overall Accuracy: {overall_acc:.2%} ({correct}/{total})"
574
+ # )
575
+ # else:
576
+ # overall_acc_msg = "No data available."
577
+ # # Final message (no indentation!)
578
+ # msg = f"""
579
+ # ## ✅ Accuracy Summary
580
+ # ### Overall Accuracy
581
+ # {overall_acc_msg}
582
+
583
+ # ---
584
+ # ### Majority Vote
585
+ # {agg_msg}
586
+
587
+ # ---
588
+
589
+ # ### Random-Sampled Accuracy
590
+ # {rand_acc_msg}
591
+
592
+ # ---
593
+
594
+ # ## 📊 Answer Progress
595
+
596
+ # - **Total answers submitted:** {total_answers}
597
+ # - **Answers to go (global):** {3 * len(dataset) - total_answers}
598
+ # - **Unique users:** {users_count}
599
+
600
+ # ---
601
+
602
+ # ## 🧱 Stage Breakdown
603
+
604
+ # | Stage | Completed | Total | Remaining Answers |
605
+ # |-------|-----------|--------|-------------------|
606
+ # | 1 | {stage_completes['stage1']} / {stage_counts['stage1']} | {stage_counts['stage1']} | {stage_remaining['stage1']} |
607
+ # | 2 | {stage_completes['stage2']} / {stage_counts['stage2']} | {stage_counts['stage2']} | {stage_remaining['stage2']} |
608
+ # | 3 | {stage_completes['stage3']} / {stage_counts['stage3']} | {stage_counts['stage3']} | {stage_remaining['stage3']} |
609
+
610
+ # **➡️ Current Active Stage:** {current_stage}
611
+ # """
612
+
613
+ # return gr.update(visible=False), gr.update(visible=True, value=msg)
614
+
615
+ # check_btn.click(
616
+ # fn=calculate_majority_vote_accuracy,
617
+ # inputs=admin_password,
618
+ # outputs=[error_box, output_box],
619
+ # )
620
 
621
 
622
  # App UI
623
  with gr.Blocks() as demo:
624
  human_eval_tab()
625
+ # get_admin_tab()