evaluation

Sleeping

App Files Community

Update app.py

by iyosha - opened 9 days ago

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+436

-318

Files changed (1) hide show

app.py +436 -318

app.py CHANGED Viewed

@@ -5,22 +5,33 @@ from uuid import uuid4
 from datasets import load_dataset
 from collections import Counter
 import numpy as np
-from configs import configs
-from clients import backend, logger
-from backend.helpers import get_random_session_samples
-dataset = load_dataset("iyosha-huji/stressEval", token=configs.HF_API_TOKEN)["test"]
-INSTRUCTIONS = """<div align='center'>You are given an audio sample and a question with 2 answer options.\n\nListen to the audio and select the correct answer from the options below.\n\n<b>Note:</b> The question is the same for all samples, but the audio and the corresponding answers change.</div>"""
-with open(Path(__file__).parent / "data/stage_indices.json") as f:
     STAGE_SPLITS = json.load(f)
 def human_eval_tab():
     with gr.Tab(label="Evaluation"):
-        # ==== State =====
         i = gr.State(-1)
         selected_answer = gr.State(None)
         answers_dict = gr.State({})
@@ -28,6 +39,7 @@ def human_eval_tab():
         session_id = gr.State(None)
         user_name = gr.State(None)
         session_sample_indices = gr.State([])
         # === Login UI ===
         with gr.Group(visible=True) as login_group:
@@ -50,6 +62,7 @@ def human_eval_tab():
                     backend, dataset, STAGE_SPLITS, usr, num_samples=15
                 )
                 logger.info(f"Session ID: {new_session_id}, Stage: {stage}")
                 return (
                     True,
                     gr.update(visible=False),
@@ -57,6 +70,7 @@ def human_eval_tab():
                     new_session_id,
                     sample_indices,
                     usr,
                 )
             else:
                 return (
@@ -66,6 +80,7 @@ def human_eval_tab():
                     None,
                     [],
                     None,
                 )
         # === Login Button ===
@@ -79,6 +94,7 @@ def human_eval_tab():
                 session_id,
                 session_sample_indices,
                 user_name,
             ],
         )
@@ -99,7 +115,14 @@ def human_eval_tab():
                 with gr.Row(show_progress=True):
                     with gr.Column(variant="compact"):
                         sample_info = gr.Markdown()
-                        gr.Markdown("**Question:**")
                         question_md = gr.Markdown()
                         radio = gr.Radio(label="Answer:", interactive=True)
                     with gr.Column(variant="compact"):
@@ -122,82 +145,189 @@ def human_eval_tab():
             """
             )
-        # === Logic ===
-        def update_ui(i, answers, session_sample_indices):
-            if i == -1:  # We haven't started yet
                 return (
                     gr.update(visible=False),
                     "",
-                    "",
-                    gr.update(visible=False),
-                    gr.update(visible=False),
-                    None,
                 )
-            # show the question
             true_index = session_sample_indices[i]
-            sample = dataset[true_index]
             audio_data = (sample["audio"]["sampling_rate"], sample["audio"]["array"])
-            previous_answer = answers.get(i, None)
             return (
                 gr.update(visible=True),
-                f"<div align='center'>Sample <b>{i+1}</b> out of <b>{len(session_sample_indices)}</b></div>",
-                "Out of the following answers, according to the speaker's stressed words, what is most likely the underlying intention of the speaker?",
                 gr.update(value=audio_data),
                 gr.update(
-                    choices=sample["possible_answers"],
-                    value=previous_answer,
                 ),
-                previous_answer,
             )
         def update_next_index(
-            i, answer, answers, session_id, session_sample_indices, user_name
         ):
-            if answer is None and i != -1:  # if no answer is selected
-                # show warning message
                 return (
-                    gr.update(),
-                    gr.update(visible=True),
                     gr.update(),
                     answers,
                     gr.update(visible=False),
                     gr.update(visible=True),
                 )
-            if answer:  # if an answer is selected
-                # save the answer to the backend
-                answers[i] = answer
-                true_index = session_sample_indices[i]
-                sample = dataset[true_index]
-                interp_id = sample["interpretation_id"]
-                trans_id = sample["transcription_id"]
-                user_id = session_id
-                user_name_str = user_name or "anonymous"
-                logger.info(
-                    "saving answer to backend",
-                    context={
-                        "i": true_index,
-                        "interp_id": interp_id,
-                        "answer": answer,
-                        "user_id": user_id,
-                    },
                 )
-                if not backend.update_row(true_index, interp_id, user_id, answer):
-                    backend.add_row(
-                        true_index, interp_id, trans_id, user_id, answer, user_name_str
-                    )
-            if i + 1 == len(session_sample_indices):  # Last question just answered
                 return (
-                    -1,  # reset i to stop showing question
                     gr.update(visible=False),
                     gr.update(visible=False),
                     answers,
                     gr.update(visible=True),  # show final page
-                    gr.update(visible=False),  # hide previous button
                 )
-            # go to the next question
-            new_i = i + 1 if i + 1 < len(session_sample_indices) else 0
             return (
                 new_i,
                 gr.update(visible=False),
@@ -205,303 +335,291 @@ def human_eval_tab():
                 answers,
                 gr.update(visible=False),
                 gr.update(visible=True),
             )
-        def update_prev_index(i):
-            # prevent going back in the first question and first page
-            if i <= 0:
-                return i, gr.update(visible=False)
-            # go back to the previous question
-            else:
-                return i - 1, gr.update(visible=False)
-        def answer_change_callback(answer, i, answers):
-            answers[i] = answer
-            return answer, answers
-        def login_callback(logged_in):
-            return (
-                (
-                    gr.update(visible=True),
-                    gr.update(visible=True),
-                    gr.update(visible=False),
-                    gr.update(visible=False),
-                )
-                if logged_in
-                else (
-                    gr.update(visible=False),
-                    gr.update(visible=False),
-                    gr.update(visible=False),
-                    gr.update(visible=False),
-                )
-            )
-        # === Events ===
         next_btn.click(
             update_next_index,
             [
                 i,
-                selected_answer,
                 answers_dict,
                 session_id,
                 session_sample_indices,
                 user_name,
             ],
-            [i, warning_msg, next_btn, answers_dict, final_group, prev_btn],
         )
         prev_btn.click(update_prev_index, i, [i, warning_msg])
         i.change(
             update_ui,
-            [i, answers_dict, session_sample_indices],
             [
                 question_group,
                 sample_info,
-                question_md,
                 audio_output,
                 radio,
                 selected_answer,
             ],
         )
-        radio.change(
             answer_change_callback,
             [radio, i, answers_dict],
             [selected_answer, answers_dict],
         )
-        logged_in.change(
-            login_callback, logged_in, [app_group, next_btn, prev_btn, warning_msg]
-        )
-def compute_random_sampled_accuracy(df, dataset, n_rounds=100, seed=42):
-    rng = np.random.default_rng(seed)
-    # Filter to interpretation_ids with at least 3 user answers
-    counts = df.groupby("interpretation_id")["user_id"].nunique()
-    eligible_ids = set(counts[counts >= 3].index)
-    # Group answers by interpretation_id
-    grouped = df[df["interpretation_id"].isin(eligible_ids)].groupby(
-        "interpretation_id"
-    )
-    all_scores = []
-    total_answered_per_round = []
-    for _ in range(n_rounds):
-        correct = 0
-        total = 0
-        for interp_id, group in grouped:
-            if group.empty:
-                continue
-            # Randomly pick one row
-            row = group.sample(1, random_state=rng.integers(1e6)).iloc[0]
-            answer = row["answer"]
-            idx = int(row["index_in_dataset"])
-            sample = dataset[idx]
-            gt = sample["possible_answers"][sample["label"]]
-            total += 1
-            if answer == gt:
-                correct += 1
-        if total > 0:
-            all_scores.append(correct / total)
-            total_answered_per_round.append(total)
-    if all_scores:
-        mean_acc = np.mean(all_scores)
-        mean_total = int(np.mean(total_answered_per_round))
-        std_acc = np.std(all_scores, ddof=1)  # sample std
-        ci_95 = 1.96 * std_acc / np.sqrt(n_rounds)
-        return mean_acc, std_acc, mean_total, ci_95
-    return None, None, 0, None
-def get_admin_tab():
-    with gr.Tab("Admin Console"):
-        admin_password = gr.Text(label="Enter Admin Password", type="password")
-        check_btn = gr.Button("Enter")
-        error_box = gr.Markdown("", visible=False)
-        output_box = gr.Markdown("", visible=False)
-        def calculate_majority_vote_accuracy(pw):
-            if pw != configs.ADMIN_PASSWORD:
-                return gr.update(
-                    visible=True, value="❌ Incorrect password."
-                ), gr.update(visible=False)
-            df = backend.get_all_rows()
-            if df.empty:
-                return gr.update(visible=True, value="No data available."), gr.update(
-                    visible=False
-                )
-            # Majority vote per interpretation_id
-            majority_answers = {}
-            for interp_id, group in df.groupby("interpretation_id"):
-                answer_counts = Counter(group["answer"])
-                if answer_counts:
-                    majority_answers[interp_id] = answer_counts.most_common(1)[0][0]
-            counts = df.groupby("interpretation_id")["user_id"].nunique().to_dict()
-            total_answers = len(df)
-            users_count = df["user_id"].nunique()
-            stage_acc = {}
-            stage_completes = {}
-            stage_counts = {}
-            stage_remaining = {}
-            # global_correct = 0
-            # global_total = 0
-            for stage in ["stage1", "stage2", "stage3"]:
-                correct, total = 0, 0
-                complete = 0
-                for i in STAGE_SPLITS[stage]:
-                    sample = dataset[i]
-                    interp_id = sample["interpretation_id"]
-                    label = sample["label"]
-                    gt = sample["possible_answers"][label]
-                    n = counts.get(interp_id, 0)
-                    if n >= 3:
-                        complete += 1
-                    if interp_id in majority_answers:
-                        pred = majority_answers[interp_id]
-                        total += 1
-                        if pred == gt:
-                            correct += 1
-                stage_counts[stage] = len(STAGE_SPLITS[stage])
-                stage_completes[stage] = complete
-                stage_remaining[stage] = 3 * len(STAGE_SPLITS[stage]) - sum(
-                    counts.get(dataset[i]["interpretation_id"], 0)
-                    for i in STAGE_SPLITS[stage]
-                )
-                if complete == len(STAGE_SPLITS[stage]):
-                    acc = correct / total if total > 0 else 0
-                    stage_acc[stage] = (acc, correct, total)
-                else:
-                    stage_acc[stage] = None  # not shown yet
-            # Determine active stage
-            if stage_completes["stage1"] < stage_counts["stage1"]:
-                current_stage = "Stage 1"
-            elif stage_completes["stage2"] < stage_counts["stage2"]:
-                current_stage = "Stage 2"
-            else:
-                current_stage = "Stage 3"
-            # Majority Vote Accuracy Section
-            agg_lines = []
-            if stage_acc["stage1"]:
-                acc1, c1, t1 = stage_acc["stage1"]
-                agg_lines.append(f"- **Stage 1:** {acc1:.2%} ({c1}/{t1})")
-            if stage_acc["stage2"]:
-                acc2, c2, t2 = stage_acc["stage2"]
-                agg_lines.append(
-                    f"- **Stage 1+2:** {(c1 + c2) / (t1 + t2):.2%} ({c1 + c2}/{t1 + t2})"
-                )
-            if stage_acc["stage3"]:
-                acc3, c3, t3 = stage_acc["stage3"]
-                agg_lines.append(
-                    f"- **All Stages:** {(c1 + c2 + c3) / (t1 + t2 + t3):.2%} ({c1 + c2 + c3}/{t1 + t2 + t3})"
-                )
-            agg_msg = "\n".join(agg_lines) if agg_lines else "No completed stages yet."
-            # Compute random-sampled accuracy
-            n_rounds = 100
-            rand_acc, rand_std, rand_total, rand_ci = compute_random_sampled_accuracy(
-                df, dataset, n_rounds=n_rounds
-            )
-            # Random-sampled Accuracy
-            if rand_acc is not None:
-                rand_acc_msg = (
-                    f"**Accuracy:** {rand_acc:.2%} ± {rand_ci:.2%} (95% CI)\n\n"
-                    f"Standard deviation: {rand_std:.2%}\n\n"
-                    f"Samples used: {rand_total} × {n_rounds} rounds"
-                )
-            else:
-                rand_acc_msg = "Random sampling failed (no data)."
-            correct = 0
-            total = 0
-            for _, row in df.iterrows():
-                idx = int(row["index_in_dataset"])
-                if idx >= len(dataset):
-                    continue  # skip out-of-range
-                sample = dataset[idx]
-                gt_answer = sample["possible_answers"][sample["label"]]
-                if row["answer"] == gt_answer:
-                    correct += 1
-                total += 1
-            overall_acc = correct / total if total > 0 else None
-            if overall_acc is not None:
-                overall_acc_msg = (
-                    f"Overall Accuracy: {overall_acc:.2%} ({correct}/{total})"
-                )
-            else:
-                overall_acc_msg = "No data available."
-            # Final message (no indentation!)
-            msg = f"""
-## ✅ Accuracy Summary
-### Overall Accuracy
-{overall_acc_msg}
----
-### Majority Vote
-{agg_msg}
----
-### Random-Sampled Accuracy
-{rand_acc_msg}
----
-## 📊 Answer Progress
-- **Total answers submitted:** {total_answers}
-- **Answers to go (global):** {3 * len(dataset) - total_answers}
-- **Unique users:** {users_count}
----
-## 🧱 Stage Breakdown
-| Stage | Completed | Total | Remaining Answers |
-|-------|-----------|--------|-------------------|
-|  1    | {stage_completes['stage1']} / {stage_counts['stage1']} | {stage_counts['stage1']} | {stage_remaining['stage1']} |
-|  2    | {stage_completes['stage2']} / {stage_counts['stage2']} | {stage_counts['stage2']} | {stage_remaining['stage2']} |
-|  3    | {stage_completes['stage3']} / {stage_counts['stage3']} | {stage_counts['stage3']} | {stage_remaining['stage3']} |
-**➡️ Current Active Stage:** {current_stage}
-"""
-            return gr.update(visible=False), gr.update(visible=True, value=msg)
-        check_btn.click(
-            fn=calculate_majority_vote_accuracy,
-            inputs=admin_password,
-            outputs=[error_box, output_box],
-        )
 # App UI
 with gr.Blocks() as demo:
     human_eval_tab()
-    get_admin_tab()
-# Launch app
-demo.launch()

 from datasets import load_dataset
 from collections import Counter
 import numpy as np
+from .configs import configs
+from .clients_rebuttal import backend, logger
+from .backend.helpers_rebuttal import get_random_session_samples
+# dataset = load_dataset("iyosha-huji/stressBench", token=configs.HF_API_TOKEN)["test"]
+# dataset = load_dataset("iyosha-huji/stressEval", token=configs.HF_API_TOKEN)["test"]
+stage_to_split_map = {1: "train_fine", 2: "train_full"}
+dataset = load_dataset("slprl/Stress-17K-raw")
+INSTRUCTIONS = """<div align='center'>You are given an audio sample and 2 questions.\n\nListen to the audio and select the correct answer from the provided options below.\n\n<b>Note:</b> The questions are the same for all samples, but the audio and the corresponding answers change.</div>"""
+def _stringify(value):
+    if isinstance(value, list):
+        return "[" + ", ".join(map(str, value)) + "]"
+    return value if value is not None else ""
+with open(Path(__file__).parent / "data/rebuttal/stage_indices.json") as f:
     STAGE_SPLITS = json.load(f)
 def human_eval_tab():
     with gr.Tab(label="Evaluation"):
+        # ==== State ====
         i = gr.State(-1)
         selected_answer = gr.State(None)
         answers_dict = gr.State({})
         session_id = gr.State(None)
         user_name = gr.State(None)
         session_sample_indices = gr.State([])
+        current_split = gr.State(None)
         # === Login UI ===
         with gr.Group(visible=True) as login_group:
                     backend, dataset, STAGE_SPLITS, usr, num_samples=15
                 )
                 logger.info(f"Session ID: {new_session_id}, Stage: {stage}")
+                current_split = stage_to_split_map[stage]
                 return (
                     True,
                     gr.update(visible=False),
                     new_session_id,
                     sample_indices,
                     usr,
+                    current_split,
                 )
             else:
                 return (
                     None,
                     [],
                     None,
+                    None,
                 )
         # === Login Button ===
                 session_id,
                 session_sample_indices,
                 user_name,
+                current_split,
             ],
         )
                 with gr.Row(show_progress=True):
                     with gr.Column(variant="compact"):
                         sample_info = gr.Markdown()
+                        # ✅ NEW: SSD question first
+                        stress_question_md = gr.Markdown()
+                        stress_checkbox = gr.CheckboxGroup(
+                            label="Stressed words:", interactive=True
+                        )  # ✅ NEW
+                        # ✅ SSR question after SSD
+                        # gr.Markdown("**Question:**")
                         question_md = gr.Markdown()
                         radio = gr.Radio(label="Answer:", interactive=True)
                     with gr.Column(variant="compact"):
             """
             )
+        # === Logic === -------------------------------------------------------
+        # ── 1. UI refresh when i changes ────────────────────────────────────
+        def update_ui(i, answers, session_sample_indices, current_split):
+            if i == -1:  # not started yet
                 return (
                     gr.update(visible=False),
                     "",
+                    "",  # group, sample-info, SSD text
+                    gr.update(visible=False),  # audio
+                    gr.update(visible=False),  # SSD checkbox
+                    "",  # SSR text
+                    gr.update(visible=False),  # SSR radio
+                    None,  # selected_answer
                 )
+            # show current sample
             true_index = session_sample_indices[i]
+            sample = dataset[current_split][true_index]
             audio_data = (sample["audio"]["sampling_rate"], sample["audio"]["array"])
+            previous = answers.get(i, {"ssd": [], "ssr": None})
             return (
                 gr.update(visible=True),
+                f"<div align='center'>Sample <b>{i+1}</b> / <b>{len(session_sample_indices)}</b></div>",
+                f"Given that the speaker said: \"**{sample['transcription']}**\"\nWhat word(s) did the speaker stress?\nYou can select multiple words if you think more than one word is stressed.",
                 gr.update(value=audio_data),
                 gr.update(
+                    choices=sample["transcription"].split(),
+                    value=previous.get("ssd", []),
                 ),
+                "Out of the following answers, according to the stressed words, what is most likely the underlying intention of the speaker?",
+                gr.update(
+                    choices=sample["possible_answers"], value=previous.get("ssr")
+                ),
+                previous.get("ssr"),
+            )
+        # ── 2. SSD (checkbox) live-update – optional ------------------------
+        def ssd_change_callback(ssd_answer, i, answers):
+            answers[i] = answers.get(i, {})
+            answers[i]["ssd"] = ssd_answer
+            return answers
+        # ── 3. SSR (radio) live-update – keeps selected_answer state --------
+        def answer_change_callback(answer, i, answers):
+            answers[i] = answers.get(i, {})
+            answers[i]["ssr"] = answer
+            return answer, answers
+        # ── 4. navigate back one sample -------------------------------------
+        def update_prev_index(i):
+            if i <= 0:
+                return i, gr.update(visible=False)
+            return i - 1, gr.update(visible=False)
+        # ── 5. login toggle --------------------------------------------------
+        def login_callback(logged_in):
+            if logged_in:
+                return (
+                    gr.update(visible=True),  # show app_group
+                    gr.update(visible=True),  # show next_btn
+                    gr.update(visible=False),  # hide prev_btn
+                    gr.update(visible=False),  # hide warning_msg
+                )
+            return (
+                gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(visible=False),
             )
+        # ── 6. main “Next / Submit” handler ---------------------------------
         def update_next_index(
+            i,
+            ssd_answer,
+            ssr_answer,
+            answers,
+            session_id,
+            session_sample_indices,
+            user_name,
+            current_split,
         ):
+            # ── 0. first click on "Start" ──────────────────────────────────────────
+            if i == -1:
                 return (
+                    0,  # show first sample
+                    gr.update(visible=False),  # hide warning
+                    gr.update(value="Submit answer and go to Next"),
+                    answers,
+                    gr.update(visible=False),  # keep final page hidden
+                    gr.update(visible=False),  # prev_btn stays hidden
+                    gr.update(value=[]),  # clear SSD checkbox
+                    gr.update(value=None),  # clear SSR radio
+                )
+            # ── 1. block if either answer missing (for real samples) ──────────────
+            if not ssd_answer or not ssr_answer:
+                return (
+                    i,
+                    gr.update(visible=True),  # show warning
                     gr.update(),
                     answers,
                     gr.update(visible=False),
                     gr.update(visible=True),
+                    gr.update(),  # keep SSD
+                    gr.update(),  # keep SSR
                 )
+            # store answers
+            answers[i] = {"ssd": ssd_answer, "ssr": ssr_answer}
+            true_index = session_sample_indices[i]
+            sample = dataset[current_split][true_index]
+            audio_id = sample["audio_id"]
+            trans_id = sample["transcription_id"]
+            interpretation = sample["intonation"]
+            user_id = session_id
+            user_name_str = user_name or "anonymous"
+            # ----- SSR values -----
+            user_ssr_answer = sample["possible_answers"].index(ssr_answer)
+            ssr_label = sample["label"]
+            # ----- SSD values -----
+            ssd_answer_str = ",".join(map(str, ssd_answer))
+            ssd_words_list = ssd_answer_str.split(",")
+            transcription_words = sample["transcription"].split()
+            user_ssd_answer = _stringify(
+                [
+                    1 if word in ssd_words_list else 0
+                    for _, word in enumerate(transcription_words)
+                ]
+            )
+            ssd_label = _stringify(
+                [
+                    1 if idx in sample["gt_stress_indices"] else 0
+                    for idx, _ in enumerate(sample["transcription"].split())
+                ]
+            )
+            # write to backend
+            updated = backend.update_row(
+                true_index,
+                audio_id,
+                user_id,
+                new_ssr_answer=ssr_answer,
+                new_user_ssr_answer=user_ssr_answer,
+                new_ssr_label=ssr_label,
+                new_ssd_answer=ssd_answer_str,
+                new_user_ssd_answer=user_ssd_answer,
+                new_ssd_label=ssd_label,
+            )
+            if not updated:
+                backend.add_row(
+                    true_index,
+                    audio_id,
+                    interpretation,
+                    trans_id,
+                    user_id,
+                    user_name_str,
+                    ssr_answer,
+                    user_ssr_answer,
+                    ssr_label,
+                    ssd_answer_str,
+                    user_ssd_answer,
+                    ssd_label,
                 )
+            # finished?
+            if i + 1 == len(session_sample_indices):
                 return (
+                    -1,
                     gr.update(visible=False),
                     gr.update(visible=False),
                     answers,
                     gr.update(visible=True),  # show final page
+                    gr.update(visible=False),  # hide prev
+                    gr.update(value=[]),  # clear SSD
+                    gr.update(value=None),  # clear SSR
                 )
+            # otherwise advance
+            new_i = i + 1
             return (
                 new_i,
                 gr.update(visible=False),
                 answers,
                 gr.update(visible=False),
                 gr.update(visible=True),
+                gr.update(value=[]),  # reset SSD
+                gr.update(value=None),  # reset SSR
             )
+        # === Event wiring ===================================================
         next_btn.click(
             update_next_index,
             [
                 i,
+                stress_checkbox,
+                radio,
                 answers_dict,
                 session_id,
                 session_sample_indices,
                 user_name,
+                current_split,
+            ],
+            [
+                i,
+                warning_msg,
+                next_btn,
+                answers_dict,
+                final_group,
+                prev_btn,
+                stress_checkbox,
+                radio,
             ],
         )
         prev_btn.click(update_prev_index, i, [i, warning_msg])
         i.change(
             update_ui,
+            [i, answers_dict, session_sample_indices, current_split],
             [
                 question_group,
                 sample_info,
+                stress_question_md,
                 audio_output,
+                stress_checkbox,
+                question_md,
                 radio,
                 selected_answer,
             ],
         )
+        radio.change(  # SSR radio
             answer_change_callback,
             [radio, i, answers_dict],
             [selected_answer, answers_dict],
         )
+        stress_checkbox.change(  # SSD checkbox
+            ssd_change_callback, [stress_checkbox, i, answers_dict], [answers_dict]
+        )
+        logged_in.change(
+            login_callback,
+            logged_in,
+            [app_group, next_btn, prev_btn, warning_msg],
+        )
+# def compute_random_sampled_accuracy(df, dataset, n_rounds=100, seed=42):
+#     rng = np.random.default_rng(seed)
+#     # Filter to interpretation_ids with at least 3 user answers
+#     counts = df.groupby("interpretation_id")["user_id"].nunique()
+#     eligible_ids = set(counts[counts >= 3].index)
+#     # Group answers by interpretation_id
+#     grouped = df[df["interpretation_id"].isin(eligible_ids)].groupby(
+#         "interpretation_id"
+#     )
+#     all_scores = []
+#     total_answered_per_round = []
+#     for _ in range(n_rounds):
+#         correct = 0
+#         total = 0
+#         for interp_id, group in grouped:
+#             if group.empty:
+#                 continue
+#             # Randomly pick one row
+#             row = group.sample(1, random_state=rng.integers(1e6)).iloc[0]
+#             answer = row["answer"]
+#             idx = int(row["index_in_dataset"])
+#             sample = dataset[idx]
+#             gt = sample["possible_answers"][sample["label"]]
+#             total += 1
+#             if answer == gt:
+#                 correct += 1
+#         if total > 0:
+#             all_scores.append(correct / total)
+#             total_answered_per_round.append(total)
+#     if all_scores:
+#         mean_acc = np.mean(all_scores)
+#         mean_total = int(np.mean(total_answered_per_round))
+#         std_acc = np.std(all_scores, ddof=1)  # sample std
+#         ci_95 = 1.96 * std_acc / np.sqrt(n_rounds)
+#         return mean_acc, std_acc, mean_total, ci_95
+#     return None, None, 0, None
+# def get_admin_tab():
+#     with gr.Tab("Admin Console"):
+#         admin_password = gr.Text(label="Enter Admin Password", type="password")
+#         check_btn = gr.Button("Enter")
+#         error_box = gr.Markdown("", visible=False)
+#         output_box = gr.Markdown("", visible=False)
+#         def calculate_majority_vote_accuracy(pw):
+#             if pw != configs.ADMIN_PASSWORD:
+#                 return gr.update(
+#                     visible=True, value="❌ Incorrect password."
+#                 ), gr.update(visible=False)
+#             df = backend.get_all_rows()
+#             if df.empty:
+#                 return gr.update(visible=True, value="No data available."), gr.update(
+#                     visible=False
+#                 )
+#             # Majority vote per interpretation_id
+#             majority_answers = {}
+#             for interp_id, group in df.groupby("interpretation_id"):
+#                 answer_counts = Counter(group["answer"])
+#                 if answer_counts:
+#                     majority_answers[interp_id] = answer_counts.most_common(1)[0][0]
+#             counts = df.groupby("interpretation_id")["user_id"].nunique().to_dict()
+#             total_answers = len(df)
+#             users_count = df["user_id"].nunique()
+#             stage_acc = {}
+#             stage_completes = {}
+#             stage_counts = {}
+#             stage_remaining = {}
+#             # global_correct = 0
+#             # global_total = 0
+#             for stage in ["stage1", "stage2", "stage3"]:
+#                 correct, total = 0, 0
+#                 complete = 0
+#                 for i in STAGE_SPLITS[stage]:
+#                     sample = dataset[i]
+#                     interp_id = sample["interpretation_id"]
+#                     label = sample["label"]
+#                     gt = sample["possible_answers"][label]
+#                     n = counts.get(interp_id, 0)
+#                     if n >= 3:
+#                         complete += 1
+#                     if interp_id in majority_answers:
+#                         pred = majority_answers[interp_id]
+#                         total += 1
+#                         if pred == gt:
+#                             correct += 1
+#                 stage_counts[stage] = len(STAGE_SPLITS[stage])
+#                 stage_completes[stage] = complete
+#                 stage_remaining[stage] = 3 * len(STAGE_SPLITS[stage]) - sum(
+#                     counts.get(dataset[i]["interpretation_id"], 0)
+#                     for i in STAGE_SPLITS[stage]
+#                 )
+#                 if complete == len(STAGE_SPLITS[stage]):
+#                     acc = correct / total if total > 0 else 0
+#                     stage_acc[stage] = (acc, correct, total)
+#                 else:
+#                     stage_acc[stage] = None  # not shown yet
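+#             # Determine active stage: the first stage that still has a sample whose interpretation_id was answered by fewer than 3 unique users (Stage 3 if none)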
+#             if stage_completes["stage1"] < stage_counts["stage1"]:
+#                 current_stage = "Stage 1"
+#             elif stage_completes["stage2"] < stage_counts["stage2"]:
+#                 current_stage = "Stage 2"
+#             else:
+#                 current_stage = "Stage 3"
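+#             # Majority Vote Accuracy Section (cumulative; the later lines reuse c1/t1 and c2/t2, which are only bound when the earlier stages are complete)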
+#             agg_lines = []
+#             if stage_acc["stage1"]:
+#                 acc1, c1, t1 = stage_acc["stage1"]
+#                 agg_lines.append(f"- **Stage 1:** {acc1:.2%} ({c1}/{t1})")
+#             if stage_acc["stage2"]:
+#                 acc2, c2, t2 = stage_acc["stage2"]
+#                 agg_lines.append(
+#                     f"- **Stage 1+2:** {(c1 + c2) / (t1 + t2):.2%} ({c1 + c2}/{t1 + t2})"
+#                 )
+#             if stage_acc["stage3"]:
+#                 acc3, c3, t3 = stage_acc["stage3"]
+#                 agg_lines.append(
+#                     f"- **All Stages:** {(c1 + c2 + c3) / (t1 + t2 + t3):.2%} ({c1 + c2 + c3}/{t1 + t2 + t3})"
+#                 )
+#             agg_msg = "\n".join(agg_lines) if agg_lines else "No completed stages yet."
+#             # Compute random-sampled accuracy
+#             n_rounds = 100
+#             rand_acc, rand_std, rand_total, rand_ci = compute_random_sampled_accuracy(
+#                 df, dataset, n_rounds=n_rounds
+#             )
+#             # Random-sampled Accuracy
+#             if rand_acc is not None:
+#                 rand_acc_msg = (
+#                     f"**Accuracy:** {rand_acc:.2%} ± {rand_ci:.2%} (95% CI)\n\n"
+#                     f"Standard deviation: {rand_std:.2%}\n\n"
+#                     f"Samples used: {rand_total} × {n_rounds} rounds"
+#                 )
+#             else:
+#                 rand_acc_msg = "Random sampling failed (no data)."
+#             correct = 0
+#             total = 0
+#             for _, row in df.iterrows():
+#                 idx = int(row["index_in_dataset"])
+#                 if idx >= len(dataset):
+#                     continue  # skip out-of-range
+#                 sample = dataset[idx]
+#                 gt_answer = sample["possible_answers"][sample["label"]]
+#                 if row["answer"] == gt_answer:
+#                     correct += 1
+#                 total += 1
+#             overall_acc = correct / total if total > 0 else None
+#             if overall_acc is not None:
+#                 overall_acc_msg = (
+#                     f"Overall Accuracy: {overall_acc:.2%} ({correct}/{total})"
+#                 )
+#             else:
+#                 overall_acc_msg = "No data available."
+#             # Final message (no indentation!)
+#             msg = f"""
+# ## ✅ Accuracy Summary
+# ### Overall Accuracy
+# {overall_acc_msg}
+# ---
+# ### Majority Vote
+# {agg_msg}
+# ---
+# ### Random-Sampled Accuracy
+# {rand_acc_msg}
+# ---
+# ## 📊 Answer Progress
+# - **Total answers submitted:** {total_answers}
+# - **Answers to go (global):** {3 * len(dataset) - total_answers}
+# - **Unique users:** {users_count}
+# ---
+# ## 🧱 Stage Breakdown
+# | Stage | Completed | Total | Remaining Answers |
+# |-------|-----------|--------|-------------------|
+# |  1    | {stage_completes['stage1']} / {stage_counts['stage1']} | {stage_counts['stage1']} | {stage_remaining['stage1']} |
+# |  2    | {stage_completes['stage2']} / {stage_counts['stage2']} | {stage_counts['stage2']} | {stage_remaining['stage2']} |
+# |  3    | {stage_completes['stage3']} / {stage_counts['stage3']} | {stage_counts['stage3']} | {stage_remaining['stage3']} |
+# **➡️ Current Active Stage:** {current_stage}
+# """
+#             return gr.update(visible=False), gr.update(visible=True, value=msg)
+#         check_btn.click(
+#             fn=calculate_majority_vote_accuracy,
+#             inputs=admin_password,
+#             outputs=[error_box, output_box],
+#         )
 # App UI: build the gr.Blocks app; only the evaluation tab is mounted.
 with gr.Blocks() as demo:
     human_eval_tab()
+    # get_admin_tab()  # admin console stays disabled; its definition above is commented out