Spaces:

jannisborn
/

NumberTokenLoss

Running

App Files Files Community

jannisborn committed 11 days ago

Commit

9914a10

unverified ·

1 Parent(s): ac2c591

wip

Browse files

Files changed (2) hide show

src/scenarios.py +60 -0
src/streamlit_app.py +337 -384

src/scenarios.py ADDED Viewed

	@@ -0,0 +1,60 @@

"""Predefined probability-distribution scenarios for the Number Token Loss demo.

Every scenario is a dict with the keys ``name``, ``values``, ``ground_truth``
and ``explanation``.  ``values`` holds 11 probabilities: indices 0-9 are the
digit tokens and index 10 is the "Text" token.
"""

import numpy as np

# (1) Dirac: a one-hot distribution whose mass moves from token 0 to token 10 ("Text").
dirac = [
    {
        "name": f"Dirac: all mass on token {i}",
        "values": [1.0 if j == i else 0.0 for j in range(11)],
        "ground_truth": "4",
        "explanation": "A Dirac distribution: all probability on a single token.",
    }
    for i in range(11)
]


# (2) Gaussian: peak_mass=0.6 on the center, the remaining 0.4 spread by a Gaussian.
def make_gauss_values(center, n=11, sigma=1.5, peak_mass=0.6):
    """Return a peaked distribution over ``n`` tokens as a list of floats.

    ``peak_mass`` is placed on index ``center``.  The remaining
    ``1 - peak_mass`` is shared among all other indices in proportion to an
    unnormalized Gaussian of width ``sigma`` centred on ``center``.

    If no other index receives any Gaussian weight (``n == 1``, or a ``sigma``
    so small that the kernel underflows to zero), the whole mass is put on
    ``center`` rather than dividing by zero and returning NaNs.

    Args:
        center: Index of the peak.
        n: Number of tokens.
        sigma: Standard deviation of the Gaussian used for the neighbours.
        peak_mass: Probability assigned to ``center``.

    Returns:
        A list of ``n`` floats that sums to 1.
    """
    xs = np.arange(n)
    # Unnormalized Gaussian weights.
    kernel = np.exp(-0.5 * ((xs - center) / sigma) ** 2)
    # Zero out the center so only the *other* weights get re-normalized.
    others = kernel.copy()
    others[center] = 0.0
    total = others.sum()
    if not total > 0.0:
        # Nothing to spread the residual mass over: fall back to a one-hot.
        vals = np.zeros(n)
        vals[center] = 1.0
        return vals.tolist()
    # Neighbours share (1 - peak_mass); the center gets peak_mass.
    vals = others / total * (1.0 - peak_mass)
    vals[center] = peak_mass
    return vals.tolist()


gauss = [
    {
        "name": f"Gaussian: center at token {c}",
        "values": make_gauss_values(c),
        "ground_truth": "4",
        "explanation": "Gaussian-style: 0.6 mass at the highlighted token, 0.4 spread smoothly to its neighbors.",
    }
    for c in range(11)
]


# (3) Bimodal: two spikes of 0.5 mass each, offset by +/- d from the ground truth (4).
def make_bimodal_values(offset, n=11, gt=4):
    """Return a two-point distribution with 0.5 mass at ``gt - offset`` and ``gt + offset``.

    Both peak positions are clamped to ``[0, n - 1]``.  The masses are
    accumulated, so when the two peaks land on the same index (``offset == 0``)
    that index carries the full 1.0 and the result is still a valid
    distribution.

    Args:
        offset: Distance of each peak from ``gt``.
        n: Number of tokens.
        gt: Index the peaks are placed around.

    Returns:
        A list of ``n`` floats that sums to 1.
    """
    left = max(0, gt - offset)
    right = min(n - 1, gt + offset)
    vals = [0.0] * n
    # ``+=`` instead of ``=`` so coinciding peaks add up instead of overwriting.
    vals[left] += 0.5
    vals[right] += 0.5
    return vals


bimodal = [
    {
        # Same clamping as make_bimodal_values with its defaults (n=11, gt=4).
        "name": f"Bimodal: peaks at tokens {max(0, 4 - d)} & {min(10, 4 + d)}",
        "values": make_bimodal_values(d),
        "ground_truth": "4",
        "explanation": "Two-point (bimodal) distribution: equal 0.5 mass on each peak, which move ±offset from the ground truth.",
    }
    for d in range(11)
]

src/streamlit_app.py CHANGED Viewed

@@ -1,392 +1,163 @@
 import altair as alt
 import pandas as pd
 import streamlit_vertical_slider as svs
 import torch
-# from streamlit_vertical_slider import vertical_slider # Not directly used, svs.vertical_slider is
-import streamlit as st
-import time
-import plotly.graph_objects as go  # Add Plotly import
 # Define options globally as it's used in initialization and UI
 options = [str(i) for i in range(10)] + ["Text"]
 # --- Session State Initialization ---
 # Ensure all session state variables are initialized before first use, especially by widgets.
-if 'running_demo' not in st.session_state:
     st.session_state.running_demo = False
-if 'demo_step' not in st.session_state:
     st.session_state.demo_step = 0
-if 'last_update_time' not in st.session_state:
     st.session_state.last_update_time = 0
-if 'loss_container' not in st.session_state:
     st.session_state.loss_container = None
-if 'previous_chart_html' not in st.session_state:
     st.session_state.previous_chart_html = ""
 # Initialize states for sliders and ground_truth selector
 # Using len(options) to correctly size for 0-9 + "Text"
 for i in range(len(options)):
     if f"slider_{i}" not in st.session_state:
         st.session_state[f"slider_{i}"] = 1.0 / len(options)
-if 'ground_truth' not in st.session_state:
-    st.session_state['ground_truth'] = options[0] # Default to "0"
 st.title("Number Token Loss - Demo")
-st.markdown("""
-Adjust the sliders to set a predicted probability for each token (0-9 and "Text").
-The sliders are vertical and compact. The app normalizes the slider values
-to form a valid probability distribution, visualizes it, and computes the corresponding
-Cross Entropy, NTL-MSE, and NTL-WAS losses.
-""")
-# --- Scenario Definitions ---
-scenarios = [
-    {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "4",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "5",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "6",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "7",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "8",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "9",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "4",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around ground truth (5)",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "5",
-        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "6",
-        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "7",
-        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "8",
-        "explanation": "Cross Entropy is high, NTL is higher but still penalizes less than CE because distribution knows it's a number."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "9",
-        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "4",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "5",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "6",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "7",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "8",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "9",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "4",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "5",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "6",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "7",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "8",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "9",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Almost correct (1 vs 2)",
-        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
-    },
-    {
-        "name": "Almost correct (1 vs 2)",
-        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
-    },
-    {
-        "name": "Almost correct (1 vs 2)",
-        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
-    },
-    {
-        "name": "Almost correct (1 vs 2)",
-        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
-    }
-]
-# --- Helper Functions ---
 def apply_scenario(step_idx):
-    scenario = scenarios[step_idx]
-    # These assignments modify session state. They must be done *before* the widgets
-    # are rendered in the script run that should display these new values.
     for i, val in enumerate(scenario["values"]):
         st.session_state[f"slider_{i}"] = val
-    st.session_state['ground_truth'] = scenario["ground_truth"]
-def start_demo():
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
-    apply_scenario(0) # Apply the first scenario's state
-    # The button click that calls start_demo() will itself cause a rerun.
 def stop_demo():
     st.session_state.running_demo = False
 # --- Demo State Advancement Logic ---
 # This block handles advancing the demo. If it advances, it updates session state
 # and then reruns. This ensures widgets are drawn with the new state in the next run.
 if st.session_state.running_demo:
     current_time = time.time()
-    if current_time - st.session_state.last_update_time > 3.0:  # 3 seconds per scenario
-        next_step = (st.session_state.demo_step + 1) % len(scenarios)
         st.session_state.demo_step = next_step
         apply_scenario(next_step)  # Update session state for the new scenario
-        st.session_state.last_update_time = time.time() # Reset timer
         st.rerun()  # Crucial: Rerun to reflect changes in widgets and charts
 # --- UI Rendering ---
 # This section renders the main UI. It executes after any potential rerun from the block above.
 if st.session_state.running_demo:
-    st.info(f"Showing scenario {st.session_state.demo_step + 1}/{len(scenarios)}: {scenarios[st.session_state.demo_step]['name']}")
-    st.markdown(f"**Explanation:** {scenarios[st.session_state.demo_step]['explanation']}")
     if st.button("Stop Demo"):
-        stop_demo()
         st.rerun()
-else: # Not st.session_state.running_demo
-    if st.button("Start Automated Demo"):
-        start_demo() # This calls apply_scenario(0)
-        st.rerun()   # Rerun to enter demo mode and draw scenario 0 correctly
-# Sliders and Ground Truth Selector
-# These widgets will read their initial values from st.session_state.
-# User interactions will update st.session_state directly due to their keys.
-if not st.session_state.running_demo:
-    st.markdown("#### Predicted Token Probabilities")
-    cols = st.columns(len(options))
-    for i, col in enumerate(cols):
-        label = options[i] # Use token name directly for label
-        with col:
-            svs.vertical_slider(
-                label=label, min_value=0.0, max_value=1.0, step=0.01, height=50,
-                key=f"slider_{i}", # This key links the widget to st.session_state[f"slider_{i}"]
-                slider_color="green", track_color="lightgray", thumb_color="black"
-            )
-# Ground truth selectbox
-st.selectbox(
-    "Ground Truth Token", options=options,
-    index=options.index(st.session_state['ground_truth']), # Display value from session state
-    key='ground_truth' # Links widget to st.session_state['ground_truth']
-)
 # Placeholder for charts and loss calculations that will be updated
 # This section always reads the current st.session_state to generate its content.
-current_prob_values_from_state = [st.session_state.get(f"slider_{j}", 1.0/len(options)) for j in range(len(options))]
 total_from_state = sum(current_prob_values_from_state)
 probs_for_charts = (
     torch.ones(len(options)) / len(options)
@@ -394,63 +165,144 @@ probs_for_charts = (
     else torch.tensor([v / total_from_state for v in current_prob_values_from_state])
 )
-gt_choice_for_charts = st.session_state.get('ground_truth', options[0])
 if gt_choice_for_charts == "Text":
-    gt_index_for_charts = 10 # Assuming "Text" is the 11th item (index 10)
     gt_numeric_for_charts = None
 else:
     gt_index_for_charts = int(gt_choice_for_charts)
     gt_numeric_for_charts = gt_index_for_charts
-st.markdown("#### Input Probability Distribution")
-df_dist = pd.DataFrame({"token": options, "probability": probs_for_charts.numpy()})
-df_dist["type"] = ["Ground Truth" if token == gt_choice_for_charts else "Prediction" for token in options]
-chart = (
-    alt.Chart(df_dist).mark_bar().encode(
-        x=alt.X("token:N", title="Token", sort=options), # Ensure consistent sort order
-        y=alt.Y("probability:Q", title="Probability", scale=alt.Scale(domain=[0, 1])),
-        color=alt.Color("type:N", scale=alt.Scale(domain=["Ground Truth", "Prediction"], range=["green", "steelblue"]), legend=alt.Legend(title="Token Type"))
-    ).properties(height=300)
 )
-st.altair_chart(chart, use_container_width=True)
 ce_loss = -torch.log(torch.clamp(probs_for_charts[gt_index_for_charts], min=1e-9))
-if gt_numeric_for_charts is None: # Text token
-    ntl_mse_loss = torch.tensor(float('nan')) # MSE not applicable for text
-    ntl_was_loss = torch.tensor(float('nan')) # WAS not applicable for text
-else: # Numeric token
-    numeric_probs_for_loss = probs_for_charts[:10] # Probabilities for 0-9
     # Ensure numeric_probs_for_loss sums to 1 for NTL calculations if it's a subset
     numeric_probs_sum = torch.sum(numeric_probs_for_loss)
-    if numeric_probs_sum > 1e-6 : # Avoid division by zero
-            normalized_numeric_probs = numeric_probs_for_loss / numeric_probs_sum
     else:
-            normalized_numeric_probs = torch.zeros_like(numeric_probs_for_loss)
     loss_values_tensor = torch.arange(0, 10, dtype=torch.float32)
     # Use normalized probabilities for NTL if only considering numeric tokens
-    if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6 :
-        pred_value = torch.sum( (probs_for_charts[:10]/torch.sum(probs_for_charts[:10])) * loss_values_tensor)
-    elif gt_choice_for_charts != "Text": # if sum is zero, pred_value is ill-defined or 0
-            pred_value = torch.tensor(0.0)
-    else: # Should not happen if gt_numeric_for_charts is not None
-        pred_value = torch.tensor(float('nan'))
     if not torch.isnan(pred_value):
-        ntl_mse_loss = (pred_value - float(gt_numeric_for_charts)) ** 2
         abs_diff = torch.abs(loss_values_tensor - float(gt_numeric_for_charts))
         if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6:
-                ntl_was_loss = torch.sum((probs_for_charts[:10]/torch.sum(probs_for_charts[:10])) * abs_diff)
         elif gt_choice_for_charts != "Text":
-                ntl_was_loss = torch.tensor(0.0) # Or some other default if all numeric probs are zero
         else:
-                ntl_was_loss = torch.tensor(float('nan'))
     else:
-        ntl_mse_loss = torch.tensor(float('nan'))
-        ntl_was_loss = torch.tensor(float('nan'))
 ce_val = round(ce_loss.item(), 3)
@@ -458,6 +310,38 @@ mse_val = round(ntl_mse_loss.item(), 3) if not torch.isnan(ntl_mse_loss) else "N
 was_val = round(ntl_was_loss.item(), 3) if not torch.isnan(ntl_was_loss) else "N/A"
 loss_data = {"Loss": ["Cross Entropy"], "Value": [ce_val]}
 if was_val != "N/A":
     loss_data["Loss"].append("NTL-WAS")
@@ -469,34 +353,103 @@ if mse_val != "N/A":
 loss_df = pd.DataFrame(loss_data)
 # ============== Chart Display ==============
 # Create a single chart for loss visualization
 st.subheader("Loss Comparison")
 # Create an Altair chart that will look good and redraw cleanly
-chart = alt.Chart(loss_df).mark_bar().encode(
-    x=alt.X('Loss:N', sort=loss_df["Loss"].tolist()),
-    y=alt.Y('Value:Q', scale=alt.Scale(domain=[0, max(loss_df["Value"].max() * 1.2, 20 if st.session_state.running_demo else 0.5)])),
-    color=alt.Color('Loss:N', scale=alt.Scale(
-        domain=['Cross Entropy', 'NTL-WAS', 'NTL-MSE'],
-        range=['steelblue', 'red', 'forestgreen']
-    )),
-    tooltip=['Loss', 'Value']
-).properties(
-    height=300
 )
 # Add value labels on top of bars
-text = chart.mark_text(
-    align='center',
-    baseline='bottom',
-    dy=-5,
-    fontSize=14
-).encode(
-    text=alt.Text('Value:Q', format='.3f')
 )
 # Combine chart and text
-final_chart = (chart + text)
 # Display chart with the full container width
 st.altair_chart(final_chart, use_container_width=True)
@@ -507,7 +460,7 @@ st.altair_chart(final_chart, use_container_width=True)
 if st.session_state.running_demo:
     # This check is implicitly: if we are here and demo is running, it means
     # the time-based advance condition was NOT met in the block at the top.
-    time.sleep(0.1) # Adjusted from 0.2 to 0.5 (or try 1.0)
     st.rerun()
 # Add explanation of the demonstration

+import time
 import altair as alt
+import numpy as np
 import pandas as pd
+import streamlit as st
 import streamlit_vertical_slider as svs
 import torch
+from scenarios import bimodal, dirac, gauss
+DEMO_INTERVAL = 1.5
+NTL_MSE_SCALING = 0.5
+MAX_LOSS_PLOT = 15
+LAST_STEP = -1
+# """TODO:
+# - Remove flickering of loss evolution scenario plot (lower ylim?)
+# - Move manual part down (predicted token probabilities)
+# - Allow to set GT token for each demo
+# - Add text token to loss evolution barplot
+# - pick good default (4?)
+# """
 # Define options globally as it's used in initialization and UI
 options = [str(i) for i in range(10)] + ["Text"]
 # --- Session State Initialization ---
 # Ensure all session state variables are initialized before first use, especially by widgets.
+if "running_demo" not in st.session_state:
     st.session_state.running_demo = False
+if "demo_step" not in st.session_state:
     st.session_state.demo_step = 0
+if "last_update_time" not in st.session_state:
     st.session_state.last_update_time = 0
+if "loss_container" not in st.session_state:
     st.session_state.loss_container = None
+if "previous_chart_html" not in st.session_state:
     st.session_state.previous_chart_html = ""
+if "active_scenarios" not in st.session_state:
+    # default if you want one to load on first show
+    st.session_state.active_scenarios = dirac
+if "loss_history" not in st.session_state:
+    st.session_state.loss_history = []
 # Initialize states for sliders and ground_truth selector
 # Using len(options) to correctly size for 0-9 + "Text"
 for i in range(len(options)):
     if f"slider_{i}" not in st.session_state:
         st.session_state[f"slider_{i}"] = 1.0 / len(options)
+if "ground_truth" not in st.session_state:
+    st.session_state["ground_truth"] = options[0]  # Default to "0"
 st.title("Number Token Loss - Demo")
+st.markdown(
+    """
+    **Instructions**
+    1. **Pick a ground truth token (0–9).**
+    2. **Select one of the three automated demos:**
+    - **Dirac**: a one-hot (Dirac) distribution whose single 1.0 mass moves from token 0 all the way to “Text.”
+    - **Gaussian**: a peaked Gaussian (0.6 mass at center, 0.4 spread) that slides its center from token 0 to “Text.”
+    - **Bimodal**: two equal peaks (0.5 each) that start at (0,8) and then move symmetrically away from the GT token.
+    """
+)
+if "ground_truth" not in st.session_state:
+    st.session_state["ground_truth"] = "4"
+gt = st.selectbox(
+    "Ground Truth Token",
+    options=options,
+    index=options.index(st.session_state["ground_truth"]),
+    key="ground_truth",
+)
 def apply_scenario(step_idx):  # Copy scenario `step_idx` of the active demo into the slider session state.
+    scenario = st.session_state.active_scenarios[step_idx]  # scenario list chosen by the start_*_demo helpers
     for i, val in enumerate(scenario["values"]):  # i-th value goes to the key "slider_<i>"
         st.session_state[f"slider_{i}"] = val
+def start_dirac_demo():  # Start the automated demo using the `dirac` scenario list.
+    st.session_state.active_scenarios = dirac
+    st.session_state.running_demo = True
+    st.session_state.demo_step = 0  # restart from the first scenario
+    st.session_state.last_update_time = time.time()  # reference point for the DEMO_INTERVAL check
+    apply_scenario(0)  # load the first scenario's values right away
+def start_gauss_demo():  # Start the automated demo using the `gauss` scenario list.
+    st.session_state.active_scenarios = gauss
+    st.session_state.running_demo = True
+    st.session_state.demo_step = 0  # restart from the first scenario
+    st.session_state.last_update_time = time.time()  # reference point for the DEMO_INTERVAL check
+    apply_scenario(0)  # load the first scenario's values right away
+def start_bimodal_demo():  # Start the automated demo using the `bimodal` scenario list.
+    st.session_state.active_scenarios = bimodal  # NOTE(review): only this line differs from the other start_*_demo helpers
     st.session_state.running_demo = True
     st.session_state.demo_step = 0  # restart from the first scenario
     st.session_state.last_update_time = time.time()  # reference point for the DEMO_INTERVAL check
+    apply_scenario(0)  # load the first scenario's values right away
 def stop_demo():  # Leave demo mode by clearing the running flag.
     st.session_state.running_demo = False  # NOTE(review): the "Stop Demo" button shown in this diff sets the flag directly; this helper may be unused - confirm
 # --- Demo State Advancement Logic ---
 # This block handles advancing the demo. If it advances, it updates session state
 # and then reruns. This ensures widgets are drawn with the new state in the next run.
 if st.session_state.running_demo:
+    scenario = st.session_state.active_scenarios
     current_time = time.time()
+    if current_time - st.session_state.last_update_time > DEMO_INTERVAL:
+        next_step = (st.session_state.demo_step + 1) % len(scenario)
         st.session_state.demo_step = next_step
         apply_scenario(next_step)  # Update session state for the new scenario
+        st.session_state.last_update_time = time.time()  # Reset timer
         st.rerun()  # Crucial: Rerun to reflect changes in widgets and charts
 # --- UI Rendering ---
 # This section renders the main UI. It executes after any potential rerun from the block above.
 if st.session_state.running_demo:
+    st.info(
+        f"Showing scenario {st.session_state.demo_step + 1}"
+        f"/{len(st.session_state.active_scenarios)}: "
+        f"{st.session_state.active_scenarios[st.session_state.demo_step]['name']}"
+    )
     if st.button("Stop Demo"):
+        st.session_state.running_demo = False
         st.rerun()
+else:
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        if st.button("Run: Dirac"):
+            start_dirac_demo()
+            st.rerun()
+    with col2:
+        if st.button("Run: Gauss"):
+            start_gauss_demo()
+            st.rerun()
+    with col3:
+        if st.button("Run: Bimodal"):
+            start_bimodal_demo()
+            st.rerun()
 # Placeholder for charts and loss calculations that will be updated
 # This section always reads the current st.session_state to generate its content.
+current_prob_values_from_state = [
+    st.session_state.get(f"slider_{j}", 1.0 / len(options)) for j in range(len(options))
+]
 total_from_state = sum(current_prob_values_from_state)
 probs_for_charts = (
     torch.ones(len(options)) / len(options)
     else torch.tensor([v / total_from_state for v in current_prob_values_from_state])
 )
+gt_choice_for_charts = st.session_state.get("ground_truth", options[0])
 if gt_choice_for_charts == "Text":
+    gt_index_for_charts = 10  # Assuming "Text" is the 11th item (index 10)
     gt_numeric_for_charts = None
 else:
     gt_index_for_charts = int(gt_choice_for_charts)
     gt_numeric_for_charts = gt_index_for_charts
+gt = st.session_state["ground_truth"]
+st.markdown(f"#### Predicted Probability Distribution — Ground truth token {gt}")
+df_dist = pd.DataFrame(
+    {"token": options, "probability": probs_for_charts.numpy().round(2)}
+)
+df_dist["type"] = [
+    "Ground Truth" if token == gt_choice_for_charts else "Prediction"
+    for token in options
+]
+bg = (
+    alt.Chart(pd.DataFrame({"token": [gt]}))
+    .mark_bar(size=40, color="lightgray", opacity=0.4)
+    .encode(
+        x=alt.X("token:N", sort=options),
+        x2=alt.X2("token:N"),  # pin the right edge to the same category
+        y=alt.value(0),  # bottom at y=0
+        y2=alt.value(1),  # top at y=1 (full height)
+    )
+)
 # Main layer: one bar per token showing its probability from df_dist, colored by
 # the row's "type" column ("Ground Truth" vs "Prediction").
+bars = (
+    alt.Chart(df_dist)
+    .mark_bar()
+    .encode(
+        x=alt.X(
+            "token:N",
+            title="Token",
             # Keep the tokens in the order given by `options`.
+            sort=options,
+            axis=alt.Axis(labelAngle=0, labelFontSize=14, titleFontSize=16),
+        ),
+        y=alt.Y(
+            "probability:Q",
+            title="Probability",
             # Pin the y-axis to [0, 1] rather than letting it fit the data.
+            scale=alt.Scale(domain=[0, 1]),
+            axis=alt.Axis(format=".2f", labelFontSize=14, titleFontSize=16),
+        ),
+        color=alt.Color(
+            "type:N",
             # Explicit mapping: "Ground Truth" -> green, "Prediction" -> steelblue.
+            scale=alt.Scale(
+                domain=["Ground Truth", "Prediction"], range=["green", "steelblue"]
+            ),
+            legend=alt.Legend(title="Token Type", titleFontSize=16, labelFontSize=14),
+        ),
+        tooltip=[
+            alt.Tooltip("token:N", title="Token"),
+            alt.Tooltip("probability:Q", title="Probability", format=".2f"),
+            alt.Tooltip("type:N", title="Type"),
+        ],
+    )
+    .properties(height=300)
+)
 # Two-line green label anchored at the ground-truth token's x position.
 # first line: "⬇ Ground"
+annot1 = (
+    alt.Chart(pd.DataFrame({"token": [gt]}))
+    .mark_text(
+        text="⬇ Ground",
+        dy=-25,  # shifted 25px up from the anchor, so it sits above the second line
+        dx=25,
+        fontSize=14,
+        fontWeight="bold",
+        color="green",
+    )
+    .encode(x=alt.X("token:N", sort=options), y=alt.value(1))
+)
+# second line: "truth=<gt>", with the current ground-truth token filled in
+annot2 = (
+    alt.Chart(pd.DataFrame({"token": [gt]}))
+    .mark_text(
+        text=f"truth={gt}",
+        dy=-10,  # shifted 10px up from the anchor, so it sits below line 1
+        dx=35,
+        fontSize=14,
+        fontWeight="bold",
+        color="green",
+    )
+    .encode(x=alt.X("token:N", sort=options), y=alt.value(1))
 )
+# Layer the pieces in drawing order (background, bars, then the two annotation
+# lines) and render the result at the full container width.
+final_chart = (bg + bars + annot1 + annot2).properties(height=300)
+st.altair_chart(final_chart, use_container_width=True)
 ce_loss = -torch.log(torch.clamp(probs_for_charts[gt_index_for_charts], min=1e-9))
+if gt_numeric_for_charts is None:  # Text token
+    ntl_mse_loss = torch.tensor(float("nan"))  # MSE not applicable for text
+    ntl_was_loss = torch.tensor(float("nan"))  # WAS not applicable for text
+else:  # Numeric token
+    numeric_probs_for_loss = probs_for_charts[:10]  # Probabilities for 0-9
     # Ensure numeric_probs_for_loss sums to 1 for NTL calculations if it's a subset
     numeric_probs_sum = torch.sum(numeric_probs_for_loss)
+    if numeric_probs_sum > 1e-6:  # Avoid division by zero
+        normalized_numeric_probs = numeric_probs_for_loss / numeric_probs_sum
     else:
+        normalized_numeric_probs = torch.zeros_like(numeric_probs_for_loss)
     loss_values_tensor = torch.arange(0, 10, dtype=torch.float32)
     # Use normalized probabilities for NTL if only considering numeric tokens
+    if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6:
+        pred_value = torch.sum(
+            (probs_for_charts[:10] / torch.sum(probs_for_charts[:10]))
+            * loss_values_tensor
+        )
+    elif (
+        gt_choice_for_charts != "Text"
+    ):  # if sum is zero, pred_value is ill-defined or 0
+        pred_value = torch.tensor(0.0)
+    else:  # Should not happen if gt_numeric_for_charts is not None
+        pred_value = torch.tensor(float("nan"))
     if not torch.isnan(pred_value):
+        ntl_mse_loss = ntl_mse_loss = (
+            NTL_MSE_SCALING * (pred_value - float(gt_numeric_for_charts)) ** 2
+        )
         abs_diff = torch.abs(loss_values_tensor - float(gt_numeric_for_charts))
         if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6:
+            ntl_was_loss = torch.sum(
+                (probs_for_charts[:10] / torch.sum(probs_for_charts[:10])) * abs_diff
+            )
         elif gt_choice_for_charts != "Text":
+            ntl_was_loss = torch.tensor(0.0)
         else:
+            ntl_was_loss = torch.tensor(float("nan"))
     else:
+        ntl_mse_loss = torch.tensor(float("nan"))
+        ntl_was_loss = torch.tensor(float("nan"))
 # Round the losses for display; a NaN loss is represented by the string "N/A".
 # NOTE(review): mse_val is used below but its assignment is not visible in this
 # diff view — presumably it is computed like was_val; confirm in the full file.
 ce_val = round(ce_loss.item(), 3)
 was_val = round(ntl_was_loss.item(), 3) if not torch.isnan(ntl_was_loss) else "N/A"
 # Append one history entry per demo step: nothing is added once the history
 # already holds demo_step + 1 entries.
+if len(st.session_state.loss_history) < st.session_state.demo_step + 1:
+    st.session_state.loss_history.append(
+        {
             # Index of the largest value in the current scenario's distribution.
+            "token_index": np.argmax(
+                st.session_state.active_scenarios[st.session_state["demo_step"]][
+                    "values"
+                ]
+            ),
+            # NOTE(review): np.argmax yields a NumPy integer rather than a Python int;
+            # wrap the call in int(...) if a plain int is needed downstream.
+            "CE": ce_val,
             # "N/A" placeholders are stored as None.
+            "NTL-MSE": mse_val if mse_val != "N/A" else None,
+            "NTL-WAS": was_val if was_val != "N/A" else None,
+        }
+    )
+    last_step = st.session_state.demo_step
+if st.session_state.loss_history:
+    loss_plot_data = []
+    for entry in st.session_state.loss_history:
+        for loss_type in ["CE", "NTL-MSE", "NTL-WAS"]:
+            if entry[loss_type] is not None:
+                loss_plot_data.append(
+                    {
+                        "Token Index": entry["token_index"],
+                        "Loss Type": loss_type,
+                        "Loss Value": entry[loss_type],  # TODO: clip to MAX_LOSS_PLOT?
+                    }
+                )
+    df_loss_plot = pd.DataFrame(loss_plot_data)
 # Table of the current loss values for the "Loss Comparison" bar chart; it always
 # starts with the cross-entropy row.
 loss_data = {"Loss": ["Cross Entropy"], "Value": [ce_val]}
 # NOTE(review): as shown in this diff view only the "Loss" list grows here; the
 # matching "Value" append (and any NTL-MSE row) is not visible — confirm in the
 # full file that both lists stay the same length before the DataFrame is built.
 if was_val != "N/A":
     loss_data["Loss"].append("NTL-WAS")
 loss_df = pd.DataFrame(loss_data)
 # ============== Chart Display ==============
 # Grouped bar chart of the recorded losses: one group per token index, one bar
 # per loss type.
 # NOTE(review): df_loss_plot is only assigned when st.session_state.loss_history
 # is non-empty — confirm that is always the case by the time this runs.
+st.subheader("Loss Evolution Over Scenarios")
 # Explicit x-axis categories 0..9.
 # NOTE(review): the recorded token index is an argmax over scenario values, and
 # the scenario definitions use 11 entries, so an index of 10 would fall outside
 # this domain — confirm that excluding it is intentional.
+x_domain = list(range(10))
+grouped_chart = (
+    alt.Chart(df_loss_plot)
+    .mark_bar()
+    .encode(
+        x=alt.X(
+            "Token Index:O",
+            title="Predicted Token Index",
+            axis=alt.Axis(labelAngle=0),
+            scale=alt.Scale(domain=x_domain),
+        ),
         # Fixed y-range [0, MAX_LOSS_PLOT].
+        y=alt.Y(
+            "Loss Value:Q", title="Loss", scale=alt.Scale(domain=[0, MAX_LOSS_PLOT])
+        ),
+        color=alt.Color("Loss Type:N", legend=alt.Legend(title="Loss")),
+        xOffset="Loss Type:N",  # <== this causes the grouping instead of stacking
+    )
+    .properties(height=300)
+)
+st.altair_chart(grouped_chart, use_container_width=True)
 # Create a single chart for loss visualization
 st.subheader("Loss Comparison")
 # NOTE(review): this instruction text describes the sliders, yet it is emitted
 # directly under the "Loss Comparison" heading — confirm the intended placement.
+st.markdown("""
+Adjust the sliders to set a predicted probability for each token (0-9 and "Text").
+The sliders are vertical and compact. The app normalizes the slider values
+to form a valid probability distribution, visualizes it, and computes the corresponding
+Cross Entropy, NTL-MSE, and NTL-WAS losses.
+""")
 # Create an Altair chart that will look good and redraw cleanly
+chart = (
+    alt.Chart(loss_df)
+    .mark_bar()
+    .encode(
+        x=alt.X("Loss:N", sort=loss_df["Loss"].tolist()),
+        y=alt.Y(
+            "Value:Q",
+            scale=alt.Scale(
+                domain=[
+                    0,
+                    max(
+                        loss_df["Value"].max() * 1.2,
+                        20 if st.session_state.running_demo else 0.5,
+                    ),
+                ]
+            ),
+        ),
+        color=alt.Color(
+            "Loss:N",
+            scale=alt.Scale(
+                domain=["Cross Entropy", "NTL-WAS", "NTL-MSE"],
+                range=["steelblue", "red", "forestgreen"],
+            ),
+        ),
+        tooltip=["Loss", "Value"],
+    )
+    .properties(height=300)
 )
+# Sliders and Ground Truth Selector
+# These widgets will read their initial values from st.session_state.
+# User interactions will update st.session_state directly due to their keys.
+if not st.session_state.running_demo:
+    st.markdown("#### Predicted Token Probabilities")
+    cols = st.columns(len(options))
+    for i, col in enumerate(cols):
+        label = options[i]  # Use token name directly for label
+        with col:
+            svs.vertical_slider(
+                label=label,
+                min_value=0.0,
+                max_value=1.0,
+                step=0.01,
+                height=50,
+                key=f"slider_{i}",  # This key links the widget to st.session_state[f"slider_{i}"]
+                slider_color="green",
+                track_color="lightgray",
+                thumb_color="black",
+            )
 # Add value labels on top of bars
+text = chart.mark_text(align="center", baseline="bottom", dy=-5, fontSize=14).encode(
+    text=alt.Text("Value:Q", format=".3f")
 )
 # Combine chart and text
+final_chart = chart + text
 # Display chart with the full container width
 st.altair_chart(final_chart, use_container_width=True)
 # While the demo is running, pause briefly and then trigger another script run.
 if st.session_state.running_demo:
     # This check is implicitly: if we are here and demo is running, it means
     # the time-based advance condition was NOT met in the block at the top.
+    time.sleep(0.1)
     st.rerun()
 # Add explanation of the demonstration