drop down
Browse files
app.py
CHANGED
@@ -108,14 +108,12 @@ def create_visualization(blt_ps, d_model_slider, local_n_layers_slider):
|
|
108 |
|
109 |
# Local Component Transformer Part: 12 * N_local * D_local^2 + 2 * V_local * D_local
|
110 |
blt_local_transformer_params = (12 * local_n_layers_slider * local_d_model**2) + \
|
111 |
-
|
112 |
|
113 |
# Local Component Cross-Attention Part: N_local * 4 * D_local^2 (estimated)
|
114 |
blt_local_ca_params = local_n_layers_slider * 4 * local_d_model**2
|
115 |
blt_local_total_internal_params = blt_local_transformer_params + blt_local_ca_params
|
116 |
|
117 |
-
# blt_total_model_params = blt_global_internal_params + blt_local_total_internal_params # Kept for potential other uses, not directly for this annotation
|
118 |
-
|
119 |
bpe_params_str = format_params_display(bpe_model_params)
|
120 |
|
121 |
# Format BLT global and local parameters separately
|
@@ -283,7 +281,7 @@ with gr.Blocks(title="BLT vs BPE FLOPs Comparison") as demo:
|
|
283 |
minimum=2,
|
284 |
maximum=24, # Max value for local_n_layers
|
285 |
value=10,
|
286 |
-
step=2,
|
287 |
label="Local Model Layers (local_n_layers)",
|
288 |
info="Number of layers in the BLT's local model"
|
289 |
)
|
@@ -300,21 +298,24 @@ with gr.Blocks(title="BLT vs BPE FLOPs Comparison") as demo:
|
|
300 |
fully compare BPE-based transformers and BLT, you'll need to investigate those
|
301 |
claims in the paper itself.
|
302 |
""")
|
303 |
-
gr.Markdown("### Fixed Parameters")
|
304 |
-
gr.Markdown(f"""
|
305 |
-
- **BPE's bytes per token (bpe_ps)**: {bpe_ps}
|
306 |
-
- **BPE/BLT Global - Num Layers (n_layers)**: {n_layers}
|
307 |
-
- **BPE/BLT Global - Num Heads (n_heads)**: {n_heads}
|
308 |
-
- **BPE - Vocabulary Size (n_vocab)**: {n_vocab:,}
|
309 |
-
- **BPE/BLT - Context Length (n_ctx_base)**: {n_ctx_base:,} bytes
|
310 |
-
- **BLT Local - Model Dimension (local_d_model)**: {local_d_model}
|
311 |
-
- **BLT Local - Num Heads (local_n_heads)**: {local_n_heads}
|
312 |
-
- **BLT Local - Vocabulary Size (local_n_vocab)**: {local_n_vocab}
|
313 |
-
- **BLT Local - FF Multiplier (local_d_ff_multiplier)**: {local_d_ff_multiplier}
|
314 |
-
""")
|
315 |
|
316 |
-
|
317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
|
319 |
with gr.Column(scale=2):
|
320 |
plot = gr.Plot(label="FLOPs Comparison & Model Parameters")
|
@@ -328,7 +329,7 @@ with gr.Blocks(title="BLT vs BPE FLOPs Comparison") as demo:
|
|
328 |
bpe_model_p = (12 * n_layers * d_model_val**2) + (2 * n_vocab * d_model_val)
|
329 |
blt_global_p = 12 * n_layers * d_model_val**2
|
330 |
blt_local_transformer_p = (12 * local_n_layers_val * local_d_model**2) + \
|
331 |
-
|
332 |
blt_local_ca_p = local_n_layers_val * 4 * local_d_model**2
|
333 |
blt_local_total_internal_p = blt_local_transformer_p + blt_local_ca_p
|
334 |
blt_total_model_p = blt_global_p + blt_local_total_internal_p
|
|
|
108 |
|
109 |
# Local Component Transformer Part: 12 * N_local * D_local^2 + 2 * V_local * D_local
|
110 |
blt_local_transformer_params = (12 * local_n_layers_slider * local_d_model**2) + \
|
111 |
+
(2 * local_n_vocab * local_d_model)
|
112 |
|
113 |
# Local Component Cross-Attention Part: N_local * 4 * D_local^2 (estimated)
|
114 |
blt_local_ca_params = local_n_layers_slider * 4 * local_d_model**2
|
115 |
blt_local_total_internal_params = blt_local_transformer_params + blt_local_ca_params
|
116 |
|
|
|
|
|
117 |
bpe_params_str = format_params_display(bpe_model_params)
|
118 |
|
119 |
# Format BLT global and local parameters separately
|
|
|
281 |
minimum=2,
|
282 |
maximum=24, # Max value for local_n_layers
|
283 |
value=10,
|
284 |
+
step=2, # Ensure even numbers for CA split
|
285 |
label="Local Model Layers (local_n_layers)",
|
286 |
info="Number of layers in the BLT's local model"
|
287 |
)
|
|
|
298 |
fully compare BPE-based transformers and BLT, you'll need to investigate those
|
299 |
claims in the paper itself.
|
300 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
|
302 |
+
# --- UPDATED SECTION 1: Fixed Parameters dropdown ---
|
303 |
+
with gr.Accordion("Fixed Parameters", open=False):
|
304 |
+
gr.Markdown(f"""
|
305 |
+
- **BPE's bytes per token (bpe_ps)**: {bpe_ps}
|
306 |
+
- **BPE/BLT Global - Num Layers (n_layers)**: {n_layers}
|
307 |
+
- **BPE/BLT Global - Num Heads (n_heads)**: {n_heads}
|
308 |
+
- **BPE - Vocabulary Size (n_vocab)**: {n_vocab:,}
|
309 |
+
- **BPE/BLT - Context Length (n_ctx_base)**: {n_ctx_base:,} bytes
|
310 |
+
- **BLT Local - Model Dimension (local_d_model)**: {local_d_model}
|
311 |
+
- **BLT Local - Num Heads (local_n_heads)**: {local_n_heads}
|
312 |
+
- **BLT Local - Vocabulary Size (local_n_vocab)**: {local_n_vocab}
|
313 |
+
- **BLT Local - FF Multiplier (local_d_ff_multiplier)**: {local_d_ff_multiplier}
|
314 |
+
""")
|
315 |
+
|
316 |
+
# --- UPDATED SECTION 2: Current Values & Totals dropdown ---
|
317 |
+
with gr.Accordion("Current Values & Totals", open=False):
|
318 |
+
info_text = gr.Markdown("")
|
319 |
|
320 |
with gr.Column(scale=2):
|
321 |
plot = gr.Plot(label="FLOPs Comparison & Model Parameters")
|
|
|
329 |
bpe_model_p = (12 * n_layers * d_model_val**2) + (2 * n_vocab * d_model_val)
|
330 |
blt_global_p = 12 * n_layers * d_model_val**2
|
331 |
blt_local_transformer_p = (12 * local_n_layers_val * local_d_model**2) + \
|
332 |
+
(2 * local_n_vocab * local_d_model)
|
333 |
blt_local_ca_p = local_n_layers_val * 4 * local_d_model**2
|
334 |
blt_local_total_internal_p = blt_local_transformer_p + blt_local_ca_p
|
335 |
blt_total_model_p = blt_global_p + blt_local_total_internal_p
|