lucalp committed
Commit 515c5ed · 1 Parent(s): 86aec55
Files changed (1)
  1. app.py +20 -19
app.py CHANGED
@@ -108,14 +108,12 @@ def create_visualization(blt_ps, d_model_slider, local_n_layers_slider):

     # Local Component Transformer Part: 12 * N_local * D_local^2 + 2 * V_local * D_local
     blt_local_transformer_params = (12 * local_n_layers_slider * local_d_model**2) + \
-                                   (2 * local_n_vocab * local_d_model)
+                                   (2 * local_n_vocab * local_d_model)

     # Local Component Cross-Attention Part: N_local * 4 * D_local^2 (estimated)
     blt_local_ca_params = local_n_layers_slider * 4 * local_d_model**2
     blt_local_total_internal_params = blt_local_transformer_params + blt_local_ca_params

-    # blt_total_model_params = blt_global_internal_params + blt_local_total_internal_params  # Kept for potential other uses, not directly for this annotation
-
     bpe_params_str = format_params_display(bpe_model_params)

     # Format BLT global and local parameters separately
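For reference, a minimal standalone sketch of the estimate the comments above describe, assuming the usual decomposition where 12·N·D² approximates the per-layer attention and feed-forward weights and 2·V·D covers the input/output embeddings; the function names and example values below are illustrative, not taken from app.py:

```python
def transformer_params(n_layers: int, d_model: int, n_vocab: int = 0) -> int:
    """Approximate parameter count: 12 * N * D^2 + 2 * V * D."""
    return 12 * n_layers * d_model**2 + 2 * n_vocab * d_model

def blt_local_params(n_layers: int, d_model: int, n_vocab: int) -> int:
    """Local transformer plus the estimated N * 4 * D^2 cross-attention part."""
    return transformer_params(n_layers, d_model, n_vocab) + n_layers * 4 * d_model**2

# Illustrative values only (not app.py's defaults):
print(f"{blt_local_params(n_layers=10, d_model=512, n_vocab=260):,}")
```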
@@ -283,7 +281,7 @@ with gr.Blocks(title="BLT vs BPE FLOPs Comparison") as demo:
                     minimum=2,
                     maximum=24,  # Max value for local_n_layers
                     value=10,
-                    step=2,  # Ensure even numbers for CA split
+                    step=2,  # Ensure even numbers for CA split
                     label="Local Model Layers (local_n_layers)",
                     info="Number of layers in the BLT's local model"
                 )
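Aside: `step=2` forces `local_n_layers` to stay even. A plausible reading of the "CA split" comment (an assumption, not stated in app.py) is that the local layers are divided equally between two halves; a toy illustration:

```python
local_n_layers = 10  # any slider value with step=2 stays even
assert local_n_layers % 2 == 0, "CA split assumes an even layer count"
first_half = local_n_layers // 2   # hypothetical even split, e.g. encoder side
second_half = local_n_layers // 2  # and decoder side
```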
@@ -300,21 +298,24 @@ with gr.Blocks(title="BLT vs BPE FLOPs Comparison") as demo:
                     fully compare BPE-based transformers and BLT, you'll need to investigate those
                     claims in the paper itself.
                     """)
-                gr.Markdown("### Fixed Parameters")
-                gr.Markdown(f"""
-                - **BPE's bytes per token (bpe_ps)**: {bpe_ps}
-                - **BPE/BLT Global - Num Layers (n_layers)**: {n_layers}
-                - **BPE/BLT Global - Num Heads (n_heads)**: {n_heads}
-                - **BPE - Vocabulary Size (n_vocab)**: {n_vocab:,}
-                - **BPE/BLT - Context Length (n_ctx_base)**: {n_ctx_base:,} bytes
-                - **BLT Local - Model Dimension (local_d_model)**: {local_d_model}
-                - **BLT Local - Num Heads (local_n_heads)**: {local_n_heads}
-                - **BLT Local - Vocabulary Size (local_n_vocab)**: {local_n_vocab}
-                - **BLT Local - FF Multiplier (local_d_ff_multiplier)**: {local_d_ff_multiplier}
-                """)

-                gr.Markdown("### Current Values & Totals")
-                info_text = gr.Markdown("")
+                # --- UPDATED SECTION 1: Fixed Parameters dropdown ---
+                with gr.Accordion("Fixed Parameters", open=False):
+                    gr.Markdown(f"""
+                    - **BPE's bytes per token (bpe_ps)**: {bpe_ps}
+                    - **BPE/BLT Global - Num Layers (n_layers)**: {n_layers}
+                    - **BPE/BLT Global - Num Heads (n_heads)**: {n_heads}
+                    - **BPE - Vocabulary Size (n_vocab)**: {n_vocab:,}
+                    - **BPE/BLT - Context Length (n_ctx_base)**: {n_ctx_base:,} bytes
+                    - **BLT Local - Model Dimension (local_d_model)**: {local_d_model}
+                    - **BLT Local - Num Heads (local_n_heads)**: {local_n_heads}
+                    - **BLT Local - Vocabulary Size (local_n_vocab)**: {local_n_vocab}
+                    - **BLT Local - FF Multiplier (local_d_ff_multiplier)**: {local_d_ff_multiplier}
+                    """)
+
+                # --- UPDATED SECTION 2: Current Values & Totals dropdown ---
+                with gr.Accordion("Current Values & Totals", open=False):
+                    info_text = gr.Markdown("")

             with gr.Column(scale=2):
                 plot = gr.Plot(label="FLOPs Comparison & Model Parameters")
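The substantive change in this hunk swaps the always-visible `gr.Markdown` headings for collapsible `gr.Accordion` containers. A minimal self-contained sketch of that pattern (toy labels and content, not the app's real values):

```python
import gradio as gr

with gr.Blocks() as demo:
    # Components created inside the `with` block render inside a
    # collapsible panel; open=False starts it collapsed.
    with gr.Accordion("Fixed Parameters", open=False):
        gr.Markdown("- **example_param**: 42")
    with gr.Accordion("Current Values & Totals", open=False):
        info_text = gr.Markdown("")  # populated later by an update callback

demo.launch()
```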
@@ -328,7 +329,7 @@ with gr.Blocks(title="BLT vs BPE FLOPs Comparison") as demo:
         bpe_model_p = (12 * n_layers * d_model_val**2) + (2 * n_vocab * d_model_val)
         blt_global_p = 12 * n_layers * d_model_val**2
         blt_local_transformer_p = (12 * local_n_layers_val * local_d_model**2) + \
-                                  (2 * local_n_vocab * local_d_model)
+                                  (2 * local_n_vocab * local_d_model)
         blt_local_ca_p = local_n_layers_val * 4 * local_d_model**2
         blt_local_total_internal_p = blt_local_transformer_p + blt_local_ca_p
         blt_total_model_p = blt_global_p + blt_local_total_internal_p
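This hunk recomputes the same estimates for the annotation text. A worked comparison under assumed values (none of these numbers are app.py's actual defaults):

```python
# Assumed, illustrative values:
n_layers, d_model_val, n_vocab = 16, 1024, 50304
local_n_layers_val, local_d_model, local_n_vocab = 10, 512, 260

bpe_model_p = (12 * n_layers * d_model_val**2) + (2 * n_vocab * d_model_val)
blt_global_p = 12 * n_layers * d_model_val**2
blt_local_total_internal_p = (
    (12 * local_n_layers_val * local_d_model**2)
    + (2 * local_n_vocab * local_d_model)
    + (local_n_layers_val * 4 * local_d_model**2)
)
blt_total_model_p = blt_global_p + blt_local_total_internal_p
print(f"BPE: {bpe_model_p:,} params, BLT: {blt_total_model_p:,} params")
```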
 