Spaces:

lucalp
/

byte-latent-transformer-flops

Running

App Files Files Community

lucalp committed on May 25

Commit

810a93a

1 Parent(s): 7afe1ac

Adding model size breakdown

Browse files

Files changed (1) hide show

app.py +24 -19

app.py CHANGED Viewed

@@ -94,33 +94,37 @@ def format_params_display(num_params):
         return f"{num_params / 1_000_000:.2f}M Params"
 def create_visualization(blt_ps, d_model_slider, local_n_layers_slider):
     results = calculate_flops(blt_ps, d_model_slider, local_n_layers_slider)
     # Calculate model parameters
     # BPE Model Parameters: 12 * N * D^2 + 2 * V * D
-    # N = n_layers (global), D = d_model_slider, V = n_vocab (global)
     bpe_model_params = (12 * n_layers * d_model_slider**2) + (2 * n_vocab * d_model_slider)
     # BLT Model Parameters
     # Global Component: 12 * N * D^2 (no main vocab projection)
-    # N = n_layers (global), D = d_model_slider
     blt_global_internal_params = 12 * n_layers * d_model_slider**2
     # Local Component Transformer Part: 12 * N_local * D_local^2 + 2 * V_local * D_local
-    # N_local = local_n_layers_slider, D_local = local_d_model, V_local = local_n_vocab
     blt_local_transformer_params = (12 * local_n_layers_slider * local_d_model**2) + \
                                    (2 * local_n_vocab * local_d_model)
     # Local Component Cross-Attention Part: N_local * 4 * D_local^2 (estimated)
-    # This assumes 4*D^2 params per CA block (Q,K,V,O projections within local_d_model or from global to local)
-    # and local_n_layers_slider effective CA blocks.
     blt_local_ca_params = local_n_layers_slider * 4 * local_d_model**2
     blt_local_total_internal_params = blt_local_transformer_params + blt_local_ca_params
-    blt_total_model_params = blt_global_internal_params + blt_local_total_internal_params
     bpe_params_str = format_params_display(bpe_model_params)
-    blt_params_str = format_params_display(blt_total_model_params)
     # Create the figure with subplots for better control
     fig = go.Figure()
@@ -181,21 +185,21 @@ def create_visualization(blt_ps, d_model_slider, local_n_layers_slider):
         ),
         barmode='stack',
         showlegend=True,
-        height=650, # Increased height slightly for param text
         template="plotly_white",
         font=dict(size=14),
         bargap=0.3,
         plot_bgcolor='white',
-        margin=dict(b=100) # Add bottom margin for parameter text
     )
     fig.add_annotation(
         x='BLT',
-        y=results['blt_total'] * 1.05,  # Position above stacked bar, adjust if needed
         text=f"Total FLOPs/Byte: {results['blt_total']:.2e}",
         showarrow=False,
-        font=dict(size=12, color="black"), # Removed bold to differentiate from param text
-        bgcolor="rgba(255,255,255,0.5)", # Slight background for readability
         bordercolor="black",
         borderwidth=1,
         xanchor='center',
@@ -209,24 +213,25 @@ def create_visualization(blt_ps, d_model_slider, local_n_layers_slider):
         text=bpe_params_str,
         showarrow=False,
         xref="x",
-        yref="paper", # Use paper coordinates for y to position below x-axis
         yanchor='top',
         xanchor='center',
-        yshift=-35, # Adjust this value to position correctly below the bar
-        font=dict(size=11, color="black", weight="bold"),
     )
     fig.add_annotation(
         x='BLT',
         y=0,
-        text=blt_params_str,
         showarrow=False,
         xref="x",
         yref="paper",
         yanchor='top',
         xanchor='center',
-        yshift=-35, # Adjust this value
-        font=dict(size=11, color="black", weight="bold"),
     )
@@ -246,7 +251,7 @@ with gr.Blocks(title="BLT vs BPE FLOPs Comparison") as demo:
     A few things you'll notice:
     1. Patch size reduces global model FLOPs but not local model
-    2. Increasing patch size and global model dimension doesn't change total FLOPs (Note: FLOPs yes, parameters will change with d_model)
     3. In smaller BLTs, local models constitute a larger portion of the total FLOPs
     Parameter counts are displayed below each bar.
     """)

         return f"{num_params / 1_000_000:.2f}M Params"
 def create_visualization(blt_ps, d_model_slider, local_n_layers_slider):
     results = calculate_flops(blt_ps, d_model_slider, local_n_layers_slider)
     # Calculate model parameters
     # BPE Model Parameters: 12 * N * D^2 + 2 * V * D
     bpe_model_params = (12 * n_layers * d_model_slider**2) + (2 * n_vocab * d_model_slider)
     # BLT Model Parameters
     # Global Component: 12 * N * D^2 (no main vocab projection)
     blt_global_internal_params = 12 * n_layers * d_model_slider**2
     # Local Component Transformer Part: 12 * N_local * D_local^2 + 2 * V_local * D_local
     blt_local_transformer_params = (12 * local_n_layers_slider * local_d_model**2) + \
                                    (2 * local_n_vocab * local_d_model)
     # Local Component Cross-Attention Part: N_local * 4 * D_local^2 (estimated)
     blt_local_ca_params = local_n_layers_slider * 4 * local_d_model**2
     blt_local_total_internal_params = blt_local_transformer_params + blt_local_ca_params
+    # blt_total_model_params = blt_global_internal_params + blt_local_total_internal_params # Kept for potential other uses, not directly for this annotation
     bpe_params_str = format_params_display(bpe_model_params)
+    # Format BLT global and local parameters separately
+    blt_global_params_fmt_str = format_params_display(blt_global_internal_params)
+    blt_local_params_fmt_str = format_params_display(blt_local_total_internal_params)
+    # Combine for annotation text, using <br> for line break
+    blt_combined_params_str = f"Global: {blt_global_params_fmt_str}<br>Local: {blt_local_params_fmt_str}"
     # Create the figure with subplots for better control
     fig = go.Figure()
         ),
         barmode='stack',
         showlegend=True,
+        height=650,
         template="plotly_white",
         font=dict(size=14),
         bargap=0.3,
         plot_bgcolor='white',
+        margin=dict(b=110) # Increased bottom margin slightly more for two lines of text
     )
     fig.add_annotation(
         x='BLT',
+        y=results['blt_total'] * 1.05,
         text=f"Total FLOPs/Byte: {results['blt_total']:.2e}",
         showarrow=False,
+        font=dict(size=12, color="black"),
+        bgcolor="rgba(255,255,255,0.5)",
         bordercolor="black",
         borderwidth=1,
         xanchor='center',
         text=bpe_params_str,
         showarrow=False,
         xref="x",
+        yref="paper",
         yanchor='top',
         xanchor='center',
+        yshift=-35,
+        font=dict(size=10, color="black", weight="bold"), # Font size 10 for param text
     )
     fig.add_annotation(
         x='BLT',
         y=0,
+        text=blt_combined_params_str, # Using the new combined string with breakdown
         showarrow=False,
         xref="x",
         yref="paper",
         yanchor='top',
         xanchor='center',
+        yshift=-45, # Adjusted yshift for two lines of text
+        font=dict(size=10, color="black", weight="bold"), # Font size 10 for param text
+        align="center" # Ensure text is centered if it wraps due to <br>
     )
     A few things you'll notice:
     1. Patch size reduces global model FLOPs but not local model
+    2. Increasing patch size and global model dimension doesn't change total FLOPs
     3. In smaller BLTs, local models constitute a larger portion of the total FLOPs
     Parameter counts are displayed below each bar.
     """)