Spaces:

lucalp
/

byte-latent-transformer-flops

Running

App Files Files Community

lucalp committed on May 25

Commit

6d3d780

1 Parent(s): dbde1bc

Initial commit

Browse files

Files changed (1) hide show

app.py +275 -0

app.py ADDED Viewed

	@@ -0,0 +1,275 @@

+"""
+h/t to Adam Casson for easy-to-use function to calculate FLOPs, source: https://huggingface.co/spaces/adamcasson/transformer-flops-calculator/blob/main/app.py
+"""
+import gradio as gr
+import plotly.graph_objects as go
+import numpy as np
+# Fixed BPE parameters (module-level inputs read by calculate_flops; not exposed as sliders)
+bpe_ps = 4.4  # determined by tokenizer; used as the bytes-per-token divisor for the BPE baseline
+n_ctx_base = 8192  # context length in bytes; divided by a patch/token size to get a context length in positions
+n_heads = 20  # passed to openai_flops_per_token for both the BPE model and the BLT global model
+n_vocab = 128000  # BPE vocabulary size; the BLT global model is evaluated with n_vocab=0 instead
+n_layers = 26  # layer count used for both the BPE model and the BLT global model
+# Fixed local model parameters
+local_d_model = 1024
+local_g_size = 1  # NOTE(review): not referenced by any code visible in this file
+local_n_ctx = 512  # in bytes
+local_n_heads = 16
+local_n_vocab = 256
+local_d_model_k = local_d_model / local_n_heads  # true division, so a float; NOTE(review): not referenced by any code visible in this file
+local_d_ff_multiplier = 4  # NOTE(review): not referenced by any code visible here; openai_flops_per_token is called with its own ff_ratio=4 default
+def openai_flops_per_token(n_layers, n_heads, d_model, n_ctx, n_vocab, ff_ratio=4):
+    """Forward-pass FLOPs per token of a decoder-only Transformer, counted the OpenAI way.
+
+    Args:
+        n_layers: number of Transformer layers.
+        n_heads: number of attention heads.
+        d_model: model width.
+        n_ctx: context length the attention term is scaled by.
+        n_vocab: vocabulary size used for the output logits term.
+        ff_ratio: feed-forward width as a multiple of d_model.
+
+    Returns:
+        The sum of the embedding, attention (QKV, mask, output projection),
+        feed-forward and logits terms.
+    """
+    # Heads times per-head width. The per-head width is floor-divided, so this
+    # is smaller than d_model whenever d_model is not a multiple of n_heads.
+    head_width = d_model // n_heads
+    attn_width = head_width * n_heads
+    ff_width = d_model * ff_ratio
+    terms = (
+        4 * d_model,  # embeddings
+        2 * n_layers * d_model * 3 * attn_width,  # Q, K and V projections
+        2 * n_layers * n_ctx * attn_width,  # attention over the context
+        2 * n_layers * attn_width * d_model,  # attention output projection
+        2 * n_layers * 2 * d_model * ff_width,  # feed-forward block
+        2 * d_model * n_vocab,  # output logits
+    )
+    return sum(terms)
+def cross_attention_flops_per_token(n_layers, n_ctx_cross_attn_kv_len, d_model):
+    """FLOPs per token spent in cross-attention across ``n_layers`` layers.
+
+    Args:
+        n_layers: number of cross-attention layers (callers may pass a float).
+        n_ctx_cross_attn_kv_len: number of key/value positions attended to.
+        d_model: width of the model performing the cross-attention.
+
+    Returns:
+        ``n_layers`` times the sum of the projection term and the
+        context-length-dependent term.
+    """
+    # Cross Attention QO FLOPs + backward
+    projection_term = 2 * 4 * d_model**2
+    # Term that grows linearly with the number of key/value positions.
+    context_term = 4 * n_ctx_cross_attn_kv_len * d_model
+    per_layer = projection_term + context_term
+    return n_layers * per_layer
+def calculate_flops(blt_ps, d_model, local_n_layers):
+    """Compare forward FLOPs per byte of the BPE baseline against BLT.
+
+    Args:
+        blt_ps: BLT patch size in bytes (may be fractional). Must be non-zero,
+            otherwise the divisions below raise ZeroDivisionError.
+        d_model: width shared by the BPE model and the BLT global model.
+        local_n_layers: number of layers in the BLT local model; half of them
+            are attributed to encoder cross-attention and half to decoder
+            cross-attention.
+
+    Returns:
+        A dict with the keys 'bpe_per_byte', 'blt_global', 'blt_local',
+        'blt_total' (FLOPs per byte) and 'advantage', the percentage by which
+        the BLT total differs from the BPE figure (negative means BLT needs
+        fewer FLOPs per byte).
+    """
+    # BPE calculations
+    n_ctx = int(n_ctx_base / bpe_ps)
+    bpe_flops_per_token = openai_flops_per_token(n_layers, n_heads, d_model, n_ctx, n_vocab)
+    bpe_per_byte = bpe_flops_per_token / bpe_ps
+    # BLT Global calculations
+    blt_n_ctx = int(n_ctx_base / blt_ps)
+    blt_global_flops_per_token = openai_flops_per_token(n_layers, n_heads, d_model, blt_n_ctx, n_vocab=0)
+    blt_global_flops_per_byte = blt_global_flops_per_token / blt_ps
+    # BLT Local calculations
+    local_models_transformer_flops_per_byte = openai_flops_per_token(
+        local_n_layers, local_n_heads, local_d_model, local_n_ctx, local_n_vocab
+    )
+    encoder_model_ca_flops_per_byte = cross_attention_flops_per_token(
+        local_n_layers / 2, local_n_ctx, local_d_model
+    )
+    # Number of patches inside the local window. Truncate the true quotient the
+    # same way blt_n_ctx is computed above: float floor division can land one
+    # below it (512 // 3.2 == 159.0 although 512 / 3.2 == 160.0), which made
+    # the local and global patch counts disagree for some patch sizes.
+    local_n_patches = int(local_n_ctx / blt_ps)
+    decoder_model_ca_flops_per_byte = cross_attention_flops_per_token(
+        local_n_layers / 2, local_n_patches, local_d_model
+    )
+    local_models_cross_attention_flops_per_byte = encoder_model_ca_flops_per_byte + decoder_model_ca_flops_per_byte
+    local_models_flops = local_models_transformer_flops_per_byte + local_models_cross_attention_flops_per_byte
+    # Calculate advantage
+    blt_total = local_models_flops + blt_global_flops_per_byte
+    advantage = 100 * ((blt_total - bpe_per_byte) / bpe_per_byte)
+    return {
+        'bpe_per_byte': bpe_per_byte,
+        'blt_global': blt_global_flops_per_byte,
+        'blt_local': local_models_flops,
+        'blt_total': blt_total,
+        'advantage': advantage
+    }
+def create_visualization(blt_ps, d_model, local_n_layers):
+    """Build the stacked-bar Plotly figure comparing BPE and BLT FLOPs per byte.
+
+    The three arguments are forwarded unchanged to ``calculate_flops``; the
+    returned figure has one bar for BPE and a two-part stacked bar for BLT,
+    plus an annotation showing the BLT total.
+    """
+    flops = calculate_flops(blt_ps, d_model, local_n_layers)
+    # (legend name, x category, result key, label position, colour) in draw
+    # order; with barmode='stack' the two BLT traces pile up in this order.
+    bar_specs = (
+        ('BPE', 'BPE', 'bpe_per_byte', 'outside', '#FF6B6B'),
+        ('BLT Global', 'BLT', 'blt_global', 'inside', '#4ECDC4'),
+        ('BLT Local', 'BLT', 'blt_local', 'inside', '#45B7D1'),
+    )
+    figure = go.Figure()
+    for legend_name, category, key, label_position, colour in bar_specs:
+        value = flops[key]
+        figure.add_trace(go.Bar(
+            name=legend_name,
+            x=[category],
+            y=[value],
+            text=[f"{value:.2e}"],
+            textposition=label_position,
+            marker_color=colour,
+            width=0.4,
+            showlegend=True,
+        ))
+    # Stacked layout; the y axis uses scientific notation with one decimal.
+    heading = f"FLOPs per Byte Comparison<br><sub>BLT FLOPs comparison: {flops['advantage']:.1f}%</sub>"
+    figure.update_layout(
+        title={'text': heading, 'x': 0.5, 'xanchor': 'center', 'font': {'size': 20}},
+        xaxis={'title': "Architecture", 'tickfont': {'size': 14}},
+        yaxis={
+            'title': "FLOPs per Byte",
+            'tickformat': ".1e",
+            'tickfont': {'size': 12},
+            'gridcolor': 'lightgray',
+        },
+        barmode='stack',
+        showlegend=True,
+        height=600,
+        template="plotly_white",
+        font={'size': 14},
+        bargap=0.3,
+        plot_bgcolor='white',
+    )
+    # Label with the combined BLT value, placed 10% above the top of the stack.
+    blt_total = flops['blt_total']
+    figure.add_annotation(
+        x='BLT',
+        y=blt_total * 1.1,
+        text=f"Total: {blt_total:.2e}",
+        showarrow=False,
+        font={'size': 12, 'color': "black", 'weight': "bold"},
+        bgcolor="white",
+        bordercolor="black",
+        borderwidth=1,
+    )
+    # Same label font size on every bar.
+    figure.update_traces(textfont_size=10)
+    return figure
+# Create Gradio interface
+with gr.Blocks(title="BLT vs BPE FLOPs Comparison") as demo:
+    # Blank lines keep the paragraphs and the two lists as separate Markdown
+    # blocks; without them the "A few things..." sentence is absorbed into the
+    # preceding bullet as a continuation line.
+    gr.Markdown("""
+    # BLT vs BPE FLOPs Comparison
+
+    This interactive visualization compares the computational efficiency (FLOPs per byte) between:
+
+    - **BPE (Byte Pair Encoding)**: Traditional transformer architecture
+    - **BLT (Byte Latent Transformer)**: Novel architecture with Global and Local components with a dynamic patch size to segment bytes.
+
+    A few things you'll notice:
+
+    1. Patch size reduces global model FLOPs but not local model
+    2. Increasing patch size and global model dimension doesn't change total FLOPs
+    3. In smaller BLTs, local models constitute a larger portion of the total FLOPs
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Adjustable Parameters")
+            blt_ps_slider = gr.Slider(
+                minimum=1.0,
+                maximum=10.0,
+                value=4.4,
+                step=0.1,
+                label="BLT Patch Size (blt_ps)",
+                info="Patch size for BLT architecture"
+            )
+            d_model_slider = gr.Slider(
+                minimum=512,
+                maximum=8192,
+                value=2560,
+                step=128,
+                label="Model Dimension (d_model)",
+                info="Hidden dimension size of the model"
+            )
+            local_n_layers_slider = gr.Slider(
+                minimum=2,
+                maximum=24,
+                value=10,
+                step=2,
+                label="Local Model Layers (local_n_layers)",
+                info="Number of layers in the local model"
+            )
+            gr.Markdown("### Fixed Parameters")
+            gr.Markdown("""
+            - **BPE's bytes per token**: 4.4
+            - **BPE/BLT Number of Layers**: 26
+            - **BPE/BLT Number of Heads**: 20
+            - **BPE's Vocabulary Size**: 128,000
+            - **BPE/BLT Context Length**: 8,192 bytes
+            - **Local Model Dimension**: 1,024
+            - **Local Model Heads**: 16
+            """)
+            gr.Markdown("### Current Values")
+            info_text = gr.Markdown("")
+        with gr.Column(scale=2):
+            plot = gr.Plot(label="FLOPs Comparison")
+    # Set up interactivity
+    def update_plot(blt_ps, d_model, local_n_layers):
+        """Return the refreshed figure and the Markdown summary for the given slider values."""
+        fig = create_visualization(blt_ps, d_model, local_n_layers)
+        # Calculate values for info display (create_visualization only returns
+        # the figure, so the numbers are computed again here).
+        results = calculate_flops(blt_ps, d_model, local_n_layers)
+        # One bullet per value: lines separated by a single newline would be
+        # merged into one paragraph when rendered as Markdown.
+        info_lines = [
+            f"- **BPE FLOPs/byte**: {results['bpe_per_byte']:.2e}",
+            f"- **BLT Global FLOPs/byte**: {results['blt_global']:.2e}",
+            f"- **BLT Local FLOPs/byte**: {results['blt_local']:.2e}",
+            f"- **BLT Total FLOPs/byte**: {results['blt_total']:.2e}",
+        ]
+        return fig, "\n".join(info_lines)
+    # Every event passes the same three slider values and refreshes the same
+    # two components, so the wiring is shared instead of repeated per slider.
+    slider_inputs = [blt_ps_slider, d_model_slider, local_n_layers_slider]
+    plot_outputs = [plot, info_text]
+    # Update plot when any slider changes
+    for slider in slider_inputs:
+        slider.change(update_plot, inputs=slider_inputs, outputs=plot_outputs)
+    # Initial plot
+    demo.load(update_plot, inputs=slider_inputs, outputs=plot_outputs)
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()