import os
# turn off Streamlit's automatic file watching (config option server.fileWatcherType)
os.environ["STREAMLIT_SERVER_FILE_WATCHER_TYPE"] = "none"

import sys
import types
import torch               # now safe to import
import streamlit as st
import numpy as np

# Prevent Streamlit from trying to walk torch.classes' non-standard __path__
if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
    torch.classes.__path__ = []

# pip install tiktoken transformers
import tiktoken
from transformers import GPT2TokenizerFast

st.set_page_config(page_title="Embedding Dimension Visualizer", layout="wide")
st.title("🔍 Embedding Dimension Visualizer")

# ---- THEORY EXPANDER ----
with st.expander("📖 Theory: Tokenization, BPE & Positional Encoding"):
    st.markdown("""
**1️⃣ Tokenization**  
Splits raw text into atomic units (“tokens”).  

**2️⃣ Byte-Pair Encoding (BPE)**  
Iteratively merges the most frequent pair of symbols to build a subword vocabulary.  
E.g. "embedding" might be split into subword pieces like ["em", "bed", "ding"]

**3️⃣ Positional Encoding**  
We add a deterministic sinusoidal vector to each token embedding so the model knows position.
""")
    st.markdown("For embedding dimension \(d\), position \(pos\) and channel index \(i\):")
    st.latex(r"""\mathrm{PE}_{(pos,\,2i)}   = \sin\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
    st.latex(r"""\mathrm{PE}_{(pos,\,2i+1)} = \cos\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
    st.markdown("""
- \(pos\) starts at 0 for the first token  
- Even channels use \(\sin\), odd channels use \(\cos\)  
- This injects unique, smoothly varying positional signals into each embedding  
""")


# ---- Sidebar ----
with st.sidebar:
    st.header("Settings")
    input_text = st.text_input("Enter text to embed", value="Hello world!")
    dim = st.number_input(
        "Embedding dimensions",
        min_value=2,
        max_value=1536,
        value=3,
        step=1,
        help="Choose 2, 3, 512, 768, 1536, etc."
    )
    tokenizer_choice = st.selectbox(
        "Choose tokenizer",
        ["tiktoken", "openai", "huggingface"],
        help="Which tokenization scheme to demo."
    )
    generate = st.button("Generate / Reset Embedding")

if not generate:
    st.info("Adjust the settings in the sidebar and click **Generate / Reset Embedding** to see the tokens and sliders.")
    st.stop()

# ---- Tokenize ----
if tokenizer_choice in ("tiktoken", "openai"):
    model_name = "gpt2" if tokenizer_choice=="tiktoken" else "gpt-3.5-turbo"
    enc = tiktoken.encoding_for_model(model_name)
    token_ids = enc.encode(input_text)
    token_strs = [enc.decode([tid]) for tid in token_ids]
else:
    hf_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    token_ids = hf_tokenizer.encode(input_text)
    token_strs = hf_tokenizer.convert_ids_to_tokens(token_ids)
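
# Note: the Hugging Face GPT-2 tokenizer shows tokens in byte-level BPE form, so a
# leading space is rendered as "Ġ" (e.g. "Ġworld"), whereas the tiktoken branch above
# decodes each ID back to its raw text.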

st.subheader("🪶 Tokens and IDs")
for i, (tok, tid) in enumerate(zip(token_strs, token_ids), start=1):
    st.write(f"**{i}.** `{tok}` → ID **{tid}**")

st.write("---")
st.subheader("📊 Embedding + Positional Encoding per Token")
st.write(f"Input: `{input_text}` | Tokenizer: **{tokenizer_choice}** | Dims per token: **{dim}**")
if dim > 20:
    st.warning("Showing >20 sliders per block may be unwieldy; consider smaller dims for teaching.")

# helper for sinusoidal positional encoding
def get_positional_encoding(position: int, d_model: int) -> np.ndarray:
    pe = np.zeros(d_model, dtype=float)
    for i in range(d_model):
        angle = position / np.power(10000, (2 * (i // 2)) / d_model)
        pe[i] = np.sin(angle) if (i % 2 == 0) else np.cos(angle)
    return pe
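
# Added sketch (not used by the app, kept for reference): a vectorized equivalent that
# builds the whole (seq_len, d_model) sinusoidal table in one call. Sanity check:
# position 0 alternates 0 and 1, since sin(0) = 0 and cos(0) = 1 on every channel.
def get_positional_encoding_table(seq_len: int, d_model: int) -> np.ndarray:
    positions = np.arange(seq_len)[:, None]                           # (seq_len, 1)
    div = np.power(10000, (2 * (np.arange(d_model) // 2)) / d_model)  # (d_model,)
    angles = positions / div                                          # (seq_len, d_model)
    table = np.zeros((seq_len, d_model), dtype=float)
    table[:, 0::2] = np.sin(angles[:, 0::2])  # even channels -> sin
    table[:, 1::2] = np.cos(angles[:, 1::2])  # odd channels  -> cos
    return table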

# ---- For each token, three slider blocks ----
for t_idx, tok in enumerate(token_strs, start=1):
    emb = np.random.uniform(-1.0, 1.0, size=dim)
    pe  = get_positional_encoding(t_idx - 1, dim)
    combined = emb + pe

    with st.expander(f"Token {t_idx}: `{tok}`"):
        st.markdown("**1️⃣ Embedding**")
        for d in range(dim):
            st.slider(
                label=f"Emb Dim {d+1}",
                min_value=-1.0, max_value=1.0,
                value=float(emb[d]),
                key=f"t{t_idx}_emb{d+1}",
                disabled=True
            )

        st.markdown("**2️⃣ Positional Encoding (sin / cos)**")
        for d in range(dim):
            st.slider(
                label=f"PE Dim {d+1}",
                min_value=-1.0, max_value=1.0,
                value=float(pe[d]),
                key=f"t{t_idx}_pe{d+1}",
                disabled=True
            )

        st.markdown("**3️⃣ Embedding + Positional Encoding**")
        for d in range(dim):
            st.slider(
                label=f"Sum Dim {d+1}",
                min_value=-2.0, max_value=2.0,
                value=float(combined[d]),
                key=f"t{t_idx}_sum{d+1}",
                disabled=True
            )

# ---- NEW FINAL SECTION ----
st.write("---")
st.subheader("Final Input Embedding Plus Positional Encoding Ready to Send to ATtention Heads")

for t_idx, tid in enumerate(token_ids, start=1):
    with st.expander(f"Token ID {tid}"):
        for d in range(1, dim+1):
            # pull the “sum” value out of session state
            val = st.session_state.get(f"t{t_idx}_sum{d}", None)
            st.write(f"Dim {d}: {val:.4f}" if val is not None else f"Dim {d}: N/A")