import os
# turn off Streamlit’s automatic file-watching
os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"
import sys
import types
import torch # now safe to import
import streamlit as st
import numpy as np
# Prevent Streamlit from trying to walk torch.classes' non-standard __path__
if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
    torch.classes.__path__ = []
# pip install tiktoken transformers
import tiktoken
from transformers import GPT2TokenizerFast
st.set_page_config(page_title="Embedding Dimension Visualizer", layout="wide")
st.title("🔍 Embedding Dimension Visualizer")
# ---- THEORY EXPANDER ----
with st.expander("📖 Theory: Tokenization, BPE & Positional Encoding"):
    st.markdown("""
**1️⃣ Tokenization**
Splits raw text into atomic units (“tokens”).

**2️⃣ Byte-Pair Encoding (BPE)**
Iteratively merges the most frequent pair of symbols to build a subword vocabulary.
E.g. "embedding" → ["em", "bed", "ding"]

**3️⃣ Positional Encoding**
We add a deterministic sinusoidal vector to each token embedding so the model knows each token's position.
""")
    st.markdown("For embedding dimension $d$, position $pos$ and channel index $i$:")
    st.latex(r"""\mathrm{PE}_{(pos,\,2i)} = \sin\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
    st.latex(r"""\mathrm{PE}_{(pos,\,2i+1)} = \cos\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
    st.markdown(r"""
- $pos$ starts at 0 for the first token
- Even channels use $\sin$, odd channels use $\cos$
- This injects unique, smoothly varying positional signals into each embedding
""")
# ---- Sidebar ----
with st.sidebar:
    st.header("Settings")
    input_text = st.text_input("Enter text to embed", value="Hello world!")
    dim = st.number_input(
        "Embedding dimensions",
        min_value=2,
        max_value=1536,
        value=3,
        step=1,
        help="Choose 2, 3, 512, 768, 1536, etc."
    )
    tokenizer_choice = st.selectbox(
        "Choose tokenizer",
        ["tiktoken", "openai", "huggingface"],
        help="Which tokenization scheme to demo."
    )
    generate = st.button("Generate / Reset Embedding")
if not generate:
st.info("Adjust the settings in the sidebar and click **Generate / Reset Embedding** to see the tokens and sliders.")
st.stop()
# ---- Tokenize ----
if tokenizer_choice in ("tiktoken", "openai"):
model_name = "gpt2" if tokenizer_choice=="tiktoken" else "gpt-3.5-turbo"
enc = tiktoken.encoding_for_model(model_name)
token_ids = enc.encode(input_text)
token_strs = [enc.decode([tid]) for tid in token_ids]
else:
hf_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
token_ids = hf_tokenizer.encode(input_text)
token_strs = hf_tokenizer.convert_ids_to_tokens(token_ids)
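
# Note (illustrative): with the GPT-2-style BPE vocabularies used here, word-initial tokens
# keep their leading space. Decoding a single tiktoken ID can yield " world", and the
# Hugging Face tokenizer shows the same piece as "Ġworld", so the tokens concatenate back
# to the original input.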
st.subheader("🪶 Tokens and IDs")
for i, (tok, tid) in enumerate(zip(token_strs, token_ids), start=1):
st.write(f"**{i}.** `{tok}` → ID **{tid}**")
st.write("---")
st.subheader("📊 Embedding + Positional Encoding per Token")
st.write(f"Input: `{input_text}` | Tokenizer: **{tokenizer_choice}** | Dims per token: **{dim}**")
if dim > 20:
st.warning("Showing >20 sliders per block may be unwieldy; consider smaller dims for teaching.")
# helper for sinusoidal positional encoding
def get_positional_encoding(position: int, d_model: int) -> np.ndarray:
    pe = np.zeros(d_model, dtype=float)
    for i in range(d_model):
        angle = position / np.power(10000, (2 * (i // 2)) / d_model)
        pe[i] = np.sin(angle) if (i % 2 == 0) else np.cos(angle)
    return pe
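
# For reference only (not called below): a vectorized NumPy sketch of the same sinusoidal
# formula, assuming even channels take sin and odd channels take cos exactly as in the
# loop version above.
def get_positional_encoding_vectorized(position: int, d_model: int) -> np.ndarray:
    i = np.arange(d_model)
    # each sin/cos pair shares the divisor 10000 ** (2 * (i // 2) / d_model)
    angles = position / np.power(10000, (2 * (i // 2)) / d_model)
    return np.where(i % 2 == 0, np.sin(angles), np.cos(angles))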
# ---- For each token, three slider blocks ----
for t_idx, tok in enumerate(token_strs, start=1):
    # random stand-in embedding for visualization (a real model would look this up in a learned table)
    emb = np.random.uniform(-1.0, 1.0, size=dim)
    pe = get_positional_encoding(t_idx - 1, dim)
    combined = emb + pe
    with st.expander(f"Token {t_idx}: `{tok}`"):
        st.markdown("**1️⃣ Embedding**")
        for d in range(dim):
            st.slider(
                label=f"Emb Dim {d+1}",
                min_value=-1.0, max_value=1.0,
                value=float(emb[d]),
                key=f"t{t_idx}_emb{d+1}",
                disabled=True
            )
        st.markdown("**2️⃣ Positional Encoding (sin / cos)**")
        for d in range(dim):
            st.slider(
                label=f"PE Dim {d+1}",
                min_value=-1.0, max_value=1.0,
                value=float(pe[d]),
                key=f"t{t_idx}_pe{d+1}",
                disabled=True
            )
        st.markdown("**3️⃣ Embedding + Positional Encoding**")
        for d in range(dim):
            st.slider(
                label=f"Sum Dim {d+1}",
                min_value=-2.0, max_value=2.0,
                value=float(combined[d]),
                key=f"t{t_idx}_sum{d+1}",
                disabled=True
            )
# ---- NEW FINAL SECTION ----
st.write("---")
st.subheader("Final Input Embedding Plus Positional Encoding Ready to Send to ATtention Heads")
for t_idx, tid in enumerate(token_ids, start=1):
with st.expander(f"Token ID {tid}"):
for d in range(1, dim+1):
# pull the “sum” value out of session state
val = st.session_state.get(f"t{t_idx}_sum{d}", None)
st.write(f"Dim {d}: {val:.4f}" if val is not None else f"Dim {d}: N/A")