<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Token Visualizer — Minimal</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
<style>
:root{
--bg:#0b0f14; --text:#ffffff; --muted:#9aa4b2; --accent:#38bdf8; --border:#1f2a3a;
--card1:#0c1624; --card2:#0a1220; --chip:#111827; --chip-border:#263246; --chip-hover:#1a2434;
--mono:'JetBrains Mono',ui-monospace,Menlo,Consolas,monospace; --sans:Inter,system-ui,-apple-system,"Segoe UI",Roboto,Ubuntu,"Helvetica Neue",Arial;
}
*{box-sizing:border-box} body{margin:0;background:radial-gradient(900px 500px at 10% -10%, #07314a, transparent),var(--bg);color:var(--text);font-family:var(--sans)}
.container{max-width:1100px;margin:0 auto;padding:1.25rem}
header{padding-top:1.5rem} h1{margin:.2rem 0 .4rem;font-size:1.9rem}
.sub{color:var(--muted);margin:.25rem 0 1rem}
.card{background:linear-gradient(180deg,var(--card1),var(--card2));border:1px solid var(--border);border-radius:14px;padding:1rem;box-shadow:0 10px 40px rgba(0,0,0,.35)}
label span{color:var(--muted);font-size:.9rem}
select,textarea{width:100%;border-radius:10px;border:1px solid var(--border);background:#0a1220;color:var(--text);padding:.7rem .85rem;outline:none}
select:focus,textarea:focus{border-color:var(--accent)}
.controls{display:grid;gap:.8rem;margin-bottom:1rem}
.row{display:flex;gap:.75rem;align-items:center}
.status{color:var(--muted)}
.grid{display:grid;gap:1rem;grid-template-columns:1fr}
@media (min-width:900px){.grid{grid-template-columns:1fr 1fr}}
.head{display:flex;align-items:center;justify-content:space-between;margin-bottom:0}
.tokens{display:flex;flex-wrap:wrap;gap:.5rem;max-height:360px;overflow:auto;padding:.25rem}
.chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
.chip:hover{background:var(--chip-hover);border-color:var(--accent)}
.chip.active{outline:2px solid var(--accent)}
.chip.special {border-color: #38bdf8;background: #0b2235;}
pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
.caption{color:var(--muted);font-size:.9rem;margin-top:0;margin-bottom:.75rem}
footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
a{color:var(--accent)}
</style>
</head>
<body>
<header class="container">
<h1>Token Visualizer</h1>
<p class="sub">Enter any text and see how AI turns it into tokens and IDs, the building blocks of its thinking.</p>
</header>
<main class="container">
<section class="card controls">
<label>
<span>Model</span>
<select id="model">
<!-- Tip: keep this first so the demo works instantly once you upload /assets/gpt2/* -->
<option value="local:gpt2">GPT-2 (local, fast)</option>
<option value="Xenova/llama2-tokenizer">Llama-2 (Hub)</option>
<option value="Xenova/mistral-tokenizer">Mistral (Hub)</option>
<option value="Xenova/gemma-tokenizer">Gemma (Hub)</option>
<option value="Xenova/bert-base-uncased">BERT Base Uncased (Hub)</option>
</select>
</label>
<label>
<span>Text</span>
<textarea id="input" rows="3">Curiosity propelled the cat to unfathomable heights.</textarea>
</label>
<div class="row">
<span id="status" class="status">Loading tokenizer…</span>
</div>
</section>
<section class="grid">
<article class="card">
<div class="head"><h3>Tokens</h3></div>
<p class="caption">The smallest language units the model works with.</p>
<div id="tokens" class="tokens"></div>
</article>
<article class="card">
<div class="head"><h3>Token IDs</h3></div>
<p class="caption">Their numeric form inside the model’s computations.</p>
<pre id="ids" class="ids"></pre>
</article>
</section>
</main>
<footer class="container">
<small>Built by Peter Adams • Runs entirely in your browser via <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
</footer>
<!-- Minimal, robust script (no copy/export) -->
<script type="module">
// Prefer keeping all requests on huggingface.co to avoid CORS/VPN issues.
// Option 1 (simple): CDN import (works on many networks)
const tf = await import('https://cdn.jsdelivr.net/npm/@xenova/[email protected]');
// Option 2 (bulletproof): self-host the file in your Space and use:
// const tf = await import('./assets/vendor/transformers.min.js');
tf.env.useBrowserCache = true;
tf.env.allowLocalModels = true; // <-- REQUIRED for local folder paths
tf.env.localModelPath = '/';
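// With localModelPath = '/', a local id like 'assets/gpt2' resolves to /assets/gpt2/*.
// That folder should contain the tokenizer files (tokenizer.json, tokenizer_config.json).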
const $ = s => document.querySelector(s);
const modelSel = $('#model');
const inputEl = $('#input');
const statusEl = $('#status');
const tokensEl = $('#tokens');
const idsEl = $('#ids');
// Single state object; never reassign
const state = { tokens: [], ids: [] };
let tokenizer = null;
let runId = 0;
const status = (msg) => { statusEl.textContent = msg; };
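// Debounce: wait until input pauses for `ms` before firing fn, resetting the timer on each call.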
const debounce = (fn, ms=200) => { let t; return (...a)=>{ clearTimeout(t); t=setTimeout(()=>fn(...a), ms); }; };
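// Load (or reload) the tokenizer for the selected model; leaves `tokenizer` null on failure.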
async function loadTokenizer(modelId){
status('Loading tokenizer…');
try {
if (modelId === 'local:gpt2') {
// Note: no double slashes, no /resolve/main – just your folder.
tokenizer = await tf.AutoTokenizer.from_pretrained('assets/gpt2');
} else {
tokenizer = await tf.AutoTokenizer.from_pretrained(modelId);
}
status('Tokenizer ready.');
} catch (e) {
console.error('Tokenizer load failed:', e);
tokenizer = null;
status('Failed to load tokenizer (network blocked or slow). Try GPT-2 or a different VPN route.');
}
}
async function tokenize(){
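// Stamp this run; if another tokenize() starts before this one finishes, the stale result is dropped below.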
const myRun = ++runId;
if (!tokenizer) {
await loadTokenizer(modelSel.value);
if (!tokenizer) { render(); return; }
}
// Make sure we always pass a string to encode()
const text = String(inputEl.value ?? '').trim();
if (!text) {
state.tokens = [];
state.ids = [];
render();
status('Type to tokenize…');
return;
}
status('Tokenizing…');
try {
const enc = await tokenizer.encode(text); // include specials (default)
// Handle both array/object return shapes
const ids = Array.isArray(enc)
? enc
: (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
// Map special IDs -> special token strings (if available)
const specialIds = Array.from(tokenizer.all_special_ids || []);
const specialTokens = Array.from(tokenizer.all_special_tokens || []);
const idToSpecial = new Map(specialIds.map((id, i) => [id, specialTokens[i]]));
// Build token strings for every ID (specials included)
let tokens = [];
if (typeof tokenizer.convert_ids_to_tokens === 'function') {
tokens = tokenizer.convert_ids_to_tokens(ids);
} else if (typeof tokenizer.id_to_token === 'function') {
tokens = ids.map(id => tokenizer.id_to_token(id));
} else if (!Array.isArray(enc) && Array.isArray(enc.tokens)) {
tokens = enc.tokens;
} else {
// Fallback: decode each ID as a single-piece token
tokens = ids.map(id =>
tokenizer.decode([id], {
// we WANT specials in the stream; decode may return "" for them
skip_special_tokens: false,
clean_up_tokenization_spaces: false,
})
);
}
// Ensure specials are visible: if a special token decodes to empty,
// replace it with its canonical name or a generic tag.
tokens = tokens.map((tok, i) => {
const id = ids[i];
if (tok && tok.length) return tok;
if (idToSpecial.has(id)) return idToSpecial.get(id); // e.g., <|endoftext|> for GPT-2
return `<special:${id}>`;
});
if (myRun !== runId) return;
state.tokens = tokens;
state.ids = ids; // include specials in the count
render();
status(`Done. ${state.tokens.length} tokens.`);
} catch (e) {
console.error('Tokenize failed:', e);
render();
status('Error tokenizing. See console.');
}
}
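// Rebuild the token chips and the ID list from the current state.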
function render(){
const tokens = Array.isArray(state.tokens) ? state.tokens : [];
const ids = Array.isArray(state.ids) ? state.ids : [];
const specialSet = new Set(tokenizer?.all_special_ids || []); // tokenizer may be null if loading failed
tokensEl.innerHTML = '';
tokens.forEach((tok, i) => {
const chip = document.createElement('span');
chip.className = 'chip';
if (specialSet.has(ids[i])) chip.classList.add('special'); // <-- highlight specials
chip.dataset.i = i;
chip.textContent = tok;
chip.addEventListener('mouseenter', ()=>highlight(i, true));
chip.addEventListener('mouseleave', ()=>highlight(i, false));
tokensEl.appendChild(chip);
});
idsEl.textContent = ids.join(' ');
if (tokens.length === 0 && tokenizer) status('Type to tokenize…'); // don't clobber a load-failure message
}
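// On hover, bracket the hovered token's ID in the ID pane and outline its chip.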
function highlight(i, on){
const ids = Array.isArray(state.ids) ? state.ids : [];
if (!ids.length) return;
const parts = ids.map((id, idx) => (idx === i && on) ? `[${id}]` : String(id));
idsEl.textContent = parts.join(' ');
const chip = tokensEl.querySelector(`[data-i="${i}"]`);
if (chip) chip.classList.toggle('active', on);
}
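// Wire events: re-tokenize as the user types (debounced) or switches models.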
const debounced = debounce(tokenize, 200);
inputEl.addEventListener('input', debounced);
modelSel.addEventListener('change', async ()=>{
tokenizer = null; // force reload
await loadTokenizer(modelSel.value);
tokenize();
});
// Initial load
await loadTokenizer(modelSel.value);
tokenize();
</script>
</body>
</html>