Spaces:
Running
Running
File size: 8,793 Bytes
71afaf8 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 9783073 5832682 c994801 65b682e d407658 c994801 5832682 c994801 5832682 c994801 5832682 ef0f6be d407658 5832682 c994801 ef0f6be c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 5832682 c994801 77c778e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>TokenVisualizer — Minimal</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
<style>
:root{
--bg:#0b0f14; --text:#ffffff; --muted:#9aa4b2; --accent:#38bdf8; --border:#1f2a3a;
--card1:#0c1624; --card2:#0a1220; --chip:#111827; --chip-border:#263246; --chip-hover:#1a2434;
--mono:'JetBrains Mono',ui-monospace,Menlo,Consolas,monospace; --sans:Inter,system-ui,-apple-system,"Segoe UI",Roboto,Ubuntu,"Helvetica Neue",Arial;
}
*{box-sizing:border-box} body{margin:0;background:radial-gradient(900px 500px at 10% -10%, #07314a, transparent),var(--bg);color:var(--text);font-family:var(--sans)}
.container{max-width:1100px;margin:0 auto;padding:1.25rem}
header{padding-top:1.5rem} h1{margin:.2rem 0 .4rem;font-size:1.9rem}
.sub{color:var(--muted);margin:.25rem 0 1rem}
.card{background:linear-gradient(180deg,var(--card1),var(--card2));border:1px solid var(--border);border-radius:14px;padding:1rem;box-shadow:0 10px 40px rgba(0,0,0,.35)}
label span{color:var(--muted);font-size:.9rem}
select,textarea{width:100%;border-radius:10px;border:1px solid var(--border);background:#0a1220;color:var(--text);padding:.7rem .85rem;outline:none}
select:focus,textarea:focus{border-color:var(--accent)}
.controls{display:grid;gap:.8rem;margin-bottom:1rem}
.row{display:flex;gap:.75rem;align-items:center}
.status{color:var(--muted)}
.grid{display:grid;gap:1rem;grid-template-columns:1fr}
@media (min-width:900px){.grid{grid-template-columns:1fr 1fr}}
.head{display:flex;align-items:center;justify-content:space-between;margin-bottom:.5rem}
.tokens{display:flex;flex-wrap:wrap;gap:.5rem;max-height:360px;overflow:auto;padding:.25rem}
.chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
.chip:hover{background:var(--chip-hover);border-color:var(--accent)}
.chip.active{outline:2px solid var(--accent)}
pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
.caption{color:var(--muted);font-size:.9rem;margin-top:.5rem}
footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
a{color:var(--accent)}
</style>
</head>
<body>
<header class="container">
<h1>TokenVisualizer</h1>
<p class="sub">Live view of tokens and token IDs. Powered by Transformers.js — all in your browser.</p>
</header>
<main class="container">
<section class="card controls">
<label>
<span>Model</span>
<select id="model">
<!-- Tip: keep this first so the demo works instantly once you upload /assets/gpt2/* -->
<option value="local:gpt2">GPT-2 (local, fast)</option>
<option value="Xenova/gpt2">GPT-2 (Hub)</option>
<option value="Xenova/llama2-tokenizer">Llama-2 (Hub)</option>
<option value="Xenova/mistral-tokenizer">Mistral (Hub)</option>
<option value="Xenova/gemma-tokenizer">Gemma (Hub)</option>
<option value="Xenova/bert-base-uncased">BERT Base Uncased (Hub)</option>
</select>
</label>
<label>
<span>Text</span>
<textarea id="input" rows="3">Hello world! This is a tokenizer demo.</textarea>
</label>
<div class="row">
<span id="status" class="status">Loading tokenizer…</span>
</div>
</section>
<section class="grid">
<article class="card">
<div class="head"><h3>Tokens</h3></div>
<div id="tokens" class="tokens"></div>
<p class="caption">Tokens are subword chunks the model learned from lots of text.</p>
</article>
<article class="card">
<div class="head"><h3>Token IDs</h3></div>
<pre id="ids" class="ids"></pre>
<p class="caption">IDs are how the model “sees” tokens — just numbers.</p>
</article>
</section>
</main>
<footer class="container">
<small>Built by Peter Adams • Powered by <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
</footer>
<!-- Minimal, robust script (no copy/export) -->
<script type="module">
// Prefer keeping all requests on huggingface.co to avoid CORS/VPN issues.
// Option 1 (simple): CDN import (works on many networks).
// The package specifier must be `@xenova/transformers@<version>`; the version
// part of this URL had been replaced by an e-mail-obfuscation placeholder,
// which made the import unresolvable.
// NOTE(review): 2.17.2 is assumed here — confirm the release this page was
// built against before deploying.
const tf = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2');
// Option 2 (bulletproof): self-host the file in your Space and use:
// const tf = await import('./assets/vendor/transformers.min.js');

// Library-wide settings, applied once before any tokenizer is requested.
tf.env.useBrowserCache = true;
tf.env.allowLocalModels = true; // <-- REQUIRED for local folder paths
// Local model folders (e.g. 'assets/gpt2') are resolved relative to the site root.
tf.env.localModelPath = '/';
// Shorthand for looking up the first element that matches a CSS selector.
const $ = (selector) => document.querySelector(selector);

// Resolve every element the script touches once, up front.
const [modelSel, inputEl, statusEl, tokensEl, idsEl] =
  ['#model', '#input', '#status', '#tokens', '#ids'].map((selector) => $(selector));

// Single state object; never reassign (only its fields are replaced).
const state = { tokens: [], ids: [] };

// Currently loaded tokenizer, or null while none is available.
let tokenizer = null;
// Counter bumped by every tokenize() call so that older runs can be detected.
let runId = 0;
/**
 * Replace the text shown in the status line.
 * @param {string} msg - Message to display.
 */
const status = (msg) => {
  statusEl.textContent = msg;
};
/**
 * Wrap `fn` so that it only runs once calls have stopped for `ms` milliseconds.
 * Each call cancels the previously scheduled run; the run that finally fires
 * receives the arguments of the most recent call.
 * @param {Function} fn - Function to delay.
 * @param {number} [ms=200] - Quiet period in milliseconds.
 * @returns {Function} The debounced wrapper (returns nothing itself).
 */
const debounce = (fn, ms = 200) => {
  let pendingTimer;
  return (...args) => {
    clearTimeout(pendingTimer);
    pendingTimer = setTimeout(() => fn(...args), ms);
  };
};
// Id of the most recent loadTokenizer() call. A load that finishes after a
// newer one has started compares against this and leaves shared state alone.
let latestLoadId = 0;

/**
 * Load the tokenizer for `modelId` into the shared `tokenizer` variable and
 * report progress through status().
 *
 * The special id 'local:gpt2' is mapped to the 'assets/gpt2' folder; every
 * other id is handed to AutoTokenizer.from_pretrained unchanged.
 *
 * Never rejects: on failure the error is logged, `tokenizer` is set to null
 * and a failure message is shown. When several loads overlap (e.g. the user
 * switches models quickly), only the most recently requested one is allowed to
 * update `tokenizer` and the status line, so a slow earlier request can no
 * longer replace the tokenizer of the model that is currently selected.
 *
 * @param {string} modelId - Value of the selected model option.
 * @returns {Promise<void>}
 */
async function loadTokenizer(modelId){
  const loadId = ++latestLoadId;
  status('Loading tokenizer…');
  // Note: no double slashes, no /resolve/main – just your folder.
  const source = modelId === 'local:gpt2' ? 'assets/gpt2' : modelId;
  try {
    const loaded = await tf.AutoTokenizer.from_pretrained(source);
    if (loadId !== latestLoadId) return; // superseded while downloading
    tokenizer = loaded;
    status('Tokenizer ready.');
  } catch (e) {
    console.error('Tokenizer load failed:', e);
    // A stale failure must not wipe a tokenizer that a newer load provided.
    if (loadId !== latestLoadId) return;
    tokenizer = null;
    status('Failed to load tokenizer (network blocked or slow). Try GPT-2 or a different VPN route.');
  }
}
/**
 * Bring an encode() result into the { tokens, ids } shape the UI stores.
 * Transformers.js returns a plain array of ids from encode(); in that case the
 * token strings are recovered by decoding each id on its own. An object that
 * already carries `tokens` / `ids` arrays is accepted as-is.
 */
function normalizeEncoding(tok, enc){
  let ids = [];
  if (Array.isArray(enc)) {
    ids = enc;
  } else if (Array.isArray(enc?.ids)) {
    ids = enc.ids;
  }
  const tokens = Array.isArray(enc?.tokens)
    ? enc.tokens
    : ids.map((id) => tok.decode([id]));
  return { tokens, ids };
}

/**
 * Tokenize the current textarea content and show the result.
 *
 * Loads the selected model's tokenizer first when none is available. Results
 * (and errors) from a run that has been overtaken by a newer tokenize() call
 * are dropped, so the panes always reflect the latest input. Never rejects for
 * load or encode failures; those are reported through status().
 *
 * @returns {Promise<void>}
 */
async function tokenize(){
  const myRun = ++runId;
  if (!tokenizer) {
    await loadTokenizer(modelSel.value);
    if (myRun !== runId) return; // a newer run took over while loading
    // Load failed: keep the previous output and the failure message that
    // loadTokenizer() put in the status line.
    if (!tokenizer) return;
  }
  // Snapshot so a model switch during encode() cannot mix two tokenizers.
  const active = tokenizer;
  const text = (inputEl.value ?? '').trim();
  if (!text) {
    state.tokens = [];
    state.ids = [];
    render();
    status('Type to tokenize…');
    return;
  }
  status('Tokenizing…');
  try {
    const enc = await active.encode(text);
    if (myRun !== runId) return; // drop stale results
    const { tokens, ids } = normalizeEncoding(active, enc);
    state.tokens = tokens;
    state.ids = ids;
    render();
    status(`Done. ${state.tokens.length} tokens.`);
  } catch (e) {
    console.error('Tokenize failed:', e);
    if (myRun !== runId) return; // a newer run owns the status line now
    // State is untouched, so the last good output stays on screen.
    status('Error tokenizing. See console.');
  }
}
/**
 * Redraw both output panes from `state`.
 *
 * Tokens pane: one `.chip` span per token, tagged with its index in
 * `data-i`, whose hover handlers call highlight() for that index.
 * IDs pane: the ids joined by single spaces.
 *
 * This function only draws; it deliberately leaves the status line alone so
 * that a message set by the caller (for example a load failure) is not
 * replaced as a side effect of re-rendering an empty result.
 */
function render(){
  const tokens = Array.isArray(state.tokens) ? state.tokens : [];
  const ids = Array.isArray(state.ids) ? state.ids : [];

  // Tokens pane: build off-document, then swap in with a single DOM update.
  const fragment = document.createDocumentFragment();
  tokens.forEach((tok, i) => {
    const chip = document.createElement('span');
    chip.className = 'chip';
    chip.dataset.i = i;
    chip.textContent = tok; // textContent, so token text is never parsed as HTML
    chip.addEventListener('mouseenter', () => highlight(i, true));
    chip.addEventListener('mouseleave', () => highlight(i, false));
    fragment.appendChild(chip);
  });
  tokensEl.replaceChildren(fragment);

  // IDs pane
  idsEl.textContent = ids.join(' ');
}
/**
 * Mark or unmark token `i` in both panes.
 * The IDs pane is rewritten with the i-th id wrapped in square brackets while
 * `on` is true (and plain otherwise); the chip carrying `data-i="i"` gets its
 * `active` class switched to match. Does nothing when there are no ids.
 * @param {number} i - Index of the token being hovered.
 * @param {boolean} on - true to highlight, false to clear.
 */
function highlight(i, on){
  const idList = Array.isArray(state.ids) ? state.ids : [];
  if (idList.length === 0) {
    return;
  }

  const formatId = (value, position) => {
    if (on && position === i) {
      return `[${value}]`;
    }
    return String(value);
  };
  idsEl.textContent = idList.map(formatId).join(' ');

  const target = tokensEl.querySelector(`[data-i="${i}"]`);
  if (!target) {
    return;
  }
  target.classList.toggle('active', on);
}
// Re-tokenize shortly after the user stops typing.
const debounced = debounce(tokenize, 200);
inputEl.addEventListener('input', debounced);

// Last-resort reporter so a rejected tokenize() is never left unhandled.
const reportTokenizeError = (e) => { console.error('Tokenize failed:', e); };

// Switching models: discard the current tokenizer and let tokenize() load the
// newly selected one. tokenize() already loads whenever `tokenizer` is null,
// so loading here as well made every failed load get attempted twice.
modelSel.addEventListener('change', () => {
  tokenizer = null; // force reload
  tokenize().catch(reportTokenizeError);
});

// Initial load: tokenize() fetches the tokenizer for the preselected model
// and then renders the default text.
tokenize().catch(reportTokenizeError);
</script>
</body>
</html>
|