PeterPinetree committed on
Commit
5832682
·
verified ·
1 Parent(s): e78cadc

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +98 -139
index.html CHANGED
@@ -2,51 +2,36 @@
2
  <html lang="en">
3
  <head>
4
  <meta charset="utf-8" />
5
- <meta
6
- name="viewport"
7
- content="width=device-width, initial-scale=1"
8
- />
9
- <title>TokenVisualizer — Two-Pane</title>
10
  <link rel="preconnect" href="https://fonts.googleapis.com">
11
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
12
  <style>
13
  :root{
14
- --bg:#0b0f14; /* black-leaning bg */
15
- --text:#ffffff; /* white */
16
- --muted:#9aa4b2;
17
- --accent:#38bdf8; /* sky blue */
18
- --card:#0e1624;
19
- --border:#1f2a3a;
20
- --chip:#111827;
21
- --chip-border:#263246;
22
- --chip-hover:#1a2434;
23
- --mono:'JetBrains Mono', ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
24
- --sans:Inter, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, 'Helvetica Neue', Arial;
25
  }
26
- *{box-sizing:border-box}
27
- body{margin:0;background:radial-gradient(900px 500px at 10% -10%, #07314a, transparent),var(--bg);color:var(--text);font-family:var(--sans)}
28
  .container{max-width:1100px;margin:0 auto;padding:1.25rem}
29
- header{padding-top:1.5rem}
30
- h1{margin:.2rem 0 .4rem;font-size:1.9rem}
31
  .sub{color:var(--muted);margin:.25rem 0 1rem}
32
- .card{background:linear-gradient(180deg,#0c1624,#0a1220);border:1px solid var(--border);border-radius:14px;padding:1rem;box-shadow:0 10px 40px rgba(0,0,0,.35)}
33
  label span{color:var(--muted);font-size:.9rem}
34
  select,textarea{width:100%;border-radius:10px;border:1px solid var(--border);background:#0a1220;color:var(--text);padding:.7rem .85rem;outline:none}
35
  select:focus,textarea:focus{border-color:var(--accent)}
36
  .controls{display:grid;gap:.8rem;margin-bottom:1rem}
37
  .row{display:flex;gap:.75rem;align-items:center}
38
- .btn{background:var(--accent);color:#07222d;border:0;border-radius:10px;padding:.55rem .95rem;font-weight:600;cursor:pointer}
39
- .btn.secondary{background:#152236;color:var(--text);border:1px solid var(--border)}
40
  .status{color:var(--muted)}
41
  .grid{display:grid;gap:1rem;grid-template-columns:1fr}
42
  @media (min-width:900px){.grid{grid-template-columns:1fr 1fr}}
43
  .head{display:flex;align-items:center;justify-content:space-between;margin-bottom:.5rem}
44
- .actions .link{background:none;border:none;color:var(--accent);cursor:pointer;margin-left:.5rem}
45
  .tokens{display:flex;flex-wrap:wrap;gap:.5rem;max-height:360px;overflow:auto;padding:.25rem}
46
  .chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
47
  .chip:hover{background:var(--chip-hover);border-color:var(--accent)}
48
  .chip.active{outline:2px solid var(--accent)}
49
  pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
 
50
  footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
51
  a{color:var(--accent)}
52
  </style>
@@ -54,7 +39,7 @@
54
  <body>
55
  <header class="container">
56
  <h1>TokenVisualizer</h1>
57
- <p class="sub">See tokens and token IDs at the same time. (Runs entirely in your browser.)</p>
58
  </header>
59
 
60
  <main class="container">
@@ -62,11 +47,13 @@
62
  <label>
63
  <span>Model</span>
64
  <select id="model">
65
- <option value="Xenova/gpt2">GPT-2 (BPE)</option>
66
- <option value="Xenova/llama2-tokenizer">Llama-2 (SentencePiece/BPE)</option>
67
- <option value="Xenova/mistral-tokenizer">Mistral (SentencePiece/BPE)</option>
68
- <option value="Xenova/gemma-tokenizer">Gemma (SentencePiece/BPE)</option>
69
- <option value="Xenova/bert-base-uncased">BERT Base Uncased (WordPiece)</option>
 
 
70
  </select>
71
  </label>
72
  <label>
@@ -74,176 +61,148 @@
74
  <textarea id="input" rows="3">Hello world! This is a tokenizer demo.</textarea>
75
  </label>
76
  <div class="row">
77
- <button id="tokenize" class="btn">Tokenize</button>
78
  <span id="status" class="status">Loading tokenizer…</span>
79
  </div>
80
  </section>
81
 
82
  <section class="grid">
83
  <article class="card">
84
- <div class="head">
85
- <h3>Tokens</h3>
86
- <div class="actions">
87
- <button id="copyTokens" class="link">Copy</button>
88
- <button id="exportTokens" class="link">Export JSON</button>
89
- </div>
90
- </div>
91
  <div id="tokens" class="tokens"></div>
 
92
  </article>
93
 
94
  <article class="card">
95
- <div class="head">
96
- <h3>Token IDs</h3>
97
- <div class="actions">
98
- <button id="copyIds" class="link">Copy</button>
99
- <button id="exportIds" class="link">Export JSON</button>
100
- </div>
101
- </div>
102
  <pre id="ids" class="ids"></pre>
 
103
  </article>
104
  </section>
105
-
106
- <section class="container" style="text-align:right;margin:1rem 0">
107
- <button id="exportCSV" class="btn secondary">Download CSV (index, token, id)</button>
108
- </section>
109
  </main>
110
 
111
  <footer class="container">
112
  <small>Built by Peter Adams • Powered by <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
113
  </footer>
114
 
115
- <!-- Transformers.js (ESM) -->
116
  <script type="module">
 
 
117
  const tf = await import('https://cdn.jsdelivr.net/npm/@xenova/[email protected]');
 
 
 
118
  tf.env.useBrowserCache = true;
119
- tf.env.allowLocalModels = false;
120
-
121
- const $ = sel => document.querySelector(sel);
122
- const el = {
123
- model: $('#model'),
124
- input: $('#input'),
125
- btn: $('#tokenize'),
126
- status: $('#status'),
127
- tokens: $('#tokens'),
128
- ids: $('#ids'),
129
- copyTokens: $('#copyTokens'),
130
- exportTokens: $('#exportTokens'),
131
- copyIds: $('#copyIds'),
132
- exportIds: $('#exportIds'),
133
- exportCSV: $('#exportCSV'),
134
- };
135
 
 
 
 
 
 
 
 
 
 
136
  let tokenizer = null;
137
- let last = { tokens: [], ids: [] };
138
  let runId = 0;
139
 
140
// Write a message into the status line.
function status(msg) {
  el.status.textContent = msg;
}

// Wrap `fn` so it runs only after calls have stopped for `ms` milliseconds.
// Only the arguments of the most recent call are forwarded.
function debounce(fn, ms = 250) {
  let timerId;
  return (...args) => {
    clearTimeout(timerId);
    timerId = setTimeout(() => fn(...args), ms);
  };
}
142
 
143
  async function loadTokenizer(modelId){
144
  status('Loading tokenizer…');
145
- tokenizer = await tf.AutoTokenizer.from_pretrained(modelId);
146
- status('Tokenizer ready.');
 
 
 
 
 
 
 
 
 
 
 
147
  }
148
 
149
  async function tokenize(){
150
  const myRun = ++runId;
151
- if (!tokenizer) await loadTokenizer(el.model.value);
 
 
 
152
 
153
- const text = el.input.value ?? '';
154
- if (!text.trim()){
155
- last = { tokens: [], ids: [] };
 
156
  render();
157
  status('Type to tokenize…');
158
  return;
159
  }
160
 
161
  status('Tokenizing…');
162
- try{
163
  const enc = await tokenizer.encode(text);
164
- if (myRun !== runId) return; // stale run
165
- last.tokens = enc.tokens;
166
- last.ids = enc.ids;
 
 
 
 
 
 
167
  render();
168
- status(`Done. ${last.tokens.length} tokens.`);
169
- }catch(err){
170
- console.error(err);
171
  status('Error tokenizing. See console.');
172
  }
173
  }
174
 
175
  function render(){
 
 
 
176
  // Tokens pane
177
- el.tokens.innerHTML = '';
178
- last.tokens.forEach((tok, i)=>{
179
- const span = document.createElement('span');
180
- span.className = 'chip';
181
- span.dataset.idx = i;
182
- span.textContent = tok;
183
- span.addEventListener('mouseenter', ()=>highlightID(i,true));
184
- span.addEventListener('mouseleave', ()=>highlightID(i,false));
185
- el.tokens.appendChild(span);
186
  });
187
 
188
  // IDs pane
189
- el.ids.textContent = last.ids.join(' ');
190
- if (!last.tokens.length) status('Type to tokenize…');
191
- }
192
 
193
// Bracket (on) or un-bracket (off) the id at position `i` in the ids pane and
// toggle the `active` class on the matching token chip.
function highlightID(i, on){
  if (last.ids.length === 0) return;

  const rendered = [];
  last.ids.forEach((id, pos) => {
    rendered.push(on && pos === i ? `[${id}]` : String(id));
  });
  el.ids.textContent = rendered.join(' ');

  const chip = el.tokens.querySelector(`[data-idx="${i}"]`);
  if (chip) {
    chip.classList.toggle('active', on);
  }
}
200
 
201
// Copy / Export

// Copy `text` to the clipboard and report the outcome in the status line.
// navigator.clipboard.writeText() rejects when permission is denied or the
// document is not focused; catching it here avoids an unhandled rejection
// from the async click handlers.
async function copyWithStatus(text, okMsg){
  try {
    await navigator.clipboard.writeText(text);
    status(okMsg);
  } catch (err) {
    console.error(err);
    status('Copy failed. See console.');
  }
}

el.copyTokens.addEventListener('click', async ()=>{
  if (!last.tokens.length) return;
  await copyWithStatus(last.tokens.join(' '), 'Tokens copied.');
});
el.exportTokens.addEventListener('click', ()=>{
  download('tokens.json', JSON.stringify(last.tokens, null, 2), 'application/json');
});
el.copyIds.addEventListener('click', async ()=>{
  if (!last.ids.length) return;
  await copyWithStatus(last.ids.join(' '), 'IDs copied.');
});
el.exportIds.addEventListener('click', ()=>{
  download('ids.json', JSON.stringify(last.ids, null, 2), 'application/json');
});
// One CSV row per token: position, token text, token id.
el.exportCSV.addEventListener('click', ()=>{
  if (!last.tokens.length) return;
  const rows = last.tokens.map((t,i)=>[i,t,last.ids[i]]);
  const csv = [['index','token','id'], ...rows].map(r=>r.map(csvCell).join(',')).join('\n');
  download('tokens_and_ids.csv', csv, 'text/csv');
});
224
 
225
// Format one value as a CSV field. Fields containing a double quote, comma,
// carriage return or line feed are wrapped in double quotes, with embedded
// quotes doubled; everything else is returned as its plain string form.
function csvCell(v){
  const s = String(v);
  // \r is included so a bare carriage return cannot split a row.
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g,'""')}"` : s;
}
229
// Trigger a browser download of `data` as a file called `name` with MIME
// type `type`, using a temporary object URL and a synthetic anchor click.
function download(name, data, type){
  const blob = new Blob([data], {type});
  const url = URL.createObjectURL(blob);
  const a = Object.assign(document.createElement('a'), {href:url, download:name});
  a.click();
  // Revoke on a later task: revoking synchronously right after click() can
  // cancel the download before the browser has started reading the blob.
  setTimeout(() => URL.revokeObjectURL(url), 0);
}
235
 
236
// Events
el.btn.addEventListener('click', tokenize);

// Typing re-tokenizes once input has been idle for 250 ms.
const onInput = debounce(tokenize, 250);
el.input.addEventListener('input', onInput);

// Picking another model drops the current tokenizer, loads the new one and
// re-tokenizes the text already in the box.
el.model.addEventListener('change', async () => {
  tokenizer = null;
  await loadTokenizer(el.model.value);
  tokenize();
});

// Initial load
await loadTokenizer(el.model.value);
tokenize();
248
  </script>
249
  </body>
 
2
  <html lang="en">
3
  <head>
4
  <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <title>TokenVisualizer — Minimal</title>
 
 
 
7
  <link rel="preconnect" href="https://fonts.googleapis.com">
8
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
9
  <style>
10
  :root{
11
+ --bg:#0b0f14; --text:#ffffff; --muted:#9aa4b2; --accent:#38bdf8; --border:#1f2a3a;
12
+ --card1:#0c1624; --card2:#0a1220; --chip:#111827; --chip-border:#263246; --chip-hover:#1a2434;
13
+ --mono:'JetBrains Mono',ui-monospace,Menlo,Consolas,monospace; --sans:Inter,system-ui,-apple-system,"Segoe UI",Roboto,Ubuntu,"Helvetica Neue",Arial;
 
 
 
 
 
 
 
 
14
  }
15
+ *{box-sizing:border-box} body{margin:0;background:radial-gradient(900px 500px at 10% -10%, #07314a, transparent),var(--bg);color:var(--text);font-family:var(--sans)}
 
16
  .container{max-width:1100px;margin:0 auto;padding:1.25rem}
17
+ header{padding-top:1.5rem} h1{margin:.2rem 0 .4rem;font-size:1.9rem}
 
18
  .sub{color:var(--muted);margin:.25rem 0 1rem}
19
+ .card{background:linear-gradient(180deg,var(--card1),var(--card2));border:1px solid var(--border);border-radius:14px;padding:1rem;box-shadow:0 10px 40px rgba(0,0,0,.35)}
20
  label span{color:var(--muted);font-size:.9rem}
21
  select,textarea{width:100%;border-radius:10px;border:1px solid var(--border);background:#0a1220;color:var(--text);padding:.7rem .85rem;outline:none}
22
  select:focus,textarea:focus{border-color:var(--accent)}
23
  .controls{display:grid;gap:.8rem;margin-bottom:1rem}
24
  .row{display:flex;gap:.75rem;align-items:center}
 
 
25
  .status{color:var(--muted)}
26
  .grid{display:grid;gap:1rem;grid-template-columns:1fr}
27
  @media (min-width:900px){.grid{grid-template-columns:1fr 1fr}}
28
  .head{display:flex;align-items:center;justify-content:space-between;margin-bottom:.5rem}
 
29
  .tokens{display:flex;flex-wrap:wrap;gap:.5rem;max-height:360px;overflow:auto;padding:.25rem}
30
  .chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
31
  .chip:hover{background:var(--chip-hover);border-color:var(--accent)}
32
  .chip.active{outline:2px solid var(--accent)}
33
  pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
34
+ .caption{color:var(--muted);font-size:.9rem;margin-top:.5rem}
35
  footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
36
  a{color:var(--accent)}
37
  </style>
 
39
  <body>
40
  <header class="container">
41
  <h1>TokenVisualizer</h1>
42
+ <p class="sub">Live view of tokens and token IDs. Powered by Transformers.js all in your browser.</p>
43
  </header>
44
 
45
  <main class="container">
 
47
  <label>
48
  <span>Model</span>
49
  <select id="model">
50
+ <!-- Tip: keep this first so the demo works instantly once you upload /assets/gpt2/* -->
51
+ <option value="local:gpt2">GPT-2 (local, fast)</option>
52
+ <option value="Xenova/gpt2">GPT-2 (Hub)</option>
53
+ <option value="Xenova/llama2-tokenizer">Llama-2 (Hub)</option>
54
+ <option value="Xenova/mistral-tokenizer">Mistral (Hub)</option>
55
+ <option value="Xenova/gemma-tokenizer">Gemma (Hub)</option>
56
+ <option value="Xenova/bert-base-uncased">BERT Base Uncased (Hub)</option>
57
  </select>
58
  </label>
59
  <label>
 
61
  <textarea id="input" rows="3">Hello world! This is a tokenizer demo.</textarea>
62
  </label>
63
  <div class="row">
 
64
  <span id="status" class="status">Loading tokenizer…</span>
65
  </div>
66
  </section>
67
 
68
  <section class="grid">
69
  <article class="card">
70
+ <div class="head"><h3>Tokens</h3></div>
 
 
 
 
 
 
71
  <div id="tokens" class="tokens"></div>
72
+ <p class="caption">Tokens are subword chunks the model learned from lots of text.</p>
73
  </article>
74
 
75
  <article class="card">
76
+ <div class="head"><h3>Token IDs</h3></div>
 
 
 
 
 
 
77
  <pre id="ids" class="ids"></pre>
78
+ <p class="caption">IDs are how the model “sees” tokens — just numbers.</p>
79
  </article>
80
  </section>
 
 
 
 
81
  </main>
82
 
83
  <footer class="container">
84
  <small>Built by Peter Adams • Powered by <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
85
  </footer>
86
 
87
+ <!-- Minimal, robust script (no copy/export) -->
88
  <script type="module">
89
// Prefer keeping all requests on huggingface.co to avoid CORS/VPN issues.
// Option 1 (simple): CDN import (works on many networks)
// NOTE(review): the package specifier on the scraped page was mangled to
// "[email protected]" by e-mail obfuscation. "transformers@2.17.2" is an assumed
// reconstruction — confirm the intended version pin.
const tf = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2');
// Option 2 (bulletproof): self-host the file in your Space and use:
// const tf = await import('./assets/vendor/transformers.min.js');

// Keep downloaded tokenizer files in the browser cache between visits.
tf.env.useBrowserCache = true;

// Shorthand for document.querySelector, and the nodes the script touches.
const $ = s => document.querySelector(s);
const modelSel = $('#model');
const inputEl = $('#input');
const statusEl = $('#status');
const tokensEl = $('#tokens');
const idsEl = $('#ids');

// Single state object; never reassign
const state = { tokens: [], ids: [] };
// Active tokenizer instance; null until one has loaded (or after a failed load).
let tokenizer = null;
// Bumped on every tokenize() call so results of superseded calls can be dropped.
let runId = 0;
108
 
109
// Write a message into the status line.
const status = (msg) => {
  statusEl.textContent = msg;
};

// Wrap `fn` so it runs only after calls have stopped for `ms` milliseconds.
// Only the arguments of the most recent call are forwarded.
const debounce = (fn, ms = 200) => {
  let pending;
  return (...args) => {
    clearTimeout(pending);
    pending = setTimeout(() => fn(...args), ms);
  };
};
111
 
112
  async function loadTokenizer(modelId){
113
  status('Loading tokenizer…');
114
+ try {
115
+ if (modelId === 'local:gpt2') {
116
+ // Requires you to upload /assets/gpt2/* files into the Space
117
+ tokenizer = await tf.AutoTokenizer.from_pretrained('./assets/gpt2/');
118
+ } else {
119
+ tokenizer = await tf.AutoTokenizer.from_pretrained(modelId);
120
+ }
121
+ status('Tokenizer ready.');
122
+ } catch (e) {
123
+ console.error('Tokenizer load failed:', e);
124
+ tokenizer = null;
125
+ status('Failed to load tokenizer (network blocked or slow). Try GPT-2 or a different VPN route.');
126
+ }
127
  }
128
 
129
  async function tokenize(){
130
  const myRun = ++runId;
131
+ if (!tokenizer) {
132
+ await loadTokenizer(modelSel.value);
133
+ if (!tokenizer) { render(); return; } // keep UI stable on failure
134
+ }
135
 
136
+ const text = (inputEl.value ?? '').trim();
137
+ if (!text) {
138
+ state.tokens = [];
139
+ state.ids = [];
140
  render();
141
  status('Type to tokenize…');
142
  return;
143
  }
144
 
145
  status('Tokenizing…');
146
+ try {
147
  const enc = await tokenizer.encode(text);
148
+ if (myRun !== runId) return; // drop stale results
149
+ state.tokens = Array.isArray(enc?.tokens) ? enc.tokens : [];
150
+ state.ids = Array.isArray(enc?.ids) ? enc.ids : [];
151
+ render();
152
+ status(`Done. ${state.tokens.length} tokens.`);
153
+ } catch (e) {
154
+ console.error('Tokenize failed:', e);
155
+ state.tokens = state.tokens ?? [];
156
+ state.ids = state.ids ?? [];
157
  render();
 
 
 
158
  status('Error tokenizing. See console.');
159
  }
160
  }
161
 
162
  function render(){
163
+ const tokens = Array.isArray(state.tokens) ? state.tokens : [];
164
+ const ids = Array.isArray(state.ids) ? state.ids : [];
165
+
166
  // Tokens pane
167
+ tokensEl.innerHTML = '';
168
+ tokens.forEach((tok, i) => {
169
+ const chip = document.createElement('span');
170
+ chip.className = 'chip';
171
+ chip.dataset.i = i;
172
+ chip.textContent = tok;
173
+ chip.addEventListener('mouseenter', ()=>highlight(i, true));
174
+ chip.addEventListener('mouseleave', ()=>highlight(i, false));
175
+ tokensEl.appendChild(chip);
176
  });
177
 
178
  // IDs pane
179
+ idsEl.textContent = ids.join(' ');
 
 
180
 
181
+ if (tokens.length === 0) status('Type to tokenize…');
 
 
 
 
 
182
  }
183
 
184
// Bracket (on) or un-bracket (off) the id at position `i` in the ids pane and
// toggle the `active` class on the matching token chip.
function highlight(i, on){
  const idList = Array.isArray(state.ids) ? state.ids : [];
  if (idList.length === 0) return;

  idsEl.textContent = idList
    .map((id, pos) => (on && pos === i ? `[${id}]` : String(id)))
    .join(' ');

  const chipEl = tokensEl.querySelector(`[data-i="${i}"]`);
  if (chipEl) {
    chipEl.classList.toggle('active', on);
  }
}
194
 
195
// Typing re-tokenizes once input has been idle for 200 ms.
inputEl.addEventListener('input', debounce(tokenize, 200));

// Picking another model drops the current tokenizer, loads the new one and
// re-tokenizes the text already in the box.
modelSel.addEventListener('change', async () => {
  tokenizer = null; // force reload
  await loadTokenizer(modelSel.value);
  tokenize();
});

// Initial load
await loadTokenizer(modelSel.value);
tokenize();
207
  </script>
208
  </body>