PeterPinetree committed
Commit c0e798c · verified · 1 Parent(s): 06efbe9

Update index.html

Files changed (1)
  1. index.html +40 -22
index.html CHANGED
@@ -30,6 +30,10 @@
 .chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
 .chip:hover{background:var(--chip-hover);border-color:var(--accent)}
 .chip.active{outline:2px solid var(--accent)}
+.chip.special {
+  border-color: #38bdf8;
+  background: #0b2235;
+}
 pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
 .caption{color:var(--muted);font-size:.9rem;margin-top:.5rem}
 footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
@@ -148,36 +152,49 @@
 
   status('Tokenizing…');
   try {
-    const enc = await tokenizer.encode(text); // returns EITHER an array OR an object, depending on tokenizer
-
-    // >>> handle both shapes
-    let ids = Array.isArray(enc)
+    const enc = await tokenizer.encode(text); // include specials (default)
+    // Handle both array/object return shapes
+    const ids = Array.isArray(enc)
       ? enc
       : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
 
-    // Drop special tokens (for GPT-2, usually [50256])
-    const specials = new Set(tokenizer.all_special_ids || []);
-    const idsNoSpecials = ids.filter(id => !specials.has(id));
+    // Map special IDs -> special token strings (if available)
+    const specialIds = Array.from(tokenizer.all_special_ids || []);
+    const specialTokens = Array.from(tokenizer.all_special_tokens || []);
+    const idToSpecial = new Map(specialIds.map((id, i) => [id, specialTokens[i]]));
 
-    // Derive token strings from IDs
+    // Build token strings for every ID (specials included)
     let tokens = [];
     if (typeof tokenizer.convert_ids_to_tokens === 'function') {
-      tokens = tokenizer.convert_ids_to_tokens(idsNoSpecials);
+      tokens = tokenizer.convert_ids_to_tokens(ids);
     } else if (typeof tokenizer.id_to_token === 'function') {
-      tokens = idsNoSpecials.map(id => tokenizer.id_to_token(id));
-    } else if (!Array.isArray(enc)) {
-      // Some builds expose enc.tokens when enc is an object
-      const encTokens = Array.isArray(enc.tokens) ? enc.tokens : [];
-      tokens = encTokens.filter((_, i) => !specials.has(ids[i]));
+      tokens = ids.map(id => tokenizer.id_to_token(id));
+    } else if (!Array.isArray(enc) && Array.isArray(enc.tokens)) {
+      tokens = enc.tokens;
     } else {
-      // Last resort: stringify IDs (shouldn’t be needed with GPT-2)
-      tokens = idsNoSpecials.map(String);
+      // Fallback: decode each ID as a single-piece token
+      tokens = ids.map(id =>
+        tokenizer.decode([id], {
+          // we WANT specials in the stream; decode may return "" for them
+          skip_special_tokens: false,
+          clean_up_tokenization_spaces: false,
+        })
+      );
     }
 
+    // Ensure specials are visible: if a special token decodes to empty,
+    // replace it with its canonical name or a generic tag.
+    tokens = tokens.map((tok, i) => {
+      const id = ids[i];
+      if (tok && tok.length) return tok;
+      if (idToSpecial.has(id)) return idToSpecial.get(id); // e.g., <|endoftext|> for GPT-2
+      return `<special:${id}>`;
+    });
+
     if (myRun !== runId) return;
 
     state.tokens = tokens;
-    state.ids = idsNoSpecials;
+    state.ids = ids; // include specials in the count
     render();
     status(`Done. ${state.tokens.length} tokens.`);
   } catch (e) {
@@ -190,25 +207,26 @@
 function render(){
   const tokens = Array.isArray(state.tokens) ? state.tokens : [];
   const ids = Array.isArray(state.ids) ? state.ids : [];
-
-  // Tokens pane
+
+  const specialSet = new Set(tokenizer.all_special_ids || []);
+
   tokensEl.innerHTML = '';
   tokens.forEach((tok, i) => {
     const chip = document.createElement('span');
     chip.className = 'chip';
+    if (specialSet.has(ids[i])) chip.classList.add('special'); // <-- highlight specials
    chip.dataset.i = i;
     chip.textContent = tok;
     chip.addEventListener('mouseenter', ()=>highlight(i, true));
     chip.addEventListener('mouseleave', ()=>highlight(i, false));
     tokensEl.appendChild(chip);
   });
-
-  // IDs pane
+
   idsEl.textContent = ids.join(' ');
-
   if (tokens.length === 0) status('Type to tokenize…');
 }
 
+
 function highlight(i, on){
   const ids = Array.isArray(state.ids) ? state.ids : [];
   if (!ids.length) return;
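
A note on the encode shapes: depending on the build, tokenizer.encode() in Transformers.js-style bindings returns either a plain array of IDs or an object carrying ids / input_ids, which is why the hunk above normalizes both before doing anything else. A minimal, self-contained sketch of the same normalize-then-patch-specials logic, run against a mock tokenizer (the mock object and its IDs are invented here for illustration and are not part of the commit):

// Mock tokenizer (illustrative only): object-shaped encode result,
// one special token, and a decode() that returns "" for that special.
const mockTokenizer = {
  all_special_ids: [50256],
  all_special_tokens: ['<|endoftext|>'],
  encode: async () => ({ ids: [15496, 995, 50256] }),
  decode: ([id]) => (id === 50256 ? '' : `piece_${id}`),
};

async function demo() {
  const enc = await mockTokenizer.encode('Hello world');
  // Normalize both return shapes, as the hunk above does
  const ids = Array.isArray(enc)
    ? enc
    : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];

  // Pair each special ID with its canonical string
  const idToSpecial = new Map(
    (mockTokenizer.all_special_ids || []).map((id, i) => [
      id, (mockTokenizer.all_special_tokens || [])[i],
    ])
  );

  // Decode-per-ID fallback, then make empty specials visible
  const tokens = ids
    .map(id => mockTokenizer.decode([id]))
    .map((tok, i) =>
      tok && tok.length ? tok : (idToSpecial.get(ids[i]) ?? `<special:${ids[i]}>`));

  console.log(tokens); // ['piece_15496', 'piece_995', '<|endoftext|>']
}
demo();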
 
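The render-side change is a pure classification step: any ID present in tokenizer.all_special_ids gets the 'special' class, which the new .chip.special rule tints. A DOM-free sketch of just that step (classifyChips is a name invented here, not from the commit):

// Decide which chips get the 'special' class; mirrors the check added
// in render(). classifyChips is illustrative only.
function classifyChips(ids, allSpecialIds) {
  const specialSet = new Set(allSpecialIds || []);
  return ids.map(id => (specialSet.has(id) ? 'chip special' : 'chip'));
}

// GPT-2 has a single special token, <|endoftext|> (ID 50256)
console.log(classifyChips([15496, 995, 50256], [50256]));
// -> ['chip', 'chip', 'chip special']

Recomputing the set on every render is cheap here (GPT-2 has one special ID), so hoisting it out of render() is optional.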