Spaces:
Running
Running
Update index.html
Browse files- index.html +40 -22
index.html
CHANGED
@@ -30,6 +30,10 @@
|
|
30 |
.chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
|
31 |
.chip:hover{background:var(--chip-hover);border-color:var(--accent)}
|
32 |
.chip.active{outline:2px solid var(--accent)}
|
|
|
|
|
|
|
|
|
33 |
pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
|
34 |
.caption{color:var(--muted);font-size:.9rem;margin-top:.5rem}
|
35 |
footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
|
@@ -148,36 +152,49 @@
|
|
148 |
|
149 |
status('Tokenizing…');
|
150 |
try {
|
151 |
-
const enc = await tokenizer.encode(text); //
|
152 |
-
|
153 |
-
|
154 |
-
let ids = Array.isArray(enc)
|
155 |
? enc
|
156 |
: (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
|
157 |
|
158 |
-
//
|
159 |
-
const
|
160 |
-
const
|
|
|
161 |
|
162 |
-
//
|
163 |
let tokens = [];
|
164 |
if (typeof tokenizer.convert_ids_to_tokens === 'function') {
|
165 |
-
tokens = tokenizer.convert_ids_to_tokens(
|
166 |
} else if (typeof tokenizer.id_to_token === 'function') {
|
167 |
-
tokens =
|
168 |
-
} else if (!Array.isArray(enc)) {
|
169 |
-
|
170 |
-
const encTokens = Array.isArray(enc.tokens) ? enc.tokens : [];
|
171 |
-
tokens = encTokens.filter((_, i) => !specials.has(ids[i]));
|
172 |
} else {
|
173 |
-
//
|
174 |
-
tokens =
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
}
|
176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
if (myRun !== runId) return;
|
178 |
|
179 |
state.tokens = tokens;
|
180 |
-
state.ids =
|
181 |
render();
|
182 |
status(`Done. ${state.tokens.length} tokens.`);
|
183 |
} catch (e) {
|
@@ -190,25 +207,26 @@
|
|
190 |
function render(){
|
191 |
const tokens = Array.isArray(state.tokens) ? state.tokens : [];
|
192 |
const ids = Array.isArray(state.ids) ? state.ids : [];
|
193 |
-
|
194 |
-
|
|
|
195 |
tokensEl.innerHTML = '';
|
196 |
tokens.forEach((tok, i) => {
|
197 |
const chip = document.createElement('span');
|
198 |
chip.className = 'chip';
|
|
|
199 |
chip.dataset.i = i;
|
200 |
chip.textContent = tok;
|
201 |
chip.addEventListener('mouseenter', ()=>highlight(i, true));
|
202 |
chip.addEventListener('mouseleave', ()=>highlight(i, false));
|
203 |
tokensEl.appendChild(chip);
|
204 |
});
|
205 |
-
|
206 |
-
// IDs pane
|
207 |
idsEl.textContent = ids.join(' ');
|
208 |
-
|
209 |
if (tokens.length === 0) status('Type to tokenize…');
|
210 |
}
|
211 |
|
|
|
212 |
function highlight(i, on){
|
213 |
const ids = Array.isArray(state.ids) ? state.ids : [];
|
214 |
if (!ids.length) return;
|
|
|
30 |
.chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
|
31 |
.chip:hover{background:var(--chip-hover);border-color:var(--accent)}
|
32 |
.chip.active{outline:2px solid var(--accent)}
|
33 |
+
.chip.special {
|
34 |
+
border-color: #38bdf8;
|
35 |
+
background: #0b2235;
|
36 |
+
}
|
37 |
pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
|
38 |
.caption{color:var(--muted);font-size:.9rem;margin-top:.5rem}
|
39 |
footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
|
|
|
152 |
|
153 |
status('Tokenizing…');
|
154 |
try {
|
155 |
+
const enc = await tokenizer.encode(text); // include specials (default)
|
156 |
+
// Handle both array/object return shapes
|
157 |
+
const ids = Array.isArray(enc)
|
|
|
158 |
? enc
|
159 |
: (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
|
160 |
|
161 |
+
// Map special IDs -> special token strings (if available)
|
162 |
+
const specialIds = Array.from(tokenizer.all_special_ids || []);
|
163 |
+
const specialTokens = Array.from(tokenizer.all_special_tokens || []);
|
164 |
+
const idToSpecial = new Map(specialIds.map((id, i) => [id, specialTokens[i]]));
|
165 |
|
166 |
+
// Build token strings for every ID (specials included)
|
167 |
let tokens = [];
|
168 |
if (typeof tokenizer.convert_ids_to_tokens === 'function') {
|
169 |
+
tokens = tokenizer.convert_ids_to_tokens(ids);
|
170 |
} else if (typeof tokenizer.id_to_token === 'function') {
|
171 |
+
tokens = ids.map(id => tokenizer.id_to_token(id));
|
172 |
+
} else if (!Array.isArray(enc) && Array.isArray(enc.tokens)) {
|
173 |
+
tokens = enc.tokens;
|
|
|
|
|
174 |
} else {
|
175 |
+
// Fallback: decode each ID as a single-piece token
|
176 |
+
tokens = ids.map(id =>
|
177 |
+
tokenizer.decode([id], {
|
178 |
+
// we WANT specials in the stream; decode may return "" for them
|
179 |
+
skip_special_tokens: false,
|
180 |
+
clean_up_tokenization_spaces: false,
|
181 |
+
})
|
182 |
+
);
|
183 |
}
|
184 |
|
185 |
+
// Ensure specials are visible: if a special token decodes to empty,
|
186 |
+
// replace it with its canonical name or a generic tag.
|
187 |
+
tokens = tokens.map((tok, i) => {
|
188 |
+
const id = ids[i];
|
189 |
+
if (tok && tok.length) return tok;
|
190 |
+
if (idToSpecial.has(id)) return idToSpecial.get(id); // e.g., <|endoftext|> for GPT-2
|
191 |
+
return `<special:${id}>`;
|
192 |
+
});
|
193 |
+
|
194 |
if (myRun !== runId) return;
|
195 |
|
196 |
state.tokens = tokens;
|
197 |
+
state.ids = ids; // include specials in the count
|
198 |
render();
|
199 |
status(`Done. ${state.tokens.length} tokens.`);
|
200 |
} catch (e) {
|
|
|
207 |
function render(){
|
208 |
const tokens = Array.isArray(state.tokens) ? state.tokens : [];
|
209 |
const ids = Array.isArray(state.ids) ? state.ids : [];
|
210 |
+
|
211 |
+
const specialSet = new Set(tokenizer.all_special_ids || []);
|
212 |
+
|
213 |
tokensEl.innerHTML = '';
|
214 |
tokens.forEach((tok, i) => {
|
215 |
const chip = document.createElement('span');
|
216 |
chip.className = 'chip';
|
217 |
+
if (specialSet.has(ids[i])) chip.classList.add('special'); // <-- highlight specials
|
218 |
chip.dataset.i = i;
|
219 |
chip.textContent = tok;
|
220 |
chip.addEventListener('mouseenter', ()=>highlight(i, true));
|
221 |
chip.addEventListener('mouseleave', ()=>highlight(i, false));
|
222 |
tokensEl.appendChild(chip);
|
223 |
});
|
224 |
+
|
|
|
225 |
idsEl.textContent = ids.join(' ');
|
|
|
226 |
if (tokens.length === 0) status('Type to tokenize…');
|
227 |
}
|
228 |
|
229 |
+
|
230 |
function highlight(i, on){
|
231 |
const ids = Array.isArray(state.ids) ? state.ids : [];
|
232 |
if (!ids.length) return;
|