Update index.html
index.html CHANGED (+21 -21)
@@ -148,36 +148,36 @@
 
     status('Tokenizing…');
     try {
-
-
-
-
-
-
-
-
-      // 2) Drop special tokens (e.g., BOS/EOS) for the demo
+      const enc = await tokenizer.encode(text); // returns EITHER an array OR an object, depending on tokenizer
+
+      // >>> handle both shapes
+      let ids = Array.isArray(enc)
+        ? enc
+        : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
+
+      // Drop special tokens (for GPT-2, usually [50256])
       const specials = new Set(tokenizer.all_special_ids || []);
       const idsNoSpecials = ids.filter(id => !specials.has(id));
-
-      //
+
+      // Derive token strings from IDs
       let tokens = [];
       if (typeof tokenizer.convert_ids_to_tokens === 'function') {
        tokens = tokenizer.convert_ids_to_tokens(idsNoSpecials);
      } else if (typeof tokenizer.id_to_token === 'function') {
        tokens = idsNoSpecials.map(id => tokenizer.id_to_token(id));
-      } else if (Array.isArray(enc
-      //
-
+      } else if (!Array.isArray(enc)) {
+        // Some builds expose enc.tokens when enc is an object
+        const encTokens = Array.isArray(enc.tokens) ? enc.tokens : [];
+        tokens = encTokens.filter((_, i) => !specials.has(ids[i]));
      } else {
+        // Last resort: stringify IDs (shouldn’t be needed with GPT-2)
        tokens = idsNoSpecials.map(String);
      }
-
-      if (myRun !== runId) return;
-
-      state.tokens =
-      state.ids
-
+
+      if (myRun !== runId) return;
+
+      state.tokens = tokens;
+      state.ids = idsNoSpecials;
      render();
      status(`Done. ${state.tokens.length} tokens.`);
    } catch (e) {
@@ -185,7 +185,7 @@
      render();
      status('Error tokenizing. See console.');
    }
-
+
 
    function render(){
      const tokens = Array.isArray(state.tokens) ? state.tokens : [];