PeterPinetree committed
Commit eba1e2c · verified · Parent: edf8cab

Update index.html

Files changed (1)
  1. index.html +21 -21
index.html CHANGED
@@ -148,36 +148,36 @@
 
   status('Tokenizing…');
   try {
-    // 1) Get IDs (no options arg!)
-    const enc = await tokenizer.encode(text);
-
-    console.log('enc =', enc); // See what the tokenizer returns
-    // robustly pluck ids out of whatever shape the lib returns
-    let ids = (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
-    console.log('ids=', ids, 'specials=', tokenizer.all_special_ids);
-
-    // 2) Drop special tokens (e.g., BOS/EOS) for the demo
+    const enc = await tokenizer.encode(text); // returns EITHER an array OR an object, depending on tokenizer
+
+    // >>> handle both shapes
+    let ids = Array.isArray(enc)
+      ? enc
+      : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
+
+    // Drop special tokens (for GPT-2, usually [50256])
     const specials = new Set(tokenizer.all_special_ids || []);
     const idsNoSpecials = ids.filter(id => !specials.has(id));
-
-    // 3) Turn IDs into token strings
+
+    // Derive token strings from IDs
     let tokens = [];
     if (typeof tokenizer.convert_ids_to_tokens === 'function') {
       tokens = tokenizer.convert_ids_to_tokens(idsNoSpecials);
     } else if (typeof tokenizer.id_to_token === 'function') {
       tokens = idsNoSpecials.map(id => tokenizer.id_to_token(id));
-    } else if (Array.isArray(enc.tokens)) {
-      // fallback: some builds expose tokens directly (may include specials)
-      tokens = enc.tokens.filter((_, i) => !specials.has(ids[i]));
+    } else if (!Array.isArray(enc)) {
+      // Some builds expose enc.tokens when enc is an object
+      const encTokens = Array.isArray(enc.tokens) ? enc.tokens : [];
+      tokens = encTokens.filter((_, i) => !specials.has(ids[i]));
     } else {
+      // Last resort: stringify IDs (shouldn’t be needed with GPT-2)
       tokens = idsNoSpecials.map(String);
     }
-
-    if (myRun !== runId) return; // drop stale result
-
-    state.tokens = Array.isArray(tokens) ? tokens : [];
-    state.ids = Array.isArray(idsNoSpecials) ? idsNoSpecials : [];
-
+
+    if (myRun !== runId) return;
+
+    state.tokens = tokens;
+    state.ids = idsNoSpecials;
     render();
     status(`Done. ${state.tokens.length} tokens.`);
   } catch (e) {
@@ -185,7 +185,7 @@
     render();
     status('Error tokenizing. See console.');
   }
-}
+
 
 function render(){
   const tokens = Array.isArray(state.tokens) ? state.tokens : [];
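
Taken on its own, the shape-handling this commit introduces can be read as a small helper. The sketch below is illustrative only, not the app's actual code: it assumes a tokenizer object with the same surface used in the diff (encode, all_special_ids, and optionally convert_ids_to_tokens or id_to_token), and the helper name idsAndTokensFrom is hypothetical, not part of any library.

// Minimal sketch (assumed API surface, as used in the diff above):
// normalize whatever tokenizer.encode() returns, either a plain id
// array or an Encoding-like object, into { ids, tokens }.
async function idsAndTokensFrom(tokenizer, text) {
  const enc = await tokenizer.encode(text);

  // Shape 1: a plain array of ids. Shape 2: an object carrying them
  // under ids / input_ids / inputIds.
  const ids = Array.isArray(enc)
    ? enc
    : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];

  // Strip special tokens (e.g., GPT-2's end-of-text id 50256).
  const specials = new Set(tokenizer.all_special_ids || []);
  const kept = ids.filter(id => !specials.has(id));

  // Prefer the tokenizer's own id-to-string mapping when it exists.
  let tokens;
  if (typeof tokenizer.convert_ids_to_tokens === 'function') {
    tokens = tokenizer.convert_ids_to_tokens(kept);
  } else if (typeof tokenizer.id_to_token === 'function') {
    tokens = kept.map(id => tokenizer.id_to_token(id));
  } else {
    tokens = kept.map(String); // last resort: show raw ids
  }
  return { ids: kept, tokens };
}

A caller would do const { ids, tokens } = await idsAndTokensFrom(tokenizer, text); and keep the diff's myRun !== runId check at the call site, since that guard exists to drop results from a stale run when the input has changed in the meantime.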