milwright committed on
Commit
e26fd03
·
1 Parent(s): bac89b2

Improve content filtering and word validation

Browse files

- Enhanced passage quality filtering with statistical analysis instead of hardcoded patterns
- Added problematic word filtering to prevent inappropriate vocabulary selection
- Improved fallback logic when AI word selection fails validation
- Better narrative content detection to avoid technical/reference material

Files changed (2) hide show
  1. src/aiService.js +26 -3
  2. src/clozeGameEngine.js +28 -13
src/aiService.js CHANGED
@@ -169,6 +169,7 @@ REQUIREMENTS:
169
  - Words must appear EXACTLY as written in the passage
170
  - Avoid: capitalized words, ALL-CAPS words, function words, archaic terms, proper nouns, technical jargon
171
  - Skip any words that look malformed or concatenated
 
172
  - NEVER select words from the first or last sentence/clause of the passage
173
  - Choose words from the middle portions for better context dependency
174
 
@@ -205,9 +206,16 @@ Passage: "${passage}"`
205
  try {
206
  const words = JSON.parse(content);
207
  if (Array.isArray(words)) {
208
- // Validate word lengths based on level
 
209
  const validWords = words.filter(word => {
210
  const cleanWord = word.replace(/[^a-zA-Z]/g, '');
 
 
 
 
 
 
211
  if (level <= 2) {
212
  return cleanWord.length >= 4 && cleanWord.length <= 7;
213
  } else if (level <= 4) {
@@ -230,9 +238,16 @@ Passage: "${passage}"`
230
  const matches = content.match(/"([^"]+)"/g);
231
  if (matches) {
232
  const words = matches.map(m => m.replace(/"/g, ''));
233
- // Validate word lengths
 
234
  const validWords = words.filter(word => {
235
  const cleanWord = word.replace(/[^a-zA-Z]/g, '');
 
 
 
 
 
 
236
  if (level <= 2) {
237
  return cleanWord.length >= 4 && cleanWord.length <= 7;
238
  } else if (level <= 4) {
@@ -316,6 +331,7 @@ SELECTION RULES:
316
  - Select EXACTLY ${blanksPerPassage} word${blanksPerPassage > 1 ? 's' : ''} per passage, no more, no less
317
  - Choose meaningful nouns, verbs, or adjectives (${wordLengthConstraint})
318
  - Avoid capitalized words, ALL-CAPS words, and table of contents entries
 
319
  - NEVER select words from the first or last sentence/clause of each passage
320
  - Choose words from the middle portions for better context dependency
321
  - Words must appear EXACTLY as written in the passage
@@ -411,10 +427,17 @@ Return as JSON: {"passage1": {...}, "passage2": {...}}`
411
  parsed.passage1.words = parsed.passage1.words.filter(word => word && word.trim() !== '');
412
  parsed.passage2.words = parsed.passage2.words.filter(word => word && word.trim() !== '');
413
 
414
- // Validate word lengths based on level
415
  const validateWords = (words) => {
 
416
  return words.filter(word => {
417
  const cleanWord = word.replace(/[^a-zA-Z]/g, '');
 
 
 
 
 
 
418
  if (level <= 2) {
419
  return cleanWord.length >= 4 && cleanWord.length <= 7;
420
  } else if (level <= 4) {
 
169
  - Words must appear EXACTLY as written in the passage
170
  - Avoid: capitalized words, ALL-CAPS words, function words, archaic terms, proper nouns, technical jargon
171
  - Skip any words that look malformed or concatenated
172
+ - Avoid dated or potentially offensive terms
173
  - NEVER select words from the first or last sentence/clause of the passage
174
  - Choose words from the middle portions for better context dependency
175
 
 
206
  try {
207
  const words = JSON.parse(content);
208
  if (Array.isArray(words)) {
209
+ // Filter problematic words and validate word lengths based on level
210
+ const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
211
  const validWords = words.filter(word => {
212
  const cleanWord = word.replace(/[^a-zA-Z]/g, '');
213
+ const lowerWord = cleanWord.toLowerCase();
214
+
215
+ // Skip problematic words
216
+ if (problematicWords.includes(lowerWord)) return false;
217
+
218
+ // Check length constraints
219
  if (level <= 2) {
220
  return cleanWord.length >= 4 && cleanWord.length <= 7;
221
  } else if (level <= 4) {
 
238
  const matches = content.match(/"([^"]+)"/g);
239
  if (matches) {
240
  const words = matches.map(m => m.replace(/"/g, ''));
241
+ // Filter problematic words and validate word lengths
242
+ const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
243
  const validWords = words.filter(word => {
244
  const cleanWord = word.replace(/[^a-zA-Z]/g, '');
245
+ const lowerWord = cleanWord.toLowerCase();
246
+
247
+ // Skip problematic words
248
+ if (problematicWords.includes(lowerWord)) return false;
249
+
250
+ // Check length constraints
251
  if (level <= 2) {
252
  return cleanWord.length >= 4 && cleanWord.length <= 7;
253
  } else if (level <= 4) {
 
331
  - Select EXACTLY ${blanksPerPassage} word${blanksPerPassage > 1 ? 's' : ''} per passage, no more, no less
332
  - Choose meaningful nouns, verbs, or adjectives (${wordLengthConstraint})
333
  - Avoid capitalized words, ALL-CAPS words, and table of contents entries
334
+ - Avoid dated or potentially offensive terms
335
  - NEVER select words from the first or last sentence/clause of each passage
336
  - Choose words from the middle portions for better context dependency
337
  - Words must appear EXACTLY as written in the passage
 
427
  parsed.passage1.words = parsed.passage1.words.filter(word => word && word.trim() !== '');
428
  parsed.passage2.words = parsed.passage2.words.filter(word => word && word.trim() !== '');
429
 
430
+ // Filter problematic words and validate word lengths based on level
431
  const validateWords = (words) => {
432
+ const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
433
  return words.filter(word => {
434
  const cleanWord = word.replace(/[^a-zA-Z]/g, '');
435
+ const lowerWord = cleanWord.toLowerCase();
436
+
437
+ // Skip problematic words
438
+ if (problematicWords.includes(lowerWord)) return false;
439
+
440
+ // Check length constraints
441
  if (level <= 2) {
442
  return cleanWord.length >= 4 && cleanWord.length <= 7;
443
  } else if (level <= 4) {
src/clozeGameEngine.js CHANGED
@@ -148,22 +148,37 @@ class ClozeGame {
148
  passage = sentences.join(' ');
149
  }
150
 
151
- // Quality check: reject passages with excessive caps, numbers, or special formatting
152
  const words = passage.split(/\s+/);
153
- const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
154
- const numbersCount = words.filter(w => /\d/.test(w)).length;
155
  const totalWords = words.length;
156
 
157
- // Skip if more than 10% caps or 5% numbers
158
- if (capsCount / totalWords > 0.1 || numbersCount / totalWords > 0.05) {
159
- console.log(`Skipping passage with ${capsCount} caps and ${numbersCount} numbers out of ${totalWords} words`);
160
- attempts++;
161
- continue;
162
- }
163
-
164
- // Check for other quality issues
165
- if (passage.includes('CHAPTER') || passage.includes('Section') ||
166
- passage.match(/\b(Fig\.|Table|Illustration)\b/)) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  attempts++;
168
  continue;
169
  }
 
148
  passage = sentences.join(' ');
149
  }
150
 
151
+ // Enhanced quality check based on narrative flow characteristics
152
  const words = passage.split(/\s+/);
 
 
153
  const totalWords = words.length;
154
 
155
+ // Count various quality indicators
156
+ const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
157
+ const numbersCount = words.filter(w => /\d/.test(w)).length;
158
+ const shortWords = words.filter(w => w.length <= 3).length;
159
+ const punctuationMarks = (passage.match(/[;:()[\]{}]/g) || []).length;
160
+ const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
161
+
162
+ // Calculate quality ratios
163
+ const capsRatio = capsCount / totalWords;
164
+ const numbersRatio = numbersCount / totalWords;
165
+ const shortWordRatio = shortWords / totalWords;
166
+ const punctuationRatio = punctuationMarks / totalWords;
167
+ const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
168
+
169
+ // Reject if passage shows signs of being technical/reference material
170
+ let qualityScore = 0;
171
+ let issues = [];
172
+
173
+ if (capsRatio > 0.05) { qualityScore += capsRatio * 20; issues.push(`caps: ${Math.round(capsRatio * 100)}%`); }
174
+ if (numbersRatio > 0.03) { qualityScore += numbersRatio * 30; issues.push(`numbers: ${Math.round(numbersRatio * 100)}%`); }
175
+ if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
176
+ if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
177
+ if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
178
+
179
+ // Reject if quality score indicates technical/non-narrative content
180
+ if (qualityScore > 3) {
181
+ console.log(`Skipping low-quality passage (score: ${qualityScore.toFixed(1)}, issues: ${issues.join(', ')})`);
182
  attempts++;
183
  continue;
184
  }