milwright committed on
Commit
b37374a
·
1 Parent(s): d2c7203

fix: improve word selection and passage quality filtering

Browse files

- Add passage quality checks to skip texts with excessive caps/numbers
- Update AI prompt to avoid malformed words and OCR errors
- Remove hardcoded word filtering in favor of AI-driven selection
- Increase max word length from 10 to 12 characters

Files changed (2) hide show
  1. src/aiService.js +11 -2
  2. src/clozeGameEngine.js +42 -14
src/aiService.js CHANGED
@@ -121,10 +121,19 @@ class OpenRouterService {
121
  model: this.model,
122
  messages: [{
123
  role: 'system',
124
- content: 'Select words for cloze reading exercises. Choose common, everyday words that students know. Avoid proper nouns (names, places), technical terms, archaic words, and words over 8 letters. Pick words students can guess from surrounding context.'
125
  }, {
126
  role: 'user',
127
- content: `Select exactly ${count} meaningful content words for a cloze exercise. Choose nouns, verbs, adjectives that are 4+ letters long and important to meaning. NEVER select: articles (a, an, the), prepositions (in, on, at, to, for, of, with, by, from), conjunctions (and, or, but), pronouns (I, you, he, she, it, they), or auxiliary verbs (is, are, was, were, have, has, had). Return ONLY a JSON array of words.
 
 
 
 
 
 
 
 
 
128
 
129
  Passage: "${passage}"`
130
  }],
 
121
  model: this.model,
122
  messages: [{
123
  role: 'system',
124
+ content: 'You are a vocabulary selector for educational cloze exercises. Select meaningful, properly-spelled content words that appear exactly as written in the passage.'
125
  }, {
126
  role: 'user',
127
+ content: `Select exactly ${count} words from this passage for a cloze exercise.
128
+
129
+ REQUIREMENTS:
130
+ - Choose clear, properly-spelled words (no OCR errors like "andsatires")
131
+ - Select meaningful nouns, verbs, or adjectives (4-12 letters)
132
+ - Words must appear EXACTLY as written in the passage
133
+ - Avoid: function words, archaic terms, proper nouns, technical jargon
134
+ - Skip any words that look malformed or concatenated
135
+
136
+ Return ONLY a JSON array of the selected words.
137
 
138
  Passage: "${passage}"`
139
  }],
src/clozeGameEngine.js CHANGED
@@ -85,13 +85,17 @@ class ClozeGame {
85
  const startFromMiddle = Math.floor(textLength * 0.3); // Skip first 30%
86
  const endAtThreeQuarters = Math.floor(textLength * 0.8); // Stop before last 20%
87
 
88
- // Random position in the middle section
89
- const availableLength = endAtThreeQuarters - startFromMiddle;
90
- const randomOffset = Math.floor(Math.random() * Math.max(0, availableLength - 1000));
91
- const startIndex = startFromMiddle + randomOffset;
92
-
93
- // Extract longer initial passage for better sentence completion
94
- let passage = text.substring(startIndex, startIndex + 1000);
 
 
 
 
95
 
96
  // Clean up start - find first complete sentence that starts with capital letter
97
  const firstSentenceMatch = passage.match(/[.!?]\s+([A-Z][^.!?]*)/);
@@ -106,12 +110,36 @@ class ClozeGame {
106
  }
107
  }
108
 
109
- // Clean up end - ensure we end at a complete sentence
110
- const sentences = passage.split(/(?<=[.!?])\s+/);
111
- if (sentences.length > 1) {
112
- // Remove the last sentence if it might be incomplete
113
- sentences.pop();
114
- passage = sentences.join(' ');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  }
116
 
117
  // Ensure minimum length - if too short, return what we have rather than infinite recursion
@@ -333,7 +361,7 @@ class ClozeGame {
333
  const contentWordIndices = [];
334
  words.forEach((word, index) => {
335
  const cleanWord = word.toLowerCase().replace(/[^\w]/g, '');
336
- if (cleanWord.length > 3 && cleanWord.length <= 10 && !functionWords.has(cleanWord)) {
337
  contentWordIndices.push({ word: cleanWord, index });
338
  }
339
  });
 
85
  const startFromMiddle = Math.floor(textLength * 0.3); // Skip first 30%
86
  const endAtThreeQuarters = Math.floor(textLength * 0.8); // Stop before last 20%
87
 
88
+ let attempts = 0;
89
+ let passage = '';
90
+
91
+ while (attempts < 5) {
92
+ // Random position in the middle section
93
+ const availableLength = endAtThreeQuarters - startFromMiddle;
94
+ const randomOffset = Math.floor(Math.random() * Math.max(0, availableLength - 1000));
95
+ const startIndex = startFromMiddle + randomOffset;
96
+
97
+ // Extract longer initial passage for better sentence completion
98
+ passage = text.substring(startIndex, startIndex + 1000);
99
 
100
  // Clean up start - find first complete sentence that starts with capital letter
101
  const firstSentenceMatch = passage.match(/[.!?]\s+([A-Z][^.!?]*)/);
 
110
  }
111
  }
112
 
113
+ // Clean up end - ensure we end at a complete sentence
114
+ const sentences = passage.split(/(?<=[.!?])\s+/);
115
+ if (sentences.length > 1) {
116
+ // Remove the last sentence if it might be incomplete
117
+ sentences.pop();
118
+ passage = sentences.join(' ');
119
+ }
120
+
121
+ // Quality check: reject passages with excessive caps, numbers, or special formatting
122
+ const words = passage.split(/\s+/);
123
+ const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
124
+ const numbersCount = words.filter(w => /\d/.test(w)).length;
125
+ const totalWords = words.length;
126
+
127
+ // Skip if more than 10% caps or 5% numbers
128
+ if (capsCount / totalWords > 0.1 || numbersCount / totalWords > 0.05) {
129
+ console.log(`Skipping passage with ${capsCount} caps and ${numbersCount} numbers out of ${totalWords} words`);
130
+ attempts++;
131
+ continue;
132
+ }
133
+
134
+ // Check for other quality issues
135
+ if (passage.includes('CHAPTER') || passage.includes('Section') ||
136
+ passage.match(/\b(Fig\.|Table|Illustration)\b/)) {
137
+ attempts++;
138
+ continue;
139
+ }
140
+
141
+ // Good passage found
142
+ break;
143
  }
144
 
145
  // Ensure minimum length - if too short, return what we have rather than infinite recursion
 
361
  const contentWordIndices = [];
362
  words.forEach((word, index) => {
363
  const cleanWord = word.toLowerCase().replace(/[^\w]/g, '');
364
+ if (cleanWord.length > 3 && cleanWord.length <= 12 && !functionWords.has(cleanWord)) {
365
  contentWordIndices.push({ word: cleanWord, index });
366
  }
367
  });