Spaces:
Running
Running
Improve content filtering and word validation
Browse files
- Enhanced passage quality filtering with statistical analysis instead of hardcoded patterns
- Added problematic word filtering to prevent inappropriate vocabulary selection
- Improved fallback logic when AI word selection fails validation
- Better narrative content detection to avoid technical/reference material
- src/aiService.js +26 -3
- src/clozeGameEngine.js +28 -13
src/aiService.js
CHANGED
@@ -169,6 +169,7 @@ REQUIREMENTS:
|
|
169 |
- Words must appear EXACTLY as written in the passage
|
170 |
- Avoid: capitalized words, ALL-CAPS words, function words, archaic terms, proper nouns, technical jargon
|
171 |
- Skip any words that look malformed or concatenated
|
|
|
172 |
- NEVER select words from the first or last sentence/clause of the passage
|
173 |
- Choose words from the middle portions for better context dependency
|
174 |
|
@@ -205,9 +206,16 @@ Passage: "${passage}"`
|
|
205 |
try {
|
206 |
const words = JSON.parse(content);
|
207 |
if (Array.isArray(words)) {
|
208 |
-
//
|
|
|
209 |
const validWords = words.filter(word => {
|
210 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
if (level <= 2) {
|
212 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
213 |
} else if (level <= 4) {
|
@@ -230,9 +238,16 @@ Passage: "${passage}"`
|
|
230 |
const matches = content.match(/"([^"]+)"/g);
|
231 |
if (matches) {
|
232 |
const words = matches.map(m => m.replace(/"/g, ''));
|
233 |
-
//
|
|
|
234 |
const validWords = words.filter(word => {
|
235 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
if (level <= 2) {
|
237 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
238 |
} else if (level <= 4) {
|
@@ -316,6 +331,7 @@ SELECTION RULES:
|
|
316 |
- Select EXACTLY ${blanksPerPassage} word${blanksPerPassage > 1 ? 's' : ''} per passage, no more, no less
|
317 |
- Choose meaningful nouns, verbs, or adjectives (${wordLengthConstraint})
|
318 |
- Avoid capitalized words, ALL-CAPS words, and table of contents entries
|
|
|
319 |
- NEVER select words from the first or last sentence/clause of each passage
|
320 |
- Choose words from the middle portions for better context dependency
|
321 |
- Words must appear EXACTLY as written in the passage
|
@@ -411,10 +427,17 @@ Return as JSON: {"passage1": {...}, "passage2": {...}}`
|
|
411 |
parsed.passage1.words = parsed.passage1.words.filter(word => word && word.trim() !== '');
|
412 |
parsed.passage2.words = parsed.passage2.words.filter(word => word && word.trim() !== '');
|
413 |
|
414 |
-
//
|
415 |
const validateWords = (words) => {
|
|
|
416 |
return words.filter(word => {
|
417 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
|
|
|
|
|
|
|
|
|
|
|
|
418 |
if (level <= 2) {
|
419 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
420 |
} else if (level <= 4) {
|
|
|
169 |
- Words must appear EXACTLY as written in the passage
|
170 |
- Avoid: capitalized words, ALL-CAPS words, function words, archaic terms, proper nouns, technical jargon
|
171 |
- Skip any words that look malformed or concatenated
|
172 |
+
- Avoid dated or potentially offensive terms
|
173 |
- NEVER select words from the first or last sentence/clause of the passage
|
174 |
- Choose words from the middle portions for better context dependency
|
175 |
|
|
|
206 |
try {
|
207 |
const words = JSON.parse(content);
|
208 |
if (Array.isArray(words)) {
|
209 |
+
// Filter problematic words and validate word lengths based on level
|
210 |
+
const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
|
211 |
const validWords = words.filter(word => {
|
212 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
213 |
+
const lowerWord = cleanWord.toLowerCase();
|
214 |
+
|
215 |
+
// Skip problematic words
|
216 |
+
if (problematicWords.includes(lowerWord)) return false;
|
217 |
+
|
218 |
+
// Check length constraints
|
219 |
if (level <= 2) {
|
220 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
221 |
} else if (level <= 4) {
|
|
|
238 |
const matches = content.match(/"([^"]+)"/g);
|
239 |
if (matches) {
|
240 |
const words = matches.map(m => m.replace(/"/g, ''));
|
241 |
+
// Filter problematic words and validate word lengths
|
242 |
+
const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
|
243 |
const validWords = words.filter(word => {
|
244 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
245 |
+
const lowerWord = cleanWord.toLowerCase();
|
246 |
+
|
247 |
+
// Skip problematic words
|
248 |
+
if (problematicWords.includes(lowerWord)) return false;
|
249 |
+
|
250 |
+
// Check length constraints
|
251 |
if (level <= 2) {
|
252 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
253 |
} else if (level <= 4) {
|
|
|
331 |
- Select EXACTLY ${blanksPerPassage} word${blanksPerPassage > 1 ? 's' : ''} per passage, no more, no less
|
332 |
- Choose meaningful nouns, verbs, or adjectives (${wordLengthConstraint})
|
333 |
- Avoid capitalized words, ALL-CAPS words, and table of contents entries
|
334 |
+
- Avoid dated or potentially offensive terms
|
335 |
- NEVER select words from the first or last sentence/clause of each passage
|
336 |
- Choose words from the middle portions for better context dependency
|
337 |
- Words must appear EXACTLY as written in the passage
|
|
|
427 |
parsed.passage1.words = parsed.passage1.words.filter(word => word && word.trim() !== '');
|
428 |
parsed.passage2.words = parsed.passage2.words.filter(word => word && word.trim() !== '');
|
429 |
|
430 |
+
// Filter problematic words and validate word lengths based on level
|
431 |
const validateWords = (words) => {
|
432 |
+
const problematicWords = ['negro', 'retard', 'retarded', 'nigger', 'chinaman', 'jap', 'gypsy', 'savage', 'primitive', 'heathen'];
|
433 |
return words.filter(word => {
|
434 |
const cleanWord = word.replace(/[^a-zA-Z]/g, '');
|
435 |
+
const lowerWord = cleanWord.toLowerCase();
|
436 |
+
|
437 |
+
// Skip problematic words
|
438 |
+
if (problematicWords.includes(lowerWord)) return false;
|
439 |
+
|
440 |
+
// Check length constraints
|
441 |
if (level <= 2) {
|
442 |
return cleanWord.length >= 4 && cleanWord.length <= 7;
|
443 |
} else if (level <= 4) {
|
src/clozeGameEngine.js
CHANGED
@@ -148,22 +148,37 @@ class ClozeGame {
|
|
148 |
passage = sentences.join(' ');
|
149 |
}
|
150 |
|
151 |
-
//
|
152 |
const words = passage.split(/\s+/);
|
153 |
-
const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
|
154 |
-
const numbersCount = words.filter(w => /\d/.test(w)).length;
|
155 |
const totalWords = words.length;
|
156 |
|
157 |
-
//
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
//
|
165 |
-
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
attempts++;
|
168 |
continue;
|
169 |
}
|
|
|
148 |
passage = sentences.join(' ');
|
149 |
}
|
150 |
|
151 |
+
// Enhanced quality check based on narrative flow characteristics
|
152 |
const words = passage.split(/\s+/);
|
|
|
|
|
153 |
const totalWords = words.length;
|
154 |
|
155 |
+
// Count various quality indicators
|
156 |
+
const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
|
157 |
+
const numbersCount = words.filter(w => /\d/.test(w)).length;
|
158 |
+
const shortWords = words.filter(w => w.length <= 3).length;
|
159 |
+
const punctuationMarks = (passage.match(/[;:()[\]{}]/g) || []).length;
|
160 |
+
const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
|
161 |
+
|
162 |
+
// Calculate quality ratios
|
163 |
+
const capsRatio = capsCount / totalWords;
|
164 |
+
const numbersRatio = numbersCount / totalWords;
|
165 |
+
const shortWordRatio = shortWords / totalWords;
|
166 |
+
const punctuationRatio = punctuationMarks / totalWords;
|
167 |
+
const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
|
168 |
+
|
169 |
+
// Reject if passage shows signs of being technical/reference material
|
170 |
+
let qualityScore = 0;
|
171 |
+
let issues = [];
|
172 |
+
|
173 |
+
if (capsRatio > 0.05) { qualityScore += capsRatio * 20; issues.push(`caps: ${Math.round(capsRatio * 100)}%`); }
|
174 |
+
if (numbersRatio > 0.03) { qualityScore += numbersRatio * 30; issues.push(`numbers: ${Math.round(numbersRatio * 100)}%`); }
|
175 |
+
if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
|
176 |
+
if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
|
177 |
+
if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
|
178 |
+
|
179 |
+
// Reject if quality score indicates technical/non-narrative content
|
180 |
+
if (qualityScore > 3) {
|
181 |
+
console.log(`Skipping low-quality passage (score: ${qualityScore.toFixed(1)}, issues: ${issues.join(', ')})`);
|
182 |
attempts++;
|
183 |
continue;
|
184 |
}
|