Spaces:
Sleeping
Sleeping
fix: improve word selection and passage quality filtering
Browse files
- Add passage quality checks to skip texts with excessive caps/numbers
- Update AI prompt to avoid malformed words and OCR errors
- Remove hardcoded word filtering in favor of AI-driven selection
- Increase max word length from 10 to 12 characters
- src/aiService.js +11 -2
- src/clozeGameEngine.js +42 -14
src/aiService.js
CHANGED
@@ -121,10 +121,19 @@ class OpenRouterService {
|
|
121 |
model: this.model,
|
122 |
messages: [{
|
123 |
role: 'system',
|
124 |
-
content: '
|
125 |
}, {
|
126 |
role: 'user',
|
127 |
-
content: `Select exactly ${count}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
Passage: "${passage}"`
|
130 |
}],
|
|
|
121 |
model: this.model,
|
122 |
messages: [{
|
123 |
role: 'system',
|
124 |
+
content: 'You are a vocabulary selector for educational cloze exercises. Select meaningful, properly-spelled content words that appear exactly as written in the passage.'
|
125 |
}, {
|
126 |
role: 'user',
|
127 |
+
content: `Select exactly ${count} words from this passage for a cloze exercise.
|
128 |
+
|
129 |
+
REQUIREMENTS:
|
130 |
+
- Choose clear, properly-spelled words (no OCR errors like "andsatires")
|
131 |
+
- Select meaningful nouns, verbs, or adjectives (4-12 letters)
|
132 |
+
- Words must appear EXACTLY as written in the passage
|
133 |
+
- Avoid: function words, archaic terms, proper nouns, technical jargon
|
134 |
+
- Skip any words that look malformed or concatenated
|
135 |
+
|
136 |
+
Return ONLY a JSON array of the selected words.
|
137 |
|
138 |
Passage: "${passage}"`
|
139 |
}],
|
src/clozeGameEngine.js
CHANGED
@@ -85,13 +85,17 @@ class ClozeGame {
|
|
85 |
const startFromMiddle = Math.floor(textLength * 0.3); // Skip first 30%
|
86 |
const endAtThreeQuarters = Math.floor(textLength * 0.8); // Stop before last 20%
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
95 |
|
96 |
// Clean up start - find first complete sentence that starts with capital letter
|
97 |
const firstSentenceMatch = passage.match(/[.!?]\s+([A-Z][^.!?]*)/);
|
@@ -106,12 +110,36 @@ class ClozeGame {
|
|
106 |
}
|
107 |
}
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
}
|
116 |
|
117 |
// Ensure minimum length - if too short, return what we have rather than infinite recursion
|
@@ -333,7 +361,7 @@ class ClozeGame {
|
|
333 |
const contentWordIndices = [];
|
334 |
words.forEach((word, index) => {
|
335 |
const cleanWord = word.toLowerCase().replace(/[^\w]/g, '');
|
336 |
-
if (cleanWord.length > 3 && cleanWord.length <=
|
337 |
contentWordIndices.push({ word: cleanWord, index });
|
338 |
}
|
339 |
});
|
|
|
85 |
const startFromMiddle = Math.floor(textLength * 0.3); // Skip first 30%
|
86 |
const endAtThreeQuarters = Math.floor(textLength * 0.8); // Stop before last 20%
|
87 |
|
88 |
+
let attempts = 0;
|
89 |
+
let passage = '';
|
90 |
+
|
91 |
+
while (attempts < 5) {
|
92 |
+
// Random position in the middle section
|
93 |
+
const availableLength = endAtThreeQuarters - startFromMiddle;
|
94 |
+
const randomOffset = Math.floor(Math.random() * Math.max(0, availableLength - 1000));
|
95 |
+
const startIndex = startFromMiddle + randomOffset;
|
96 |
+
|
97 |
+
// Extract longer initial passage for better sentence completion
|
98 |
+
passage = text.substring(startIndex, startIndex + 1000);
|
99 |
|
100 |
// Clean up start - find first complete sentence that starts with capital letter
|
101 |
const firstSentenceMatch = passage.match(/[.!?]\s+([A-Z][^.!?]*)/);
|
|
|
110 |
}
|
111 |
}
|
112 |
|
113 |
+
// Clean up end - ensure we end at a complete sentence
|
114 |
+
const sentences = passage.split(/(?<=[.!?])\s+/);
|
115 |
+
if (sentences.length > 1) {
|
116 |
+
// Remove the last sentence if it might be incomplete
|
117 |
+
sentences.pop();
|
118 |
+
passage = sentences.join(' ');
|
119 |
+
}
|
120 |
+
|
121 |
+
// Quality check: reject passages with excessive caps, numbers, or special formatting
|
122 |
+
const words = passage.split(/\s+/);
|
123 |
+
const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
|
124 |
+
const numbersCount = words.filter(w => /\d/.test(w)).length;
|
125 |
+
const totalWords = words.length;
|
126 |
+
|
127 |
+
// Skip if more than 10% caps or 5% numbers
|
128 |
+
if (capsCount / totalWords > 0.1 || numbersCount / totalWords > 0.05) {
|
129 |
+
console.log(`Skipping passage with ${capsCount} caps and ${numbersCount} numbers out of ${totalWords} words`);
|
130 |
+
attempts++;
|
131 |
+
continue;
|
132 |
+
}
|
133 |
+
|
134 |
+
// Check for other quality issues
|
135 |
+
if (passage.includes('CHAPTER') || passage.includes('Section') ||
|
136 |
+
passage.match(/\b(Fig\.|Table|Illustration)\b/)) {
|
137 |
+
attempts++;
|
138 |
+
continue;
|
139 |
+
}
|
140 |
+
|
141 |
+
// Good passage found
|
142 |
+
break;
|
143 |
}
|
144 |
|
145 |
// Ensure minimum length - if too short, return what we have rather than infinite recursion
|
|
|
361 |
const contentWordIndices = [];
|
362 |
words.forEach((word, index) => {
|
363 |
const cleanWord = word.toLowerCase().replace(/[^\w]/g, '');
|
364 |
+
if (cleanWord.length > 3 && cleanWord.length <= 12 && !functionWords.has(cleanWord)) {
|
365 |
contentWordIndices.push({ word: cleanWord, index });
|
366 |
}
|
367 |
});
|