Spaces:
Sleeping
Sleeping
Strengthen content filtering to reject dictionary and academic material
Browse files
- Add detection for hash symbols, abbreviations, and etymology brackets
- Detect academic reference patterns (Roman numerals, citations)
- Filter out dictionary definition structures and technical terminology
- Add quality scoring for glossary and reference material patterns
- Prevent passages like vocabulary lists from bypassing filters
- src/clozeGameEngine.js +31 -0
src/clozeGameEngine.js
CHANGED
@@ -178,6 +178,23 @@ class ClozeGame {
|
|
178 |
const parenthesesCount = (passage.match(/[()]/g) || []).length;
|
179 |
const squareBrackets = (passage.match(/[\[\]]/g) || []).length;
|
180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
// Check for repetitive patterns (common in indexes/TOCs)
|
182 |
const repeatedPhrases = ['CONTENTS', 'CHAPTER', 'Volume', 'Vol.', 'Part', 'Book'];
|
183 |
const repetitionCount = repeatedPhrases.reduce((count, phrase) =>
|
@@ -199,6 +216,11 @@ class ClozeGame {
|
|
199 |
const dashRatio = totalDashes / totalWords;
|
200 |
const parenthesesRatio = parenthesesCount / totalWords;
|
201 |
const squareBracketRatio = squareBrackets / totalWords;
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
// Stricter thresholds for higher levels
|
204 |
const capsThreshold = this.currentLevel >= 3 ? 0.03 : 0.05;
|
@@ -225,6 +247,15 @@ class ClozeGame {
|
|
225 |
if (parenthesesRatio > 0.05) { qualityScore += 2; issues.push(`excessive-parentheses: ${Math.round(parenthesesRatio * 100)}%`); }
|
226 |
if (squareBracketRatio > 0.02) { qualityScore += 2; issues.push(`excessive-brackets: ${Math.round(squareBracketRatio * 100)}%`); }
|
227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
// Reject if quality score indicates technical/non-narrative content
|
229 |
if (qualityScore > 3) {
|
230 |
console.log(`Skipping low-quality passage (score: ${qualityScore.toFixed(1)}, issues: ${issues.join(', ')})`);
|
|
|
178 |
const parenthesesCount = (passage.match(/[()]/g) || []).length;
|
179 |
const squareBrackets = (passage.match(/[\[\]]/g) || []).length;
|
180 |
|
181 |
+
// Dictionary/glossary patterns
|
182 |
+
const hashSymbols = (passage.match(/#/g) || []).length;
|
183 |
+
const abbreviationPattern = /\b(n\.|adj\.|adv\.|v\.|pl\.|sg\.|cf\.|e\.g\.|i\.e\.|etc\.|vs\.|viz\.|OE\.|OFr\.|L\.|ME\.|NE\.|AN\.|ON\.|MDu\.|MLG\.|MHG\.|Ger\.|Du\.|Dan\.|Sw\.|Icel\.)\b/gi;
|
184 |
+
const abbreviations = (passage.match(abbreviationPattern) || []).length;
|
185 |
+
const etymologyBrackets = (passage.match(/\[[^\]]+\]/g) || []).length;
|
186 |
+
const referenceNumbers = (passage.match(/\b[IVX]+\s+[abc]?\s*\d+/g) || []).length;
|
187 |
+
const definitionPattern = /^[^.]+,\s*(n\.|adj\.|adv\.|v\.)/gm;
|
188 |
+
const definitionLines = (passage.match(definitionPattern) || []).length;
|
189 |
+
|
190 |
+
// Academic/reference patterns
|
191 |
+
const citationPattern = /\(\d{4}\)|p\.\s*\d+|pp\.\s*\d+-\d+|vol\.\s*\d+|ch\.\s*\d+/gi;
|
192 |
+
const citations = (passage.match(citationPattern) || []).length;
|
193 |
+
const technicalTerms = ['etymology', 'phoneme', 'morpheme', 'lexicon', 'syntax', 'semantics', 'glossary', 'vocabulary', 'dialect', 'pronunciation'];
|
194 |
+
const technicalTermCount = technicalTerms.reduce((count, term) =>
|
195 |
+
count + (passage.match(new RegExp(term, 'gi')) || []).length, 0
|
196 |
+
);
|
197 |
+
|
198 |
// Check for repetitive patterns (common in indexes/TOCs)
|
199 |
const repeatedPhrases = ['CONTENTS', 'CHAPTER', 'Volume', 'Vol.', 'Part', 'Book'];
|
200 |
const repetitionCount = repeatedPhrases.reduce((count, phrase) =>
|
|
|
216 |
const dashRatio = totalDashes / totalWords;
|
217 |
const parenthesesRatio = parenthesesCount / totalWords;
|
218 |
const squareBracketRatio = squareBrackets / totalWords;
|
219 |
+
const hashRatio = hashSymbols / totalWords;
|
220 |
+
const abbreviationRatio = abbreviations / totalWords;
|
221 |
+
const etymologyRatio = etymologyBrackets / totalWords;
|
222 |
+
const definitionRatio = definitionLines / Math.max(1, lines.length);
|
223 |
+
const technicalRatio = technicalTermCount / totalWords;
|
224 |
|
225 |
// Stricter thresholds for higher levels
|
226 |
const capsThreshold = this.currentLevel >= 3 ? 0.03 : 0.05;
|
|
|
247 |
if (parenthesesRatio > 0.05) { qualityScore += 2; issues.push(`excessive-parentheses: ${Math.round(parenthesesRatio * 100)}%`); }
|
248 |
if (squareBracketRatio > 0.02) { qualityScore += 2; issues.push(`excessive-brackets: ${Math.round(squareBracketRatio * 100)}%`); }
|
249 |
|
250 |
+
// Dictionary/glossary/academic content detection
|
251 |
+
if (hashRatio > 0.01) { qualityScore += hashRatio * 100; issues.push(`hash-symbols: ${hashSymbols}`); }
|
252 |
+
if (abbreviationRatio > 0.03) { qualityScore += abbreviationRatio * 50; issues.push(`abbreviations: ${abbreviations}`); }
|
253 |
+
if (etymologyRatio > 0.005) { qualityScore += etymologyRatio * 100; issues.push(`etymology-brackets: ${etymologyBrackets}`); }
|
254 |
+
if (definitionRatio > 0.1) { qualityScore += definitionRatio * 20; issues.push(`definition-lines: ${Math.round(definitionRatio * 100)}%`); }
|
255 |
+
if (referenceNumbers > 0) { qualityScore += referenceNumbers * 2; issues.push(`reference-numbers: ${referenceNumbers}`); }
|
256 |
+
if (citations > 0) { qualityScore += citations * 2; issues.push(`citations: ${citations}`); }
|
257 |
+
if (technicalRatio > 0.01) { qualityScore += technicalRatio * 30; issues.push(`technical-terms: ${technicalTermCount}`); }
|
258 |
+
|
259 |
// Reject if quality score indicates technical/non-narrative content
|
260 |
if (qualityScore > 3) {
|
261 |
console.log(`Skipping low-quality passage (score: ${qualityScore.toFixed(1)}, issues: ${issues.join(', ')})`);
|