milwright committed on
Commit
dc9cc8c
·
1 Parent(s): baad690

Strengthen content filtering to reject dictionary and academic material

Browse files

- Add detection for hash symbols, abbreviations, and etymology brackets
- Detect academic reference patterns (Roman numerals, citations)
- Filter out dictionary definition structures and technical terminology
- Add quality scoring for glossary and reference material patterns
- Prevent passages like vocabulary lists from bypassing filters

Files changed (1) hide show
  1. src/clozeGameEngine.js +31 -0
src/clozeGameEngine.js CHANGED
@@ -178,6 +178,23 @@ class ClozeGame {
178
  const parenthesesCount = (passage.match(/[()]/g) || []).length;
179
  const squareBrackets = (passage.match(/[\[\]]/g) || []).length;
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  // Check for repetitive patterns (common in indexes/TOCs)
182
  const repeatedPhrases = ['CONTENTS', 'CHAPTER', 'Volume', 'Vol.', 'Part', 'Book'];
183
  const repetitionCount = repeatedPhrases.reduce((count, phrase) =>
@@ -199,6 +216,11 @@ class ClozeGame {
199
  const dashRatio = totalDashes / totalWords;
200
  const parenthesesRatio = parenthesesCount / totalWords;
201
  const squareBracketRatio = squareBrackets / totalWords;
 
 
 
 
 
202
 
203
  // Stricter thresholds for higher levels
204
  const capsThreshold = this.currentLevel >= 3 ? 0.03 : 0.05;
@@ -225,6 +247,15 @@ class ClozeGame {
225
  if (parenthesesRatio > 0.05) { qualityScore += 2; issues.push(`excessive-parentheses: ${Math.round(parenthesesRatio * 100)}%`); }
226
  if (squareBracketRatio > 0.02) { qualityScore += 2; issues.push(`excessive-brackets: ${Math.round(squareBracketRatio * 100)}%`); }
227
 
 
 
 
 
 
 
 
 
 
228
  // Reject if quality score indicates technical/non-narrative content
229
  if (qualityScore > 3) {
230
  console.log(`Skipping low-quality passage (score: ${qualityScore.toFixed(1)}, issues: ${issues.join(', ')})`);
 
178
  const parenthesesCount = (passage.match(/[()]/g) || []).length;
179
  const squareBrackets = (passage.match(/[\[\]]/g) || []).length;
180
 
181
+ // Dictionary/glossary patterns
182
+ const hashSymbols = (passage.match(/#/g) || []).length;
183
+ const abbreviationPattern = /\b(n\.|adj\.|adv\.|v\.|pl\.|sg\.|cf\.|e\.g\.|i\.e\.|etc\.|vs\.|viz\.|OE\.|OFr\.|L\.|ME\.|NE\.|AN\.|ON\.|MDu\.|MLG\.|MHG\.|Ger\.|Du\.|Dan\.|Sw\.|Icel\.)\b/gi;
184
+ const abbreviations = (passage.match(abbreviationPattern) || []).length;
185
+ const etymologyBrackets = (passage.match(/\[[^\]]+\]/g) || []).length;
186
+ const referenceNumbers = (passage.match(/\b[IVX]+\s+[abc]?\s*\d+/g) || []).length;
187
+ const definitionPattern = /^[^.]+,\s*(n\.|adj\.|adv\.|v\.)/gm;
188
+ const definitionLines = (passage.match(definitionPattern) || []).length;
189
+
190
+ // Academic/reference patterns
191
+ const citationPattern = /\(\d{4}\)|p\.\s*\d+|pp\.\s*\d+-\d+|vol\.\s*\d+|ch\.\s*\d+/gi;
192
+ const citations = (passage.match(citationPattern) || []).length;
193
+ const technicalTerms = ['etymology', 'phoneme', 'morpheme', 'lexicon', 'syntax', 'semantics', 'glossary', 'vocabulary', 'dialect', 'pronunciation'];
194
+ const technicalTermCount = technicalTerms.reduce((count, term) =>
195
+ count + (passage.match(new RegExp(term, 'gi')) || []).length, 0
196
+ );
197
+
198
  // Check for repetitive patterns (common in indexes/TOCs)
199
  const repeatedPhrases = ['CONTENTS', 'CHAPTER', 'Volume', 'Vol.', 'Part', 'Book'];
200
  const repetitionCount = repeatedPhrases.reduce((count, phrase) =>
 
216
  const dashRatio = totalDashes / totalWords;
217
  const parenthesesRatio = parenthesesCount / totalWords;
218
  const squareBracketRatio = squareBrackets / totalWords;
219
+ const hashRatio = hashSymbols / totalWords;
220
+ const abbreviationRatio = abbreviations / totalWords;
221
+ const etymologyRatio = etymologyBrackets / totalWords;
222
+ const definitionRatio = definitionLines / Math.max(1, lines.length);
223
+ const technicalRatio = technicalTermCount / totalWords;
224
 
225
  // Stricter thresholds for higher levels
226
  const capsThreshold = this.currentLevel >= 3 ? 0.03 : 0.05;
 
247
  if (parenthesesRatio > 0.05) { qualityScore += 2; issues.push(`excessive-parentheses: ${Math.round(parenthesesRatio * 100)}%`); }
248
  if (squareBracketRatio > 0.02) { qualityScore += 2; issues.push(`excessive-brackets: ${Math.round(squareBracketRatio * 100)}%`); }
249
 
250
+ // Dictionary/glossary/academic content detection
251
+ if (hashRatio > 0.01) { qualityScore += hashRatio * 100; issues.push(`hash-symbols: ${hashSymbols}`); }
252
+ if (abbreviationRatio > 0.03) { qualityScore += abbreviationRatio * 50; issues.push(`abbreviations: ${abbreviations}`); }
253
+ if (etymologyRatio > 0.005) { qualityScore += etymologyRatio * 100; issues.push(`etymology-brackets: ${etymologyBrackets}`); }
254
+ if (definitionRatio > 0.1) { qualityScore += definitionRatio * 20; issues.push(`definition-lines: ${Math.round(definitionRatio * 100)}%`); }
255
+ if (referenceNumbers > 0) { qualityScore += referenceNumbers * 2; issues.push(`reference-numbers: ${referenceNumbers}`); }
256
+ if (citations > 0) { qualityScore += citations * 2; issues.push(`citations: ${citations}`); }
257
+ if (technicalRatio > 0.01) { qualityScore += technicalRatio * 30; issues.push(`technical-terms: ${technicalTermCount}`); }
258
+
259
  // Reject if quality score indicates technical/non-narrative content
260
  if (qualityScore > 3) {
261
  console.log(`Skipping low-quality passage (score: ${qualityScore.toFixed(1)}, issues: ${issues.join(', ')})`);