milwright committed on
Commit
9e70845
·
1 Parent(s): e26fd03

Strengthen content filtering to reject indexes and TOCs

Browse files

- Added repetitive pattern detection for CONTENTS, CHAPTER, Volume keywords
- Implemented title line detection for all-caps lines common in TOCs
- Made filtering stricter for Level 3+ (caps threshold 3% vs 5%)
- Added book-level early detection of index/catalog content
- Reject books with index/catalog/bibliography in title
- Increased scoring weights for better technical content rejection

Files changed (2) hide show
  1. src/bookDataService.js +32 -3
  2. src/clozeGameEngine.js +24 -4
src/bookDataService.js CHANGED
@@ -385,17 +385,46 @@ class HuggingFaceDatasetService {
385
 
386
  const textLength = book.text.length;
387
 
388
- // Relaxed filter criteria for cloze exercises
389
  if (textLength < 2000) return false; // Minimum readable length
390
  if (textLength > 500000) return false; // Too long for performance
391
 
392
  // Check for excessive formatting (likely reference material)
393
  const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
394
- if (lineBreakRatio > 0.05) return false; // Relaxed fragmentation threshold
395
 
396
  // Ensure it has actual narrative content
397
  const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
398
- if (sentenceCount < 10) return false; // Relaxed sentence requirement
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
 
400
  console.log(`πŸ“– Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
401
  return true;
 
385
 
386
  const textLength = book.text.length;
387
 
388
+ // Basic length criteria
389
  if (textLength < 2000) return false; // Minimum readable length
390
  if (textLength > 500000) return false; // Too long for performance
391
 
392
  // Check for excessive formatting (likely reference material)
393
  const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
394
+ if (lineBreakRatio > 0.05) return false; // Fragmentation threshold
395
 
396
  // Ensure it has actual narrative content
397
  const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
398
+ if (sentenceCount < 10) return false; // Sentence requirement
399
+
400
+ // Sample text for quality check (first 5000 chars should be representative)
401
+ const sampleText = book.text.substring(0, 5000);
402
+
403
+ // Check for index/TOC patterns
404
+ const indexPatterns = [
405
+ 'CONTENTS', 'INDEX', 'CHAPTER', 'Volume', 'Vol.',
406
+ 'Part I', 'Part II', 'BOOK I', 'APPENDIX'
407
+ ];
408
+ const indexCount = indexPatterns.reduce((count, pattern) =>
409
+ count + (sampleText.match(new RegExp(pattern, 'gi')) || []).length, 0
410
+ );
411
+ const indexRatio = indexCount / (sampleText.split(/\s+/).length || 1);
412
+
413
+ if (indexRatio > 0.05) {
414
+ console.log(`❌ Book rejected - appears to be index/TOC: "${book.title}" (index ratio: ${Math.round(indexRatio * 100)}%)`);
415
+ return false;
416
+ }
417
+
418
+ // Check for catalog/bibliography patterns
419
+ if (book.title && (
420
+ book.title.toLowerCase().includes('index') ||
421
+ book.title.toLowerCase().includes('catalog') ||
422
+ book.title.toLowerCase().includes('bibliography') ||
423
+ book.title.toLowerCase().includes('contents')
424
+ )) {
425
+ console.log(`❌ Book rejected - title suggests index/catalog: "${book.title}"`);
426
+ return false;
427
+ }
428
 
429
  console.log(`πŸ“– Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
430
  return true;
src/clozeGameEngine.js CHANGED
@@ -153,11 +153,23 @@ class ClozeGame {
153
  const totalWords = words.length;
154
 
155
  // Count various quality indicators
156
- const capsCount = words.filter(w => w.length > 1 && w === w.toUpperCase()).length;
 
157
  const numbersCount = words.filter(w => /\d/.test(w)).length;
158
  const shortWords = words.filter(w => w.length <= 3).length;
159
- const punctuationMarks = (passage.match(/[;:()[\]{}]/g) || []).length;
160
  const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  // Calculate quality ratios
163
  const capsRatio = capsCount / totalWords;
@@ -165,16 +177,24 @@ class ClozeGame {
165
  const shortWordRatio = shortWords / totalWords;
166
  const punctuationRatio = punctuationMarks / totalWords;
167
  const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
 
 
 
 
 
 
168
 
169
  // Reject if passage shows signs of being technical/reference material
170
  let qualityScore = 0;
171
  let issues = [];
172
 
173
- if (capsRatio > 0.05) { qualityScore += capsRatio * 20; issues.push(`caps: ${Math.round(capsRatio * 100)}%`); }
174
- if (numbersRatio > 0.03) { qualityScore += numbersRatio * 30; issues.push(`numbers: ${Math.round(numbersRatio * 100)}%`); }
175
  if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
176
  if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
177
  if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
 
 
178
 
179
  // Reject if quality score indicates technical/non-narrative content
180
  if (qualityScore > 3) {
 
153
  const totalWords = words.length;
154
 
155
  // Count various quality indicators
156
+ const capsWords = words.filter(w => w.length > 1 && w === w.toUpperCase());
157
+ const capsCount = capsWords.length;
158
  const numbersCount = words.filter(w => /\d/.test(w)).length;
159
  const shortWords = words.filter(w => w.length <= 3).length;
160
+ const punctuationMarks = (passage.match(/[;:()[\]{}—–]/g) || []).length;
161
  const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
162
+ const lines = passage.split('\n').filter(l => l.trim());
163
+
164
+ // Check for repetitive patterns (common in indexes/TOCs)
165
+ const repeatedPhrases = ['CONTENTS', 'CHAPTER', 'Volume', 'Vol.', 'Part', 'Book'];
166
+ const repetitionCount = repeatedPhrases.reduce((count, phrase) =>
167
+ count + (passage.match(new RegExp(phrase, 'gi')) || []).length, 0
168
+ );
169
+
170
+ // Check for title patterns (common in TOCs)
171
+ const titlePattern = /^[A-Z][A-Z\s]+$/m;
172
+ const titleLines = lines.filter(line => titlePattern.test(line.trim())).length;
173
 
174
  // Calculate quality ratios
175
  const capsRatio = capsCount / totalWords;
 
177
  const shortWordRatio = shortWords / totalWords;
178
  const punctuationRatio = punctuationMarks / totalWords;
179
  const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
180
+ const repetitionRatio = repetitionCount / totalWords;
181
+ const titleLineRatio = titleLines / Math.max(1, lines.length);
182
+
183
+ // Stricter thresholds for higher levels
184
+ const capsThreshold = this.currentLevel >= 3 ? 0.03 : 0.05;
185
+ const numbersThreshold = this.currentLevel >= 3 ? 0.02 : 0.03;
186
 
187
  // Reject if passage shows signs of being technical/reference material
188
  let qualityScore = 0;
189
  let issues = [];
190
 
191
+ if (capsRatio > capsThreshold) { qualityScore += capsRatio * 30; issues.push(`caps: ${Math.round(capsRatio * 100)}%`); }
192
+ if (numbersRatio > numbersThreshold) { qualityScore += numbersRatio * 40; issues.push(`numbers: ${Math.round(numbersRatio * 100)}%`); }
193
  if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
194
  if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
195
  if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
196
+ if (repetitionRatio > 0.02) { qualityScore += repetitionRatio * 50; issues.push(`repetitive: ${Math.round(repetitionRatio * 100)}%`); }
197
+ if (titleLineRatio > 0.2) { qualityScore += 5; issues.push(`title-lines: ${Math.round(titleLineRatio * 100)}%`); }
198
 
199
  // Reject if quality score indicates technical/non-narrative content
200
  if (qualityScore > 3) {