Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Strengthen content filtering to reject indexes and TOCs
Browse files
- Added repetitive pattern detection for CONTENTS, CHAPTER, Volume keywords
- Implemented title line detection for all-caps lines common in TOCs
- Made filtering stricter for Level 3+ (caps threshold 3% vs 5%)
- Added book-level early detection of index/catalog content
- Reject books with index/catalog/bibliography in title
- Increased scoring weights for better technical content rejection
- src/bookDataService.js +32 -3
 - src/clozeGameEngine.js +24 -4
 
    	
        src/bookDataService.js
    CHANGED
    
    | 
         @@ -385,17 +385,46 @@ class HuggingFaceDatasetService { 
     | 
|
| 385 | 
         | 
| 386 | 
         
             
                const textLength = book.text.length;
         
     | 
| 387 | 
         | 
| 388 | 
         
            -
                //  
     | 
| 389 | 
         
             
                if (textLength < 2000) return false;        // Minimum readable length
         
     | 
| 390 | 
         
             
                if (textLength > 500000) return false;      // Too long for performance
         
     | 
| 391 | 
         | 
| 392 | 
         
             
                // Check for excessive formatting (likely reference material)
         
     | 
| 393 | 
         
             
                const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
         
     | 
| 394 | 
         
            -
                if (lineBreakRatio > 0.05) return false;    //  
     | 
| 395 | 
         | 
| 396 | 
         
             
                // Ensure it has actual narrative content
         
     | 
| 397 | 
         
             
                const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
         
     | 
| 398 | 
         
            -
                if (sentenceCount < 10) return false;       //  
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 399 | 
         | 
| 400 | 
         
             
                console.log(`📚 Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
         
     | 
| 401 | 
         
             
                return true;
         
     | 
| 
         | 
|
| 385 | 
         | 
| 386 | 
         
             
                const textLength = book.text.length;
         
     | 
| 387 | 
         | 
| 388 | 
         
            +
                // Basic length criteria
         
     | 
| 389 | 
         
             
                if (textLength < 2000) return false;        // Minimum readable length
         
     | 
| 390 | 
         
             
                if (textLength > 500000) return false;      // Too long for performance
         
     | 
| 391 | 
         | 
| 392 | 
         
             
                // Check for excessive formatting (likely reference material)
         
     | 
| 393 | 
         
             
                const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
         
     | 
| 394 | 
         
            +
                if (lineBreakRatio > 0.05) return false;    // Fragmentation threshold
         
     | 
| 395 | 
         | 
| 396 | 
         
             
                // Ensure it has actual narrative content
         
     | 
| 397 | 
         
             
                const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
         
     | 
| 398 | 
         
            +
                if (sentenceCount < 10) return false;       // Sentence requirement
         
     | 
| 399 | 
         
            +
                
         
     | 
| 400 | 
         
            +
                // Sample text for quality check (first 5000 chars should be representative)
         
     | 
| 401 | 
         
            +
                const sampleText = book.text.substring(0, 5000);
         
     | 
| 402 | 
         
            +
                
         
     | 
| 403 | 
         
            +
                // Check for index/TOC patterns
         
     | 
| 404 | 
         
            +
                const indexPatterns = [
         
     | 
| 405 | 
         
            +
                  'CONTENTS', 'INDEX', 'CHAPTER', 'Volume', 'Vol.', 
         
     | 
| 406 | 
         
            +
                  'Part I', 'Part II', 'BOOK I', 'APPENDIX'
         
     | 
| 407 | 
         
            +
                ];
         
     | 
| 408 | 
         
            +
                const indexCount = indexPatterns.reduce((count, pattern) => 
         
     | 
| 409 | 
         
            +
                  count + (sampleText.match(new RegExp(pattern, 'gi')) || []).length, 0
         
     | 
| 410 | 
         
            +
                );
         
     | 
| 411 | 
         
            +
                const indexRatio = indexCount / (sampleText.split(/\s+/).length || 1);
         
     | 
| 412 | 
         
            +
                
         
     | 
| 413 | 
         
            +
                if (indexRatio > 0.05) {
         
     | 
| 414 | 
         
            +
                  console.log(`❌ Book rejected - appears to be index/TOC: "${book.title}" (index ratio: ${Math.round(indexRatio * 100)}%)`);
         
     | 
| 415 | 
         
            +
                  return false;
         
     | 
| 416 | 
         
            +
                }
         
     | 
| 417 | 
         
            +
                
         
     | 
| 418 | 
         
            +
                // Check for catalog/bibliography patterns
         
     | 
| 419 | 
         
            +
                if (book.title && (
         
     | 
| 420 | 
         
            +
                  book.title.toLowerCase().includes('index') ||
         
     | 
| 421 | 
         
            +
                  book.title.toLowerCase().includes('catalog') ||
         
     | 
| 422 | 
         
            +
                  book.title.toLowerCase().includes('bibliography') ||
         
     | 
| 423 | 
         
            +
                  book.title.toLowerCase().includes('contents')
         
     | 
| 424 | 
         
            +
                )) {
         
     | 
| 425 | 
         
            +
                  console.log(`❌ Book rejected - title suggests index/catalog: "${book.title}"`);
         
     | 
| 426 | 
         
            +
                  return false;
         
     | 
| 427 | 
         
            +
                }
         
     | 
| 428 | 
         | 
| 429 | 
         
             
                console.log(`📚 Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
         
     | 
| 430 | 
         
             
                return true;
         
     | 
    	
        src/clozeGameEngine.js
    CHANGED
    
    | 
         @@ -153,11 +153,23 @@ class ClozeGame { 
     | 
|
| 153 | 
         
             
                  const totalWords = words.length;
         
     | 
| 154 | 
         | 
| 155 | 
         
             
                  // Count various quality indicators
         
     | 
| 156 | 
         
            -
                  const  
     | 
| 
         | 
|
| 157 | 
         
             
                  const numbersCount = words.filter(w => /\d/.test(w)).length;
         
     | 
| 158 | 
         
             
                  const shortWords = words.filter(w => w.length <= 3).length;
         
     | 
| 159 | 
         
            -
                  const punctuationMarks = (passage.match(/[;:()[\]{}]/g) || []).length;
         
     | 
| 160 | 
         
             
                  const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 161 | 
         | 
| 162 | 
         
             
                  // Calculate quality ratios
         
     | 
| 163 | 
         
             
                  const capsRatio = capsCount / totalWords;
         
     | 
| 
         @@ -165,16 +177,24 @@ class ClozeGame { 
     | 
|
| 165 | 
         
             
                  const shortWordRatio = shortWords / totalWords;
         
     | 
| 166 | 
         
             
                  const punctuationRatio = punctuationMarks / totalWords;
         
     | 
| 167 | 
         
             
                  const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 168 | 
         | 
| 169 | 
         
             
                  // Reject if passage shows signs of being technical/reference material
         
     | 
| 170 | 
         
             
                  let qualityScore = 0;
         
     | 
| 171 | 
         
             
                  let issues = [];
         
     | 
| 172 | 
         | 
| 173 | 
         
            -
                  if (capsRatio >  
     | 
| 174 | 
         
            -
                  if (numbersRatio >  
     | 
| 175 | 
         
             
                  if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
         
     | 
| 176 | 
         
             
                  if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
         
     | 
| 177 | 
         
             
                  if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
         
     | 
| 
         | 
|
| 
         | 
|
| 178 | 
         | 
| 179 | 
         
             
                  // Reject if quality score indicates technical/non-narrative content
         
     | 
| 180 | 
         
             
                  if (qualityScore > 3) {
         
     | 
| 
         | 
|
| 153 | 
         
             
                  const totalWords = words.length;
         
     | 
| 154 | 
         | 
| 155 | 
         
             
                  // Count various quality indicators
         
     | 
| 156 | 
         
            +
                  const capsWords = words.filter(w => w.length > 1 && w === w.toUpperCase());
         
     | 
| 157 | 
         
            +
                  const capsCount = capsWords.length;
         
     | 
| 158 | 
         
             
                  const numbersCount = words.filter(w => /\d/.test(w)).length;
         
     | 
| 159 | 
         
             
                  const shortWords = words.filter(w => w.length <= 3).length;
         
     | 
| 160 | 
         
            +
                  const punctuationMarks = (passage.match(/[;:()[\]{}—–]/g) || []).length;
         
     | 
| 161 | 
         
             
                  const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
         
     | 
| 162 | 
         
            +
                  const lines = passage.split('\n').filter(l => l.trim());
         
     | 
| 163 | 
         
            +
                  
         
     | 
| 164 | 
         
            +
                  // Check for repetitive patterns (common in indexes/TOCs)
         
     | 
| 165 | 
         
            +
                  const repeatedPhrases = ['CONTENTS', 'CHAPTER', 'Volume', 'Vol.', 'Part', 'Book'];
         
     | 
| 166 | 
         
            +
                  const repetitionCount = repeatedPhrases.reduce((count, phrase) => 
         
     | 
| 167 | 
         
            +
                    count + (passage.match(new RegExp(phrase, 'gi')) || []).length, 0
         
     | 
| 168 | 
         
            +
                  );
         
     | 
| 169 | 
         
            +
                  
         
     | 
| 170 | 
         
            +
                  // Check for title patterns (common in TOCs)
         
     | 
| 171 | 
         
            +
                  const titlePattern = /^[A-Z][A-Z\s]+$/m;
         
     | 
| 172 | 
         
            +
                  const titleLines = lines.filter(line => titlePattern.test(line.trim())).length;
         
     | 
| 173 | 
         | 
| 174 | 
         
             
                  // Calculate quality ratios
         
     | 
| 175 | 
         
             
                  const capsRatio = capsCount / totalWords;
         
     | 
| 
         | 
|
| 177 | 
         
             
                  const shortWordRatio = shortWords / totalWords;
         
     | 
| 178 | 
         
             
                  const punctuationRatio = punctuationMarks / totalWords;
         
     | 
| 179 | 
         
             
                  const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
         
     | 
| 180 | 
         
            +
                  const repetitionRatio = repetitionCount / totalWords;
         
     | 
| 181 | 
         
            +
                  const titleLineRatio = titleLines / Math.max(1, lines.length);
         
     | 
| 182 | 
         
            +
                  
         
     | 
| 183 | 
         
            +
                  // Stricter thresholds for higher levels
         
     | 
| 184 | 
         
            +
                  const capsThreshold = this.currentLevel >= 3 ? 0.03 : 0.05;
         
     | 
| 185 | 
         
            +
                  const numbersThreshold = this.currentLevel >= 3 ? 0.02 : 0.03;
         
     | 
| 186 | 
         | 
| 187 | 
         
             
                  // Reject if passage shows signs of being technical/reference material
         
     | 
| 188 | 
         
             
                  let qualityScore = 0;
         
     | 
| 189 | 
         
             
                  let issues = [];
         
     | 
| 190 | 
         | 
| 191 | 
         
            +
                  if (capsRatio > capsThreshold) { qualityScore += capsRatio * 30; issues.push(`caps: ${Math.round(capsRatio * 100)}%`); }
         
     | 
| 192 | 
         
            +
                  if (numbersRatio > numbersThreshold) { qualityScore += numbersRatio * 40; issues.push(`numbers: ${Math.round(numbersRatio * 100)}%`); }
         
     | 
| 193 | 
         
             
                  if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
         
     | 
| 194 | 
         
             
                  if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
         
     | 
| 195 | 
         
             
                  if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
         
     | 
| 196 | 
         
            +
                  if (repetitionRatio > 0.02) { qualityScore += repetitionRatio * 50; issues.push(`repetitive: ${Math.round(repetitionRatio * 100)}%`); }
         
     | 
| 197 | 
         
            +
                  if (titleLineRatio > 0.2) { qualityScore += 5; issues.push(`title-lines: ${Math.round(titleLineRatio * 100)}%`); }
         
     | 
| 198 | 
         | 
| 199 | 
         
             
                  // Reject if quality score indicates technical/non-narrative content
         
     | 
| 200 | 
         
             
                  if (qualityScore > 3) {
         
     |