Spaces:
Running
Running
Strengthen content filtering to reject indexes and TOCs
Browse files
- Added repetitive pattern detection for CONTENTS, CHAPTER, Volume keywords
- Implemented title line detection for all-caps lines common in TOCs
- Made filtering stricter for Level 3+ (caps threshold 3% vs 5%)
- Added book-level early detection of index/catalog content
- Reject books with index/catalog/bibliography in title
- Increased scoring weights for better technical content rejection
- src/bookDataService.js +32 -3
- src/clozeGameEngine.js +24 -4
src/bookDataService.js
CHANGED
@@ -385,17 +385,46 @@ class HuggingFaceDatasetService {
|
|
385 |
|
386 |
const textLength = book.text.length;
|
387 |
|
388 |
-
//
|
389 |
if (textLength < 2000) return false; // Minimum readable length
|
390 |
if (textLength > 500000) return false; // Too long for performance
|
391 |
|
392 |
// Check for excessive formatting (likely reference material)
|
393 |
const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
|
394 |
-
if (lineBreakRatio > 0.05) return false; //
|
395 |
|
396 |
// Ensure it has actual narrative content
|
397 |
const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
|
398 |
-
if (sentenceCount < 10) return false; //
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
|
400 |
console.log(`π Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
|
401 |
return true;
|
|
|
385 |
|
386 |
const textLength = book.text.length;
|
387 |
|
388 |
+
// Basic length criteria
|
389 |
if (textLength < 2000) return false; // Minimum readable length
|
390 |
if (textLength > 500000) return false; // Too long for performance
|
391 |
|
392 |
// Check for excessive formatting (likely reference material)
|
393 |
const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
|
394 |
+
if (lineBreakRatio > 0.05) return false; // Fragmentation threshold
|
395 |
|
396 |
// Ensure it has actual narrative content
|
397 |
const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
|
398 |
+
if (sentenceCount < 10) return false; // Sentence requirement
|
399 |
+
|
400 |
+
// Sample text for quality check (first 5000 chars should be representative)
|
401 |
+
const sampleText = book.text.substring(0, 5000);
|
402 |
+
|
403 |
+
// Check for index/TOC patterns
|
404 |
+
const indexPatterns = [
|
405 |
+
'CONTENTS', 'INDEX', 'CHAPTER', 'Volume', 'Vol.',
|
406 |
+
'Part I', 'Part II', 'BOOK I', 'APPENDIX'
|
407 |
+
];
|
408 |
+
const indexCount = indexPatterns.reduce((count, pattern) =>
|
409 |
+
count + (sampleText.match(new RegExp(pattern, 'gi')) || []).length, 0
|
410 |
+
);
|
411 |
+
const indexRatio = indexCount / (sampleText.split(/\s+/).length || 1);
|
412 |
+
|
413 |
+
if (indexRatio > 0.05) {
|
414 |
+
console.log(`β Book rejected - appears to be index/TOC: "${book.title}" (index ratio: ${Math.round(indexRatio * 100)}%)`);
|
415 |
+
return false;
|
416 |
+
}
|
417 |
+
|
418 |
+
// Check for catalog/bibliography patterns
|
419 |
+
if (book.title && (
|
420 |
+
book.title.toLowerCase().includes('index') ||
|
421 |
+
book.title.toLowerCase().includes('catalog') ||
|
422 |
+
book.title.toLowerCase().includes('bibliography') ||
|
423 |
+
book.title.toLowerCase().includes('contents')
|
424 |
+
)) {
|
425 |
+
console.log(`β Book rejected - title suggests index/catalog: "${book.title}"`);
|
426 |
+
return false;
|
427 |
+
}
|
428 |
|
429 |
console.log(`π Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
|
430 |
return true;
|
src/clozeGameEngine.js
CHANGED
@@ -153,11 +153,23 @@ class ClozeGame {
|
|
153 |
const totalWords = words.length;
|
154 |
|
155 |
// Count various quality indicators
|
156 |
-
const
|
|
|
157 |
const numbersCount = words.filter(w => /\d/.test(w)).length;
|
158 |
const shortWords = words.filter(w => w.length <= 3).length;
|
159 |
-
const punctuationMarks = (passage.match(/[;:()[\]{}]/g) || []).length;
|
160 |
const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
// Calculate quality ratios
|
163 |
const capsRatio = capsCount / totalWords;
|
@@ -165,16 +177,24 @@ class ClozeGame {
|
|
165 |
const shortWordRatio = shortWords / totalWords;
|
166 |
const punctuationRatio = punctuationMarks / totalWords;
|
167 |
const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
// Reject if passage shows signs of being technical/reference material
|
170 |
let qualityScore = 0;
|
171 |
let issues = [];
|
172 |
|
173 |
-
if (capsRatio >
|
174 |
-
if (numbersRatio >
|
175 |
if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
|
176 |
if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
|
177 |
if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
|
|
|
|
|
178 |
|
179 |
// Reject if quality score indicates technical/non-narrative content
|
180 |
if (qualityScore > 3) {
|
|
|
153 |
const totalWords = words.length;
|
154 |
|
155 |
// Count various quality indicators
|
156 |
+
const capsWords = words.filter(w => w.length > 1 && w === w.toUpperCase());
|
157 |
+
const capsCount = capsWords.length;
|
158 |
const numbersCount = words.filter(w => /\d/.test(w)).length;
|
159 |
const shortWords = words.filter(w => w.length <= 3).length;
|
160 |
+
const punctuationMarks = (passage.match(/[;:()[\]{}ββ]/g) || []).length;
|
161 |
const sentenceList = passage.split(/[.!?]+/).filter(s => s.trim().length > 10);
|
162 |
+
const lines = passage.split('\n').filter(l => l.trim());
|
163 |
+
|
164 |
+
// Check for repetitive patterns (common in indexes/TOCs)
|
165 |
+
const repeatedPhrases = ['CONTENTS', 'CHAPTER', 'Volume', 'Vol.', 'Part', 'Book'];
|
166 |
+
const repetitionCount = repeatedPhrases.reduce((count, phrase) =>
|
167 |
+
count + (passage.match(new RegExp(phrase, 'gi')) || []).length, 0
|
168 |
+
);
|
169 |
+
|
170 |
+
// Check for title patterns (common in TOCs)
|
171 |
+
const titlePattern = /^[A-Z][A-Z\s]+$/m;
|
172 |
+
const titleLines = lines.filter(line => titlePattern.test(line.trim())).length;
|
173 |
|
174 |
// Calculate quality ratios
|
175 |
const capsRatio = capsCount / totalWords;
|
|
|
177 |
const shortWordRatio = shortWords / totalWords;
|
178 |
const punctuationRatio = punctuationMarks / totalWords;
|
179 |
const avgWordsPerSentence = totalWords / Math.max(1, sentenceList.length);
|
180 |
+
const repetitionRatio = repetitionCount / totalWords;
|
181 |
+
const titleLineRatio = titleLines / Math.max(1, lines.length);
|
182 |
+
|
183 |
+
// Stricter thresholds for higher levels
|
184 |
+
const capsThreshold = this.currentLevel >= 3 ? 0.03 : 0.05;
|
185 |
+
const numbersThreshold = this.currentLevel >= 3 ? 0.02 : 0.03;
|
186 |
|
187 |
// Reject if passage shows signs of being technical/reference material
|
188 |
let qualityScore = 0;
|
189 |
let issues = [];
|
190 |
|
191 |
+
if (capsRatio > capsThreshold) { qualityScore += capsRatio * 30; issues.push(`caps: ${Math.round(capsRatio * 100)}%`); }
|
192 |
+
if (numbersRatio > numbersThreshold) { qualityScore += numbersRatio * 40; issues.push(`numbers: ${Math.round(numbersRatio * 100)}%`); }
|
193 |
if (punctuationRatio > 0.08) { qualityScore += punctuationRatio * 15; issues.push(`punct: ${Math.round(punctuationRatio * 100)}%`); }
|
194 |
if (avgWordsPerSentence < 8 || avgWordsPerSentence > 40) { qualityScore += 2; issues.push(`sent-len: ${Math.round(avgWordsPerSentence)}`); }
|
195 |
if (shortWordRatio < 0.3) { qualityScore += 2; issues.push(`short-words: ${Math.round(shortWordRatio * 100)}%`); }
|
196 |
+
if (repetitionRatio > 0.02) { qualityScore += repetitionRatio * 50; issues.push(`repetitive: ${Math.round(repetitionRatio * 100)}%`); }
|
197 |
+
if (titleLineRatio > 0.2) { qualityScore += 5; issues.push(`title-lines: ${Math.round(titleLineRatio * 100)}%`); }
|
198 |
|
199 |
// Reject if quality score indicates technical/non-narrative content
|
200 |
if (qualityScore > 3) {
|