Spaces:
Sleeping
Sleeping
fix book preloading and filtering issues
Browse files
- Relax isValidForCloze filtering criteria to allow more books through
- Reduce minimum text length from 5000 to 2000 characters
- Increase fragmentation threshold from 0.01 to 0.05
- Reduce minimum sentence count from 20 to 10
- Add enhanced error handling in preloadBooks function
- Add structure validation for HF API responses
- Add try-catch around individual book processing
- Improve logging to track book validation process
- src/bookDataService.js +24 -6
src/bookDataService.js
CHANGED
@@ -138,12 +138,29 @@ class HuggingFaceDatasetService {
|
|
138 |
if (response.ok) {
|
139 |
const data = await response.json();
|
140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
// Process and filter books
|
142 |
this.preloadedBooks = data.rows
|
143 |
-
.map(row =>
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
console.log(`π Preloaded ${this.preloadedBooks.length} suitable books`);
|
|
|
|
|
147 |
}
|
148 |
} catch (error) {
|
149 |
console.warn('Failed to preload books:', error);
|
@@ -336,18 +353,19 @@ class HuggingFaceDatasetService {
|
|
336 |
|
337 |
const textLength = book.text.length;
|
338 |
|
339 |
-
//
|
340 |
-
if (textLength <
|
341 |
if (textLength > 500000) return false; // Too long for performance
|
342 |
|
343 |
// Check for excessive formatting (likely reference material)
|
344 |
const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
|
345 |
-
if (lineBreakRatio > 0.
|
346 |
|
347 |
// Ensure it has actual narrative content
|
348 |
const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
|
349 |
-
if (sentenceCount <
|
350 |
|
|
|
351 |
return true;
|
352 |
}
|
353 |
|
|
|
138 |
if (response.ok) {
|
139 |
const data = await response.json();
|
140 |
|
141 |
+
// Check if data has expected structure
|
142 |
+
if (!data.rows || !Array.isArray(data.rows)) {
|
143 |
+
console.error('Unexpected HF API response structure:', data);
|
144 |
+
return;
|
145 |
+
}
|
146 |
+
|
147 |
+
console.log(`π₯ Received ${data.rows.length} books from HF API`);
|
148 |
+
|
149 |
// Process and filter books
|
150 |
this.preloadedBooks = data.rows
|
151 |
+
.map(row => {
|
152 |
+
try {
|
153 |
+
return this.processHFBook(row.row);
|
154 |
+
} catch (e) {
|
155 |
+
console.warn('Error processing book:', e);
|
156 |
+
return null;
|
157 |
+
}
|
158 |
+
})
|
159 |
+
.filter(book => book && this.isValidForCloze(book));
|
160 |
|
161 |
console.log(`π Preloaded ${this.preloadedBooks.length} suitable books`);
|
162 |
+
} else {
|
163 |
+
console.error(`HF API request failed: ${response.status} ${response.statusText}`);
|
164 |
}
|
165 |
} catch (error) {
|
166 |
console.warn('Failed to preload books:', error);
|
|
|
353 |
|
354 |
const textLength = book.text.length;
|
355 |
|
356 |
+
// Relaxed filter criteria for cloze exercises
|
357 |
+
if (textLength < 2000) return false; // Minimum readable length
|
358 |
if (textLength > 500000) return false; // Too long for performance
|
359 |
|
360 |
// Check for excessive formatting (likely reference material)
|
361 |
const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
|
362 |
+
if (lineBreakRatio > 0.05) return false; // Relaxed fragmentation threshold
|
363 |
|
364 |
// Ensure it has actual narrative content
|
365 |
const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
|
366 |
+
if (sentenceCount < 10) return false; // Relaxed sentence requirement
|
367 |
|
368 |
+
console.log(`π Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
|
369 |
return true;
|
370 |
}
|
371 |
|