milwright committed on
Commit
c21250b
·
1 Parent(s): a918c3e

fix book preloading and filtering issues

Browse files

- Relax isValidForCloze filtering criteria to allow more books through
- Reduce minimum text length from 5000 to 2000 characters
- Increase fragmentation threshold from 0.01 to 0.05
- Reduce minimum sentence count from 20 to 10
- Add enhanced error handling in preloadBooks function
- Add structure validation for HF API responses
- Add try-catch around individual book processing
- Improve logging to track book validation process

Files changed (1) hide show
  1. src/bookDataService.js +24 -6
src/bookDataService.js CHANGED
@@ -138,12 +138,29 @@ class HuggingFaceDatasetService {
138
  if (response.ok) {
139
  const data = await response.json();
140
 
 
 
 
 
 
 
 
 
141
  // Process and filter books
142
  this.preloadedBooks = data.rows
143
- .map(row => this.processHFBook(row.row))
144
- .filter(book => this.isValidForCloze(book));
 
 
 
 
 
 
 
145
 
146
  console.log(`📚 Preloaded ${this.preloadedBooks.length} suitable books`);
 
 
147
  }
148
  } catch (error) {
149
  console.warn('Failed to preload books:', error);
@@ -336,18 +353,19 @@ class HuggingFaceDatasetService {
336
 
337
  const textLength = book.text.length;
338
 
339
- // Filter criteria for cloze exercises
340
- if (textLength < 5000) return false; // Too short
341
  if (textLength > 500000) return false; // Too long for performance
342
 
343
  // Check for excessive formatting (likely reference material)
344
  const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
345
- if (lineBreakRatio > 0.01) return false; // Too fragmented
346
 
347
  // Ensure it has actual narrative content
348
  const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
349
- if (sentenceCount < 20) return false; // Too few sentences
350
 
 
351
  return true;
352
  }
353
 
 
138
  if (response.ok) {
139
  const data = await response.json();
140
 
141
+ // Check if data has expected structure
142
+ if (!data.rows || !Array.isArray(data.rows)) {
143
+ console.error('Unexpected HF API response structure:', data);
144
+ return;
145
+ }
146
+
147
+ console.log(`📥 Received ${data.rows.length} books from HF API`);
148
+
149
  // Process and filter books
150
  this.preloadedBooks = data.rows
151
+ .map(row => {
152
+ try {
153
+ return this.processHFBook(row.row);
154
+ } catch (e) {
155
+ console.warn('Error processing book:', e);
156
+ return null;
157
+ }
158
+ })
159
+ .filter(book => book && this.isValidForCloze(book));
160
 
161
  console.log(`📚 Preloaded ${this.preloadedBooks.length} suitable books`);
162
+ } else {
163
+ console.error(`HF API request failed: ${response.status} ${response.statusText}`);
164
  }
165
  } catch (error) {
166
  console.warn('Failed to preload books:', error);
 
353
 
354
  const textLength = book.text.length;
355
 
356
+ // Relaxed filter criteria for cloze exercises
357
+ if (textLength < 2000) return false; // Minimum readable length
358
  if (textLength > 500000) return false; // Too long for performance
359
 
360
  // Check for excessive formatting (likely reference material)
361
  const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
362
+ if (lineBreakRatio > 0.05) return false; // Relaxed fragmentation threshold
363
 
364
  // Ensure it has actual narrative content
365
  const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
366
+ if (sentenceCount < 10) return false; // Relaxed sentence requirement
367
 
368
+ console.log(`📖 Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
369
  return true;
370
  }
371