milwright committed on
Commit
bac89b2
·
1 Parent(s): d5848fc

Refactor book processing to remove year extraction and improve fallback logic

Browse files
Files changed (2) hide show
  1. src/bookDataService.js +33 -132
  2. src/clozeGameEngine.js +9 -0
src/bookDataService.js CHANGED
@@ -189,7 +189,6 @@ class HuggingFaceDatasetService {
189
  id: rowData.id || Math.random().toString(36),
190
  title: title,
191
  author: author,
192
- year: extractedMetadata.year, // Extract year during lazy processing
193
  rawText: rawText,
194
  text: null, // Will clean when needed
195
  language: rowData.language || 'en',
@@ -204,12 +203,10 @@ class HuggingFaceDatasetService {
204
  console.log(`🔄 Processing "${book.title}" on demand...`);
205
  const startTime = Date.now();
206
 
207
- // Clean text and extract metadata when actually needed
208
  const cleanedText = this.cleanProjectGutenbergText(book.rawText);
209
- const extractedMetadata = this.extractMetadata(book.rawText);
210
 
211
  book.text = cleanedText;
212
- book.year = extractedMetadata.year || this.estimatePublicationYear(book.title, book.author);
213
  book.processed = true;
214
 
215
  // Validate after processing
@@ -311,7 +308,7 @@ class HuggingFaceDatasetService {
311
  }
312
 
313
  extractMetadata(text) {
314
- const metadata = { title: 'Classic Literature', author: 'Unknown Author', year: null };
315
 
316
  if (!text) return metadata;
317
 
@@ -350,12 +347,6 @@ class HuggingFaceDatasetService {
350
  if (author && author.length > 1) {
351
  metadata.author = this.cleanMetadataField(author);
352
  }
353
- } else if (line.includes('Release Date:')) {
354
- // Try to extract year from release date
355
- const yearMatch = line.match(/\b(1[789]\d\d|20[012]\d)\b/);
356
- if (yearMatch) {
357
- metadata.year = parseInt(yearMatch[1]);
358
- }
359
  }
360
  }
361
 
@@ -369,37 +360,6 @@ class HuggingFaceDatasetService {
369
  .trim();
370
  }
371
 
372
- estimatePublicationYear(title, author) {
373
- // Return null to indicate unknown year rather than guessing
374
- // This allows for truly random selection without bias
375
- return null;
376
- }
377
-
378
- extractPublicationPeriod(text) {
379
- // Look for publication year clues in the text itself
380
- if (!text) return null;
381
-
382
- // Check first 200 lines for copyright or publication information
383
- const lines = text.split('\n').slice(0, 200);
384
- const textSnippet = lines.join(' ');
385
-
386
- // Look for explicit year mentions in copyright notices or metadata
387
- const yearPatterns = [
388
- /(?:copyright|©|published|publication date)[:\s]+.*?\b(1[6-9]\d{2}|20[0-2]\d)\b/i,
389
- /\b(1[6-9]\d{2}|20[0-2]\d)\b[,\s]+by\s+/i,
390
- /first published[:\s]+.*?\b(1[6-9]\d{2}|20[0-2]\d)\b/i,
391
- /originally published[:\s]+.*?\b(1[6-9]\d{2}|20[0-2]\d)\b/i
392
- ];
393
-
394
- for (const pattern of yearPatterns) {
395
- const match = textSnippet.match(pattern);
396
- if (match) {
397
- return parseInt(match[1]);
398
- }
399
- }
400
-
401
- return null;
402
- }
403
 
404
  isValidTitle(title) {
405
  if (!title || title.length < 3 || title.length > 100) return false;
@@ -446,47 +406,44 @@ class HuggingFaceDatasetService {
446
  throw new Error('Dataset not loaded');
447
  }
448
 
449
- // Try multiple times to get an unused book
450
- for (let attempt = 0; attempt < 10; attempt++) {
451
- let book = null;
 
 
452
 
453
- // Prioritize preloaded books for fast access (90% chance)
454
- if (this.streamingEnabled && this.preloadedBooks.length > 0 && Math.random() > 0.1) {
455
- const availableBooks = this.preloadedBooks.filter(book =>
456
- !this.usedBooks.has(this.getBookId(book))
457
- );
458
-
459
- if (availableBooks.length > 0) {
460
- const randomIndex = Math.floor(Math.random() * availableBooks.length);
461
- book = availableBooks[randomIndex];
462
-
463
- // Process book on demand
464
- book = await this.processBookOnDemand(book);
465
- if (!book) continue; // Book failed validation, try next
466
- } else {
467
- // All preloaded books used, try streaming
468
- book = await this.getStreamingBook();
469
- }
470
- } else {
471
- // Use local samples for remaining 10% + fallback
472
- const fallbackBooks = this.books.length > 0 ? this.books : this.getSampleBooks();
473
- const availableBooks = fallbackBooks.filter(book =>
474
- !this.usedBooks.has(this.getBookId(book))
475
- );
476
-
477
- if (availableBooks.length > 0) {
478
- const randomIndex = Math.floor(Math.random() * availableBooks.length);
479
- book = availableBooks[randomIndex];
480
  }
481
  }
482
 
483
- if (book && !this.usedBooks.has(this.getBookId(book))) {
484
- this.usedBooks.add(this.getBookId(book));
485
- return book;
 
 
486
  }
487
  }
488
 
489
- // If all attempts failed, clear used books and start over
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  this.usedBooks.clear();
491
  console.log('All books used, cleared used book cache');
492
  return this.getRandomBook();
@@ -532,65 +489,9 @@ class HuggingFaceDatasetService {
532
  }
533
 
534
  async getBookByLevelCriteria(level) {
535
- let targetPeriod = null;
536
- if (level <= 2) {
537
- targetPeriod = { min: 1850, max: 1925 };
538
- } else if (level <= 4) {
539
- targetPeriod = { min: 1800, max: 1899 };
540
- }
541
-
542
- if (targetPeriod) {
543
- const periodBooks = await this.getBooksByPeriod(targetPeriod.min, targetPeriod.max);
544
-
545
- if (periodBooks.length > 0) {
546
- const randomIndex = Math.floor(Math.random() * periodBooks.length);
547
- let book = periodBooks[randomIndex];
548
-
549
- if (book.source === 'project_gutenberg' && !book.processed) {
550
- book = await this.processBookOnDemand(book);
551
- if (!book) {
552
- return await this.getRandomBook();
553
- }
554
- }
555
-
556
- return book;
557
- }
558
- }
559
-
560
  return await this.getRandomBook();
561
  }
562
 
563
- async getBooksByPeriod(minYear, maxYear) {
564
- const matchingBooks = [];
565
-
566
- if (this.streamingEnabled && this.preloadedBooks.length > 0) {
567
- for (const book of this.preloadedBooks) {
568
- if (!this.usedBooks.has(this.getBookId(book))) {
569
- let year = book.year;
570
- if (!year && book.rawText) {
571
- year = this.extractPublicationPeriod(book.rawText);
572
- book.year = year;
573
- }
574
-
575
- if (year && year >= minYear && year <= maxYear) {
576
- matchingBooks.push(book);
577
- }
578
- }
579
- }
580
- }
581
-
582
- const fallbackBooks = this.books.length > 0 ? this.books : this.getSampleBooks();
583
- for (const book of fallbackBooks) {
584
- if (!this.usedBooks.has(this.getBookId(book))) {
585
- const year = book.year || this.extractPublicationPeriod(book.text);
586
- if (year && year >= minYear && year <= maxYear) {
587
- matchingBooks.push(book);
588
- }
589
- }
590
- }
591
-
592
- return matchingBooks;
593
- }
594
 
595
 
596
  getBookById(id) {
 
189
  id: rowData.id || Math.random().toString(36),
190
  title: title,
191
  author: author,
 
192
  rawText: rawText,
193
  text: null, // Will clean when needed
194
  language: rowData.language || 'en',
 
203
  console.log(`🔄 Processing "${book.title}" on demand...`);
204
  const startTime = Date.now();
205
 
206
+ // Clean text when actually needed
207
  const cleanedText = this.cleanProjectGutenbergText(book.rawText);
 
208
 
209
  book.text = cleanedText;
 
210
  book.processed = true;
211
 
212
  // Validate after processing
 
308
  }
309
 
310
  extractMetadata(text) {
311
+ const metadata = { title: 'Classic Literature', author: 'Unknown Author' };
312
 
313
  if (!text) return metadata;
314
 
 
347
  if (author && author.length > 1) {
348
  metadata.author = this.cleanMetadataField(author);
349
  }
 
 
 
 
 
 
350
  }
351
  }
352
 
 
360
  .trim();
361
  }
362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
  isValidTitle(title) {
365
  if (!title || title.length < 3 || title.length > 100) return false;
 
406
  throw new Error('Dataset not loaded');
407
  }
408
 
409
+ // First, try to find a successfully processed HF book
410
+ if (this.streamingEnabled && this.preloadedBooks.length > 0) {
411
+ const availableHFBooks = this.preloadedBooks.filter(book =>
412
+ !this.usedBooks.has(this.getBookId(book))
413
+ );
414
 
415
+ for (const book of availableHFBooks) {
416
+ const processedBook = await this.processBookOnDemand(book);
417
+ if (processedBook) {
418
+ this.usedBooks.add(this.getBookId(processedBook));
419
+ console.log(`📚 Using HF book: "${processedBook.title}"`);
420
+ return processedBook;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  }
422
  }
423
 
424
+ // If no HF books worked, try streaming
425
+ const streamedBook = await this.getStreamingBook();
426
+ if (streamedBook) {
427
+ this.usedBooks.add(this.getBookId(streamedBook));
428
+ return streamedBook;
429
  }
430
  }
431
 
432
+ // Fallback to local samples
433
+ const fallbackBooks = this.books.length > 0 ? this.books : this.getSampleBooks();
434
+ const availableBooks = fallbackBooks.filter(book =>
435
+ !this.usedBooks.has(this.getBookId(book))
436
+ );
437
+
438
+ if (availableBooks.length > 0) {
439
+ const randomIndex = Math.floor(Math.random() * availableBooks.length);
440
+ const book = availableBooks[randomIndex];
441
+ this.usedBooks.add(this.getBookId(book));
442
+ console.log(`📚 Using local book: "${book.title}"`);
443
+ return book;
444
+ }
445
+
446
+ // If all books used, clear cache and start over
447
  this.usedBooks.clear();
448
  console.log('All books used, cleared used book cache');
449
  return this.getRandomBook();
 
489
  }
490
 
491
  async getBookByLevelCriteria(level) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  return await this.getRandomBook();
493
  }
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
 
497
  getBookById(id) {
src/clozeGameEngine.js CHANGED
@@ -204,6 +204,15 @@ class ClozeGame {
204
  expectedBlanks = 3;
205
  }
206
 
 
 
 
 
 
 
 
 
 
207
  // Limit selected words to expected number
208
  if (selectedWords.length > expectedBlanks) {
209
  console.log(`AI returned ${selectedWords.length} words but expected ${expectedBlanks}, limiting to ${expectedBlanks}`);
 
204
  expectedBlanks = 3;
205
  }
206
 
207
+ // If AI didn't provide enough words, fall back to manual selection
208
+ if (selectedWords.length < expectedBlanks) {
209
+ console.warn(`AI provided ${selectedWords.length} words but need ${expectedBlanks}, using fallback`);
210
+ const words = this.originalText.split(/\s+/);
211
+ const fallbackWords = this.selectWordsManually(words, expectedBlanks - selectedWords.length);
212
+ selectedWords = [...selectedWords, ...fallbackWords].slice(0, expectedBlanks);
213
+ console.log(`Combined AI + fallback words:`, selectedWords);
214
+ }
215
+
216
  // Limit selected words to expected number
217
  if (selectedWords.length > expectedBlanks) {
218
  console.log(`AI returned ${selectedWords.length} words but expected ${expectedBlanks}, limiting to ${expectedBlanks}`);