Spaces:
Running
Running
Refactor book processing to remove year extraction and improve fallback logic
Browse files
- src/bookDataService.js +33 -132
- src/clozeGameEngine.js +9 -0
src/bookDataService.js
CHANGED
|
@@ -189,7 +189,6 @@ class HuggingFaceDatasetService {
|
|
| 189 |
id: rowData.id || Math.random().toString(36),
|
| 190 |
title: title,
|
| 191 |
author: author,
|
| 192 |
-
year: extractedMetadata.year, // Extract year during lazy processing
|
| 193 |
rawText: rawText,
|
| 194 |
text: null, // Will clean when needed
|
| 195 |
language: rowData.language || 'en',
|
|
@@ -204,12 +203,10 @@ class HuggingFaceDatasetService {
|
|
| 204 |
console.log(`π Processing "${book.title}" on demand...`);
|
| 205 |
const startTime = Date.now();
|
| 206 |
|
| 207 |
-
// Clean text
|
| 208 |
const cleanedText = this.cleanProjectGutenbergText(book.rawText);
|
| 209 |
-
const extractedMetadata = this.extractMetadata(book.rawText);
|
| 210 |
|
| 211 |
book.text = cleanedText;
|
| 212 |
-
book.year = extractedMetadata.year || this.estimatePublicationYear(book.title, book.author);
|
| 213 |
book.processed = true;
|
| 214 |
|
| 215 |
// Validate after processing
|
|
@@ -311,7 +308,7 @@ class HuggingFaceDatasetService {
|
|
| 311 |
}
|
| 312 |
|
| 313 |
extractMetadata(text) {
|
| 314 |
-
const metadata = { title: 'Classic Literature', author: 'Unknown Author'
|
| 315 |
|
| 316 |
if (!text) return metadata;
|
| 317 |
|
|
@@ -350,12 +347,6 @@ class HuggingFaceDatasetService {
|
|
| 350 |
if (author && author.length > 1) {
|
| 351 |
metadata.author = this.cleanMetadataField(author);
|
| 352 |
}
|
| 353 |
-
} else if (line.includes('Release Date:')) {
|
| 354 |
-
// Try to extract year from release date
|
| 355 |
-
const yearMatch = line.match(/\b(1[789]\d\d|20[012]\d)\b/);
|
| 356 |
-
if (yearMatch) {
|
| 357 |
-
metadata.year = parseInt(yearMatch[1]);
|
| 358 |
-
}
|
| 359 |
}
|
| 360 |
}
|
| 361 |
|
|
@@ -369,37 +360,6 @@ class HuggingFaceDatasetService {
|
|
| 369 |
.trim();
|
| 370 |
}
|
| 371 |
|
| 372 |
-
estimatePublicationYear(title, author) {
|
| 373 |
-
// Return null to indicate unknown year rather than guessing
|
| 374 |
-
// This allows for truly random selection without bias
|
| 375 |
-
return null;
|
| 376 |
-
}
|
| 377 |
-
|
| 378 |
-
extractPublicationPeriod(text) {
|
| 379 |
-
// Look for publication year clues in the text itself
|
| 380 |
-
if (!text) return null;
|
| 381 |
-
|
| 382 |
-
// Check first 200 lines for copyright or publication information
|
| 383 |
-
const lines = text.split('\n').slice(0, 200);
|
| 384 |
-
const textSnippet = lines.join(' ');
|
| 385 |
-
|
| 386 |
-
// Look for explicit year mentions in copyright notices or metadata
|
| 387 |
-
const yearPatterns = [
|
| 388 |
-
/(?:copyright|©|published|publication date)[:\s]+.*?\b(1[6-9]\d{2}|20[0-2]\d)\b/i,
|
| 389 |
-
/\b(1[6-9]\d{2}|20[0-2]\d)\b[,\s]+by\s+/i,
|
| 390 |
-
/first published[:\s]+.*?\b(1[6-9]\d{2}|20[0-2]\d)\b/i,
|
| 391 |
-
/originally published[:\s]+.*?\b(1[6-9]\d{2}|20[0-2]\d)\b/i
|
| 392 |
-
];
|
| 393 |
-
|
| 394 |
-
for (const pattern of yearPatterns) {
|
| 395 |
-
const match = textSnippet.match(pattern);
|
| 396 |
-
if (match) {
|
| 397 |
-
return parseInt(match[1]);
|
| 398 |
-
}
|
| 399 |
-
}
|
| 400 |
-
|
| 401 |
-
return null;
|
| 402 |
-
}
|
| 403 |
|
| 404 |
isValidTitle(title) {
|
| 405 |
if (!title || title.length < 3 || title.length > 100) return false;
|
|
@@ -446,47 +406,44 @@ class HuggingFaceDatasetService {
|
|
| 446 |
throw new Error('Dataset not loaded');
|
| 447 |
}
|
| 448 |
|
| 449 |
-
//
|
| 450 |
-
|
| 451 |
-
|
|
|
|
|
|
|
| 452 |
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
if (availableBooks.length > 0) {
|
| 460 |
-
const randomIndex = Math.floor(Math.random() * availableBooks.length);
|
| 461 |
-
book = availableBooks[randomIndex];
|
| 462 |
-
|
| 463 |
-
// Process book on demand
|
| 464 |
-
book = await this.processBookOnDemand(book);
|
| 465 |
-
if (!book) continue; // Book failed validation, try next
|
| 466 |
-
} else {
|
| 467 |
-
// All preloaded books used, try streaming
|
| 468 |
-
book = await this.getStreamingBook();
|
| 469 |
-
}
|
| 470 |
-
} else {
|
| 471 |
-
// Use local samples for remaining 10% + fallback
|
| 472 |
-
const fallbackBooks = this.books.length > 0 ? this.books : this.getSampleBooks();
|
| 473 |
-
const availableBooks = fallbackBooks.filter(book =>
|
| 474 |
-
!this.usedBooks.has(this.getBookId(book))
|
| 475 |
-
);
|
| 476 |
-
|
| 477 |
-
if (availableBooks.length > 0) {
|
| 478 |
-
const randomIndex = Math.floor(Math.random() * availableBooks.length);
|
| 479 |
-
book = availableBooks[randomIndex];
|
| 480 |
}
|
| 481 |
}
|
| 482 |
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
| 486 |
}
|
| 487 |
}
|
| 488 |
|
| 489 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
this.usedBooks.clear();
|
| 491 |
console.log('All books used, cleared used book cache');
|
| 492 |
return this.getRandomBook();
|
|
@@ -532,65 +489,9 @@ class HuggingFaceDatasetService {
|
|
| 532 |
}
|
| 533 |
|
| 534 |
async getBookByLevelCriteria(level) {
|
| 535 |
-
let targetPeriod = null;
|
| 536 |
-
if (level <= 2) {
|
| 537 |
-
targetPeriod = { min: 1850, max: 1925 };
|
| 538 |
-
} else if (level <= 4) {
|
| 539 |
-
targetPeriod = { min: 1800, max: 1899 };
|
| 540 |
-
}
|
| 541 |
-
|
| 542 |
-
if (targetPeriod) {
|
| 543 |
-
const periodBooks = await this.getBooksByPeriod(targetPeriod.min, targetPeriod.max);
|
| 544 |
-
|
| 545 |
-
if (periodBooks.length > 0) {
|
| 546 |
-
const randomIndex = Math.floor(Math.random() * periodBooks.length);
|
| 547 |
-
let book = periodBooks[randomIndex];
|
| 548 |
-
|
| 549 |
-
if (book.source === 'project_gutenberg' && !book.processed) {
|
| 550 |
-
book = await this.processBookOnDemand(book);
|
| 551 |
-
if (!book) {
|
| 552 |
-
return await this.getRandomBook();
|
| 553 |
-
}
|
| 554 |
-
}
|
| 555 |
-
|
| 556 |
-
return book;
|
| 557 |
-
}
|
| 558 |
-
}
|
| 559 |
-
|
| 560 |
return await this.getRandomBook();
|
| 561 |
}
|
| 562 |
|
| 563 |
-
async getBooksByPeriod(minYear, maxYear) {
|
| 564 |
-
const matchingBooks = [];
|
| 565 |
-
|
| 566 |
-
if (this.streamingEnabled && this.preloadedBooks.length > 0) {
|
| 567 |
-
for (const book of this.preloadedBooks) {
|
| 568 |
-
if (!this.usedBooks.has(this.getBookId(book))) {
|
| 569 |
-
let year = book.year;
|
| 570 |
-
if (!year && book.rawText) {
|
| 571 |
-
year = this.extractPublicationPeriod(book.rawText);
|
| 572 |
-
book.year = year;
|
| 573 |
-
}
|
| 574 |
-
|
| 575 |
-
if (year && year >= minYear && year <= maxYear) {
|
| 576 |
-
matchingBooks.push(book);
|
| 577 |
-
}
|
| 578 |
-
}
|
| 579 |
-
}
|
| 580 |
-
}
|
| 581 |
-
|
| 582 |
-
const fallbackBooks = this.books.length > 0 ? this.books : this.getSampleBooks();
|
| 583 |
-
for (const book of fallbackBooks) {
|
| 584 |
-
if (!this.usedBooks.has(this.getBookId(book))) {
|
| 585 |
-
const year = book.year || this.extractPublicationPeriod(book.text);
|
| 586 |
-
if (year && year >= minYear && year <= maxYear) {
|
| 587 |
-
matchingBooks.push(book);
|
| 588 |
-
}
|
| 589 |
-
}
|
| 590 |
-
}
|
| 591 |
-
|
| 592 |
-
return matchingBooks;
|
| 593 |
-
}
|
| 594 |
|
| 595 |
|
| 596 |
getBookById(id) {
|
|
|
|
| 189 |
id: rowData.id || Math.random().toString(36),
|
| 190 |
title: title,
|
| 191 |
author: author,
|
|
|
|
| 192 |
rawText: rawText,
|
| 193 |
text: null, // Will clean when needed
|
| 194 |
language: rowData.language || 'en',
|
|
|
|
| 203 |
console.log(`π Processing "${book.title}" on demand...`);
|
| 204 |
const startTime = Date.now();
|
| 205 |
|
| 206 |
+
// Clean text when actually needed
|
| 207 |
const cleanedText = this.cleanProjectGutenbergText(book.rawText);
|
|
|
|
| 208 |
|
| 209 |
book.text = cleanedText;
|
|
|
|
| 210 |
book.processed = true;
|
| 211 |
|
| 212 |
// Validate after processing
|
|
|
|
| 308 |
}
|
| 309 |
|
| 310 |
extractMetadata(text) {
|
| 311 |
+
const metadata = { title: 'Classic Literature', author: 'Unknown Author' };
|
| 312 |
|
| 313 |
if (!text) return metadata;
|
| 314 |
|
|
|
|
| 347 |
if (author && author.length > 1) {
|
| 348 |
metadata.author = this.cleanMetadataField(author);
|
| 349 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
}
|
| 351 |
}
|
| 352 |
|
|
|
|
| 360 |
.trim();
|
| 361 |
}
|
| 362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
|
| 364 |
isValidTitle(title) {
|
| 365 |
if (!title || title.length < 3 || title.length > 100) return false;
|
|
|
|
| 406 |
throw new Error('Dataset not loaded');
|
| 407 |
}
|
| 408 |
|
| 409 |
+
// First, try to find a successfully processed HF book
|
| 410 |
+
if (this.streamingEnabled && this.preloadedBooks.length > 0) {
|
| 411 |
+
const availableHFBooks = this.preloadedBooks.filter(book =>
|
| 412 |
+
!this.usedBooks.has(this.getBookId(book))
|
| 413 |
+
);
|
| 414 |
|
| 415 |
+
for (const book of availableHFBooks) {
|
| 416 |
+
const processedBook = await this.processBookOnDemand(book);
|
| 417 |
+
if (processedBook) {
|
| 418 |
+
this.usedBooks.add(this.getBookId(processedBook));
|
| 419 |
+
console.log(`π Using HF book: "${processedBook.title}"`);
|
| 420 |
+
return processedBook;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
}
|
| 422 |
}
|
| 423 |
|
| 424 |
+
// If no HF books worked, try streaming
|
| 425 |
+
const streamedBook = await this.getStreamingBook();
|
| 426 |
+
if (streamedBook) {
|
| 427 |
+
this.usedBooks.add(this.getBookId(streamedBook));
|
| 428 |
+
return streamedBook;
|
| 429 |
}
|
| 430 |
}
|
| 431 |
|
| 432 |
+
// Fallback to local samples
|
| 433 |
+
const fallbackBooks = this.books.length > 0 ? this.books : this.getSampleBooks();
|
| 434 |
+
const availableBooks = fallbackBooks.filter(book =>
|
| 435 |
+
!this.usedBooks.has(this.getBookId(book))
|
| 436 |
+
);
|
| 437 |
+
|
| 438 |
+
if (availableBooks.length > 0) {
|
| 439 |
+
const randomIndex = Math.floor(Math.random() * availableBooks.length);
|
| 440 |
+
const book = availableBooks[randomIndex];
|
| 441 |
+
this.usedBooks.add(this.getBookId(book));
|
| 442 |
+
console.log(`π Using local book: "${book.title}"`);
|
| 443 |
+
return book;
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
// If all books used, clear cache and start over
|
| 447 |
this.usedBooks.clear();
|
| 448 |
console.log('All books used, cleared used book cache');
|
| 449 |
return this.getRandomBook();
|
|
|
|
| 489 |
}
|
| 490 |
|
| 491 |
async getBookByLevelCriteria(level) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
return await this.getRandomBook();
|
| 493 |
}
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
|
| 497 |
getBookById(id) {
|
src/clozeGameEngine.js
CHANGED
|
@@ -204,6 +204,15 @@ class ClozeGame {
|
|
| 204 |
expectedBlanks = 3;
|
| 205 |
}
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
// Limit selected words to expected number
|
| 208 |
if (selectedWords.length > expectedBlanks) {
|
| 209 |
console.log(`AI returned ${selectedWords.length} words but expected ${expectedBlanks}, limiting to ${expectedBlanks}`);
|
|
|
|
| 204 |
expectedBlanks = 3;
|
| 205 |
}
|
| 206 |
|
| 207 |
+
// If AI didn't provide enough words, fall back to manual selection
|
| 208 |
+
if (selectedWords.length < expectedBlanks) {
|
| 209 |
+
console.warn(`AI provided ${selectedWords.length} words but need ${expectedBlanks}, using fallback`);
|
| 210 |
+
const words = this.originalText.split(/\s+/);
|
| 211 |
+
const fallbackWords = this.selectWordsManually(words, expectedBlanks - selectedWords.length);
|
| 212 |
+
selectedWords = [...selectedWords, ...fallbackWords].slice(0, expectedBlanks);
|
| 213 |
+
console.log(`Combined AI + fallback words:`, selectedWords);
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
// Limit selected words to expected number
|
| 217 |
if (selectedWords.length > expectedBlanks) {
|
| 218 |
console.log(`AI returned ${selectedWords.length} words but expected ${expectedBlanks}, limiting to ${expectedBlanks}`);
|