brickfrog committed on
Commit
7e2bb59
·
verified ·
1 Parent(s): 6604cbf

Upload folder using huggingface_hub

Browse files
ankigen_core/card_generator.py CHANGED
@@ -52,16 +52,6 @@ GENERATION_MODES = [
52
  "label": "Single Subject",
53
  "description": "Generate cards for a specific topic",
54
  },
55
- {
56
- "value": "text",
57
- "label": "From Text",
58
- "description": "Generate cards from provided text",
59
- },
60
- {
61
- "value": "web",
62
- "label": "From Web",
63
- "description": "Generate cards from a web page URL",
64
- },
65
  ]
66
 
67
  # --- Core Functions --- (Moved and adapted from app.py)
@@ -279,97 +269,6 @@ def get_dataframe_columns() -> list[str]:
279
  ]
280
 
281
 
282
- # This function might be specific to the old crawler flow if AnkiCardData is only from there.
283
- # If orchestrate_card_generation now also produces something convertible to AnkiCardData, it might be useful.
284
- # For now, it's used by generate_cards_from_crawled_content.
285
- def deduplicate_cards(cards: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
286
- """Deduplicates a list of card dictionaries based on the 'Question' field."""
287
- seen_questions = set()
288
- unique_cards = []
289
- for card_dict in cards:
290
- question = card_dict.get("Question")
291
- if question is None: # Should not happen if cards are well-formed
292
- logger.warning(f"Card dictionary missing 'Question' key: {card_dict}")
293
- unique_cards.append(card_dict) # Keep it if no question to dedupe on
294
- continue
295
-
296
- # Normalize whitespace and case for deduplication
297
- normalized_question = " ".join(str(question).strip().lower().split())
298
- if normalized_question not in seen_questions:
299
- seen_questions.add(normalized_question)
300
- unique_cards.append(card_dict)
301
- else:
302
- logger.info(f"Deduplicated card with question: {question}")
303
- return unique_cards
304
-
305
-
306
- # --- Modification for generate_cards_from_crawled_content ---
307
-
308
-
309
- def generate_cards_from_crawled_content(
310
- all_cards: List[Card],
311
- ) -> List[Dict[str, Any]]: # Changed AnkiCardData to Card
312
- """
313
- Processes a list of Card objects (expected to have plain text fields after generate_cards_batch)
314
- and formats them into a list of dictionaries suitable for the DataFrame.
315
- """
316
- if not all_cards:
317
- return []
318
-
319
- data_for_dataframe = []
320
- for i, card_obj in enumerate(all_cards):
321
- # Extract data, assuming it's already plain text from Card object creation
322
- topic = (
323
- card_obj.metadata.get("topic", f"Crawled Content - Card {i + 1}")
324
- if card_obj.metadata
325
- else f"Crawled Content - Card {i + 1}"
326
- )
327
-
328
- # Ensure list-based metadata are joined as plain strings for DataFrame
329
- prerequisites = (
330
- card_obj.metadata.get("prerequisites", []) if card_obj.metadata else []
331
- )
332
- learning_outcomes = (
333
- card_obj.metadata.get("learning_outcomes", []) if card_obj.metadata else []
334
- )
335
-
336
- prerequisites_str = strip_html_tags(
337
- ", ".join(prerequisites)
338
- if isinstance(prerequisites, list)
339
- else str(prerequisites)
340
- )
341
- learning_outcomes_str = strip_html_tags(
342
- ", ".join(learning_outcomes)
343
- if isinstance(learning_outcomes, list)
344
- else str(learning_outcomes)
345
- )
346
- difficulty_str = strip_html_tags(
347
- str(
348
- card_obj.metadata.get("difficulty", "N/A")
349
- if card_obj.metadata
350
- else "N/A"
351
- )
352
- )
353
-
354
- card_dict = {
355
- "Index": str(i + 1),
356
- "Topic": strip_html_tags(topic),
357
- "Card_Type": strip_html_tags(card_obj.card_type or "basic"),
358
- "Question": card_obj.front.question or "", # Should be plain
359
- "Answer": card_obj.back.answer or "", # Should be plain
360
- "Explanation": card_obj.back.explanation or "", # Should be plain
361
- "Example": card_obj.back.example or "", # Should be plain
362
- "Prerequisites": prerequisites_str,
363
- "Learning_Outcomes": learning_outcomes_str,
364
- "Difficulty": difficulty_str,
365
- "Source_URL": strip_html_tags(
366
- card_obj.metadata.get("source_url", "") if card_obj.metadata else ""
367
- ),
368
- }
369
- data_for_dataframe.append(card_dict)
370
- return data_for_dataframe
371
-
372
-
373
  def generate_token_usage_html(token_usage=None):
374
  """Generate HTML for token usage display"""
375
  if token_usage and isinstance(token_usage, dict):
 
52
  "label": "Single Subject",
53
  "description": "Generate cards for a specific topic",
54
  },
 
 
 
 
 
 
 
 
 
 
55
  ]
56
 
57
  # --- Core Functions --- (Moved and adapted from app.py)
 
269
  ]
270
 
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  def generate_token_usage_html(token_usage=None):
273
  """Generate HTML for token usage display"""
274
  if token_usage and isinstance(token_usage, dict):
ankigen_core/exceptions.py CHANGED
@@ -41,24 +41,6 @@ class Context7APIError(APIError):
41
  pass
42
 
43
 
44
- class CrawlerError(AnkigenError):
45
- """Base exception for web crawler errors."""
46
-
47
- pass
48
-
49
-
50
- class URLValidationError(CrawlerError):
51
- """Raised when URL validation fails."""
52
-
53
- pass
54
-
55
-
56
- class ContentExtractionError(CrawlerError):
57
- """Raised when content extraction from web page fails."""
58
-
59
- pass
60
-
61
-
62
  class ExportError(AnkigenError):
63
  """Base exception for export-related errors."""
64
 
 
41
  pass
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  class ExportError(AnkigenError):
45
  """Base exception for export-related errors."""
46
 
ankigen_core/llm_interface.py CHANGED
@@ -2,9 +2,8 @@
2
 
3
  import asyncio
4
  import time
5
- from typing import Callable, List, Optional, TypeVar
6
 
7
- import tiktoken
8
  from agents import Agent, ModelSettings, Runner, set_default_openai_client
9
  from openai import (
10
  APIConnectionError,
@@ -14,15 +13,8 @@ from openai import (
14
  RateLimitError,
15
  )
16
  from pydantic import BaseModel
17
- from tenacity import (
18
- retry,
19
- retry_if_exception_type,
20
- stop_after_attempt,
21
- wait_exponential,
22
- )
23
 
24
  from ankigen_core.logging import logger
25
- from ankigen_core.models import Card, CardBack, CardFront, CrawledPage
26
  from ankigen_core.utils import ResponseCache
27
 
28
  T = TypeVar("T", bound=BaseModel)
@@ -343,350 +335,3 @@ class OpenAIRateLimiter:
343
  # This assumes a single rate limit bucket for all calls from this application instance.
344
  # More sophisticated scenarios might need per-model or per-key limiters.
345
  openai_rate_limiter = OpenAIRateLimiter() # Using default 60k TPM for now
346
-
347
-
348
- @retry(
349
- stop=stop_after_attempt(3),
350
- wait=wait_exponential(multiplier=1, min=2, max=10),
351
- retry=retry_if_exception_type(RETRYABLE_OPENAI_ERRORS),
352
- before_sleep=lambda retry_state: logger.warning(
353
- f"Retrying OpenAI call (attempt {retry_state.attempt_number}) for process_crawled_page due to {retry_state.outcome.exception() if retry_state.outcome else 'unknown reason'}"
354
- ),
355
- )
356
- async def process_crawled_page(
357
- openai_client: AsyncOpenAI,
358
- page: CrawledPage,
359
- model: str = "gpt-4o",
360
- custom_system_prompt: Optional[str] = None,
361
- custom_user_prompt_template: Optional[str] = None,
362
- max_prompt_content_tokens: int = 6000,
363
- cache: Optional[ResponseCache] = None,
364
- ) -> List[Card]:
365
- """Process a crawled page and extract structured Card objects using OpenAI.
366
-
367
- Args:
368
- openai_client: The OpenAI client instance
369
- page: The crawled page to process
370
- model: The model to use for generation
371
- custom_system_prompt: Optional custom system prompt
372
- custom_user_prompt_template: Optional custom user prompt template
373
- max_prompt_content_tokens: Maximum tokens for content
374
- cache: Optional ResponseCache for page-level caching
375
-
376
- Returns:
377
- List of generated Card objects
378
- """
379
- # Check page-level cache first
380
- if cache:
381
- cache_key = f"{page.url}:{model}"
382
- cached_cards = cache.get(cache_key, "page_cache")
383
- if cached_cards is not None:
384
- logger.info(f"Using cached cards for page: {page.url}")
385
- return cached_cards
386
-
387
- logger.info(
388
- f"Processing page: {page.url} with model {model}, max_prompt_content_tokens: {max_prompt_content_tokens}"
389
- )
390
-
391
- if not page.text_content or not page.text_content.strip():
392
- logger.info(f"Skipping page {page.url} as it has empty text content.")
393
- return []
394
-
395
- system_prompt = (
396
- custom_system_prompt
397
- if custom_system_prompt and custom_system_prompt.strip()
398
- else """
399
- You are an expert Anki card creator. Your task is to generate Anki flashcards from the provided web page content.
400
- For each card, provide:
401
- - "front": A dictionary with a "question" field.
402
- - "back": A dictionary with "answer", "explanation", and "example" fields.
403
- - "tags": A list of relevant keywords (optional).
404
- - "source_url": The URL of the page the content was extracted from (this will be provided by the system).
405
- - "note_type": Specify "Basic" for question/answer cards or "Cloze" for cloze deletion cards. (This will be mapped to "card_type").
406
- - "metadata": An optional dictionary for additional structured information such as:
407
- - "prerequisites": ["list", "of", "prerequisites"]
408
- - "learning_outcomes": ["list", "of", "learning", "outcomes"]
409
- - "common_misconceptions": ["list", "of", "common", "misconceptions"]
410
- - "difficulty": "beginner" | "intermediate" | "advanced"
411
- - "topic": "The main topic this card relates to, derived from the content"
412
-
413
- Focus on creating clear, concise, and accurate cards that are useful for learning.
414
- If generating cloze cards, ensure the "front.question" field uses Anki's cloze syntax, e.g., "The capital of {{c1::France}} is Paris."
415
- Ensure the entire response is a valid JSON object following this structure:
416
- {
417
- "cards": [
418
- {
419
- "front": {"question": "..."},
420
- "back": {"answer": "...", "explanation": "...", "example": "..."},
421
- "tags": ["...", "..."],
422
- "card_type": "Basic",
423
- "metadata": {"difficulty": "beginner", "prerequisites": [], "topic": "..."}
424
- },
425
- // ... more cards
426
- ]
427
- }
428
- """
429
- )
430
-
431
- # User Prompt
432
- default_user_prompt_template = """
433
- Please generate Anki cards based on the following content from the URL: {url}
434
-
435
- Content:
436
- {content}
437
-
438
- Generate a few high-quality Anki cards from this content.
439
- """
440
- user_prompt: str
441
- if custom_user_prompt_template and custom_user_prompt_template.strip():
442
- try:
443
- user_prompt = custom_user_prompt_template.format(
444
- url=page.url, content=page.text_content
445
- )
446
- except KeyError as e:
447
- logger.warning(
448
- f"Custom user prompt template for {page.url} is malformed (missing key {e}). Falling back to default."
449
- )
450
- user_prompt = default_user_prompt_template.format(
451
- url=page.url, content=page.text_content
452
- )
453
- else:
454
- user_prompt = default_user_prompt_template.format(
455
- url=page.url, content=page.text_content
456
- )
457
- # --- End Prompt Definition ---
458
-
459
- try:
460
- encoding = tiktoken.encoding_for_model(model)
461
- except KeyError:
462
- logger.warning(
463
- f"Tiktoken model {model} not found, using cl100k_base for token estimation and truncation."
464
- )
465
- encoding = tiktoken.get_encoding("cl100k_base")
466
-
467
- prompt_structure_tokens = len(encoding.encode(system_prompt + user_prompt))
468
- available_tokens_for_content = max_prompt_content_tokens - prompt_structure_tokens
469
- if available_tokens_for_content <= 0:
470
- logger.error(
471
- f"Max prompt tokens ({max_prompt_content_tokens}) too small for prompt structure for page {page.url}. Cannot process."
472
- )
473
- return []
474
-
475
- page_content_for_prompt = page.text_content or ""
476
- content_tokens = encoding.encode(page_content_for_prompt)
477
- if len(content_tokens) > available_tokens_for_content:
478
- truncated_content_tokens = content_tokens[:available_tokens_for_content]
479
- page_content_for_prompt = encoding.decode(truncated_content_tokens)
480
- logger.warning(
481
- f"Content for page {page.url} was truncated from {len(content_tokens)} tokens "
482
- f"to {len(truncated_content_tokens)} tokens to fit model's context window (limit: {max_prompt_content_tokens} for content portion)."
483
- )
484
-
485
- estimated_request_tokens = prompt_structure_tokens + len(
486
- encoding.encode(page_content_for_prompt)
487
- )
488
- await openai_rate_limiter.wait_if_needed(estimated_request_tokens)
489
-
490
- try:
491
- logger.debug(
492
- f"Attempting to generate cards for {page.url} using model {model}."
493
- )
494
-
495
- # Use agents SDK for structured output
496
- result = await structured_agent_call(
497
- openai_client=openai_client,
498
- model=model,
499
- instructions=system_prompt,
500
- user_input=user_prompt,
501
- output_type=GenericJsonOutput, # Flexible schema for card generation
502
- temperature=0.5,
503
- timeout=120.0,
504
- )
505
-
506
- if result is None:
507
- logger.error(f"Invalid or empty response from agent for page {page.url}.")
508
- return []
509
-
510
- # Convert Pydantic model to dict for processing
511
- parsed_cards = result.model_dump() if isinstance(result, BaseModel) else result
512
-
513
- validated_cards: List[Card] = []
514
-
515
- cards_list_from_json = []
516
- if (
517
- isinstance(parsed_cards, dict)
518
- and "cards" in parsed_cards
519
- and isinstance(parsed_cards["cards"], list)
520
- ):
521
- cards_list_from_json = parsed_cards["cards"]
522
- logger.info(
523
- f"Found 'cards' key in response from {page.url} with {len(cards_list_from_json)} cards"
524
- )
525
- elif isinstance(parsed_cards, list):
526
- cards_list_from_json = parsed_cards
527
- else:
528
- logger.error(
529
- f"LLM response for {page.url} was not a list or valid dict. Response: {str(parsed_cards)[:200]}..."
530
- )
531
- return []
532
-
533
- for card_dict in cards_list_from_json:
534
- if not isinstance(card_dict, dict):
535
- logger.warning(
536
- f"Skipping non-dict card item for {page.url}: {card_dict}"
537
- )
538
- continue
539
-
540
- try:
541
- front_data = card_dict.get("front")
542
- back_data = card_dict.get("back")
543
-
544
- if not isinstance(front_data, dict) or "question" not in front_data:
545
- logger.warning(
546
- f"Malformed 'front' data in card_dict for {page.url}: {front_data}. Skipping card."
547
- )
548
- continue
549
- if not isinstance(back_data, dict) or "answer" not in back_data:
550
- logger.warning(
551
- f"Malformed 'back' data in card_dict for {page.url}: {back_data}. Skipping card."
552
- )
553
- continue
554
-
555
- metadata_payload = card_dict.get("metadata", {})
556
- if not isinstance(metadata_payload, dict):
557
- metadata_payload = {}
558
- metadata_payload["source_url"] = page.url
559
- if page.title and "topic" not in metadata_payload:
560
- metadata_payload["topic"] = page.title
561
-
562
- tags = card_dict.get("tags", [])
563
- if not isinstance(tags, list) or not all(
564
- isinstance(t, str) for t in tags
565
- ):
566
- tags = []
567
-
568
- if tags:
569
- metadata_payload["tags"] = tags
570
-
571
- card_obj = Card(
572
- front=CardFront(question=str(front_data["question"])),
573
- back=CardBack(
574
- answer=str(back_data["answer"]),
575
- explanation=str(back_data.get("explanation", "")),
576
- example=str(back_data.get("example", "")),
577
- ),
578
- card_type=str(card_dict.get("card_type", "Basic")),
579
- metadata=metadata_payload,
580
- )
581
- validated_cards.append(card_obj)
582
- except Exception as e:
583
- logger.error(
584
- f"Error creating Card object for {page.url} from dict: {card_dict}. Error: {e}",
585
- exc_info=True,
586
- )
587
-
588
- if not validated_cards:
589
- logger.info(
590
- f"No valid Cards generated or parsed from {page.url} after LLM processing."
591
- )
592
- else:
593
- logger.info(
594
- f"Successfully generated {len(validated_cards)} Cards from {page.url}."
595
- )
596
- # Cache successful results for page-level caching
597
- if cache:
598
- cache_key = f"{page.url}:{model}"
599
- cache.set(cache_key, "page_cache", validated_cards)
600
- logger.debug(f"Cached {len(validated_cards)} cards for {page.url}")
601
-
602
- return validated_cards
603
-
604
- except Exception as e:
605
- logger.error(
606
- f"Error processing page {page.url} with agents SDK: {e}", exc_info=True
607
- )
608
- return []
609
-
610
-
611
- async def process_crawled_pages(
612
- openai_client: AsyncOpenAI,
613
- pages: List[CrawledPage],
614
- model: str = "gpt-4o",
615
- max_prompt_content_tokens: int = 6000,
616
- max_concurrent_requests: int = 5,
617
- custom_system_prompt: Optional[str] = None,
618
- custom_user_prompt_template: Optional[str] = None,
619
- progress_callback: Optional[Callable[[int, int], None]] = None,
620
- cache: Optional[ResponseCache] = None,
621
- ) -> List[Card]:
622
- if not pages:
623
- logger.info("No pages provided to process_crawled_pages.")
624
- return []
625
-
626
- logger.info(
627
- f"Starting batch processing of {len(pages)} pages with model {model}. Max concurrent requests: {max_concurrent_requests}."
628
- )
629
-
630
- semaphore = asyncio.Semaphore(max_concurrent_requests)
631
- tasks = []
632
- processed_count = 0
633
-
634
- async def process_with_semaphore(page: CrawledPage):
635
- nonlocal processed_count
636
- async with semaphore:
637
- logger.debug(
638
- f"Submitting task for page: {page.url} (Semaphore count: {semaphore._value})"
639
- )
640
- try:
641
- page_cards = await process_crawled_page(
642
- openai_client=openai_client,
643
- page=page,
644
- model=model,
645
- custom_system_prompt=custom_system_prompt,
646
- custom_user_prompt_template=custom_user_prompt_template,
647
- max_prompt_content_tokens=max_prompt_content_tokens,
648
- cache=cache,
649
- )
650
- if page_cards is None:
651
- logger.warning(
652
- f"process_crawled_page returned None for {page.url}, expected list. Defaulting to empty list."
653
- )
654
- page_cards = []
655
-
656
- logger.info(
657
- f"Completed processing for page: {page.url}. Generated {len(page_cards)} cards."
658
- )
659
- return page_cards
660
- except Exception as e:
661
- logger.error(
662
- f"Error in process_with_semaphore for page {page.url}: {e}",
663
- exc_info=True,
664
- )
665
- return []
666
- finally:
667
- processed_count += 1
668
- if progress_callback:
669
- progress_callback(processed_count, len(pages))
670
-
671
- for page_to_process in pages:
672
- tasks.append(asyncio.create_task(process_with_semaphore(page_to_process)))
673
-
674
- results_from_tasks: List[List[Card]] = []
675
- for i, future in enumerate(asyncio.as_completed(tasks)):
676
- try:
677
- result_list = await future
678
- if result_list:
679
- results_from_tasks.append(result_list)
680
- except Exception as e:
681
- logger.error(
682
- f"Unhandled error gathering result for a page task: {e}", exc_info=True
683
- )
684
-
685
- all_cards: List[Card] = []
686
- for card_list in results_from_tasks:
687
- all_cards.extend(card_list)
688
-
689
- logger.info(
690
- f"Finished processing all {len(pages)} pages. Generated {len(all_cards)} Cards in total."
691
- )
692
- return all_cards
 
2
 
3
  import asyncio
4
  import time
5
+ from typing import Optional, TypeVar
6
 
 
7
  from agents import Agent, ModelSettings, Runner, set_default_openai_client
8
  from openai import (
9
  APIConnectionError,
 
13
  RateLimitError,
14
  )
15
  from pydantic import BaseModel
 
 
 
 
 
 
16
 
17
  from ankigen_core.logging import logger
 
18
  from ankigen_core.utils import ResponseCache
19
 
20
  T = TypeVar("T", bound=BaseModel)
 
335
  # This assumes a single rate limit bucket for all calls from this application instance.
336
  # More sophisticated scenarios might need per-model or per-key limiters.
337
  openai_rate_limiter = OpenAIRateLimiter() # Using default 60k TPM for now
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ankigen_core/models.py CHANGED
@@ -1,4 +1,4 @@
1
- from pydantic import BaseModel, Field
2
  from typing import List, Optional
3
 
4
  # Module for Pydantic data models
@@ -60,14 +60,3 @@ class LearningSequence(BaseModel):
60
  cards: List[CardGeneration]
61
  suggested_study_order: List[str]
62
  review_recommendations: List[str]
63
-
64
-
65
- class CrawledPage(BaseModel):
66
- url: str
67
- html_content: str
68
- text_content: str
69
- title: Optional[str] = None
70
- meta_description: Optional[str] = None
71
- meta_keywords: Optional[List[str]] = Field(default_factory=list)
72
- crawl_depth: int = 0
73
- parent_url: Optional[str] = None
 
1
+ from pydantic import BaseModel
2
  from typing import List, Optional
3
 
4
  # Module for Pydantic data models
 
60
  cards: List[CardGeneration]
61
  suggested_study_order: List[str]
62
  review_recommendations: List[str]
 
 
 
 
 
 
 
 
 
 
 
ankigen_core/ui_logic.py CHANGED
@@ -2,68 +2,20 @@
2
 
3
  import gradio as gr
4
  import pandas as pd
5
- from typing import (
6
- Callable,
7
- List,
8
- Optional,
9
- Tuple,
10
- )
11
- from urllib.parse import urlparse
12
 
13
- # --- Imports moved from later in the file (Task 7, etc.) ---
14
- import re # For URL validation and filename sanitization
15
- import asyncio
16
-
17
- from ankigen_core.crawler import CrawledPage, WebCrawler
18
- from ankigen_core.llm_interface import (
19
- OpenAIClientManager,
20
- )
21
- from ankigen_core.card_generator import (
22
- generate_cards_from_crawled_content,
23
- AVAILABLE_MODELS,
24
- )
25
  from ankigen_core.utils import get_logger
 
26
 
27
- # Only import models that are actually used in this file
28
- from ankigen_core.models import (
29
- Card,
30
- # ModelSettings, # Removed
31
- # LearningPathInput, # Removed
32
- # LearningPath, # Removed
33
- # GeneratedPath, # Removed
34
- # SubjectAnalysis, # Removed
35
- # SubjectCardRequest, # Removed
36
- # TextCardRequest, # Removed
37
- # LearningPathRequest, # Removed
38
- )
39
-
40
- # Import agent system for web crawling
41
- # Agent system is required for web crawling
42
- from ankigen_core.agents.integration import AgentOrchestrator
43
-
44
- AGENTS_AVAILABLE_UI = True
45
- # --- End moved imports ---
46
 
47
- # Get an instance of the logger for this module
48
- crawler_ui_logger = get_logger() # Keep this definition
49
 
 
 
50
 
51
- def update_mode_visibility(
52
- mode: str,
53
- current_subject: str,
54
- current_text: str,
55
- current_url: str,
56
- ):
57
- """Updates visibility and values of UI elements based on generation mode."""
58
- is_subject = mode == "subject"
59
- is_text = mode == "text"
60
- is_web = mode == "web"
61
-
62
- # Determine value persistence or clearing
63
- subject_val = current_subject if is_subject else ""
64
- text_val = current_text if is_text else ""
65
- url_val = current_url if is_web else ""
66
-
67
  # Define standard columns for empty DataFrames
68
  main_output_df_columns = [
69
  "Index",
@@ -79,420 +31,20 @@ def update_mode_visibility(
79
  ]
80
 
81
  return (
82
- gr.update(visible=is_subject), # 1 subject_mode (Group)
83
- gr.update(visible=is_text), # 2 text_mode (Group)
84
- gr.update(visible=is_web), # 3 web_mode (Group for crawler UI)
85
- gr.update(visible=True), # 4 cards_output (always visible now)
86
- gr.update(value=subject_val), # 5 subject
87
- gr.update(value=text_val), # 6 source_text
88
- gr.update(value=url_val), # 7 web_crawl_url_input
89
  gr.update(
90
  value=pd.DataFrame(columns=main_output_df_columns)
91
- ), # 8 output (DataFrame)
92
  gr.update(
93
  value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
94
  visible=False,
95
- ), # 9 total_cards_html
96
- )
97
-
98
-
99
- def create_crawler_main_mode_elements() -> Tuple[
100
- List[gr.components.Component], # ui_components (url_input, max_depth, etc.)
101
- gr.Button, # crawl_button
102
- gr.Progress, # progress_bar
103
- gr.Textbox, # progress_status_textbox
104
- gr.Textbox, # custom_system_prompt
105
- gr.Textbox, # custom_user_prompt_template
106
- gr.Checkbox, # use_sitemap_checkbox
107
- gr.Textbox, # sitemap_url_textbox
108
- ]:
109
- """Creates the UI components for the Web Crawler mode integrated into the main tab."""
110
- ui_components: List[gr.components.Component] = []
111
-
112
- # URL Input
113
- url_input = gr.Textbox(
114
- label="Start URL",
115
- placeholder="Enter the full URL to start crawling (e.g., https://example.com/docs)",
116
- elem_id="crawler_url_input",
117
- )
118
- ui_components.append(url_input)
119
-
120
- with gr.Row():
121
- max_depth_slider = gr.Slider(
122
- minimum=0,
123
- maximum=5,
124
- value=1,
125
- step=1,
126
- label="Max Crawl Depth",
127
- elem_id="crawler_max_depth_slider",
128
- )
129
- ui_components.append(max_depth_slider)
130
-
131
- crawler_req_per_sec_slider = gr.Slider(
132
- minimum=0.1,
133
- maximum=10,
134
- value=2,
135
- step=0.1,
136
- label="Requests per Second (Crawler)",
137
- elem_id="crawler_req_per_sec_slider",
138
- )
139
- ui_components.append(crawler_req_per_sec_slider)
140
-
141
- model_choices_ui_crawler = [(m["label"], m["value"]) for m in AVAILABLE_MODELS]
142
- default_model_value_crawler = next(
143
- (m["value"] for m in AVAILABLE_MODELS if "nano" in m["value"].lower()),
144
- AVAILABLE_MODELS[0]["value"] if AVAILABLE_MODELS else "",
145
- )
146
- model_dropdown = gr.Dropdown(
147
- choices=model_choices_ui_crawler,
148
- label="AI Model for Content Processing", # Clarified label
149
- value=default_model_value_crawler,
150
- elem_id="crawler_model_dropdown",
151
- allow_custom_value=True,
152
  )
153
- ui_components.append(model_dropdown)
154
-
155
- with gr.Row():
156
- include_patterns_textbox = gr.Textbox(
157
- label="Include URL Patterns (one per line, regex compatible)",
158
- placeholder="""e.g., /blog/.*
159
- example.com/articles/.*""",
160
- lines=3,
161
- elem_id="crawler_include_patterns",
162
- scale=1,
163
- )
164
- ui_components.append(include_patterns_textbox)
165
-
166
- exclude_patterns_textbox = gr.Textbox(
167
- label="Exclude URL Patterns (one per line, regex compatible)",
168
- placeholder="""e.g., /category/.*
169
- .*/login""",
170
- lines=3,
171
- elem_id="crawler_exclude_patterns",
172
- scale=1,
173
- )
174
- ui_components.append(exclude_patterns_textbox)
175
-
176
- with gr.Accordion(
177
- "Sitemap Options", open=False, elem_id="crawler_sitemap_options_accordion"
178
- ):
179
- use_sitemap_checkbox = gr.Checkbox(
180
- label="Use Sitemap?",
181
- value=False,
182
- elem_id="crawler_use_sitemap_checkbox",
183
- )
184
- # ui_components.append(use_sitemap_checkbox) # Appended later with its group
185
-
186
- sitemap_url_textbox = gr.Textbox(
187
- label="Sitemap URL (e.g., /sitemap.xml or full URL)",
188
- placeholder="Enter sitemap URL relative to start URL or full path",
189
- visible=False,
190
- elem_id="crawler_sitemap_url_textbox",
191
- )
192
- # ui_components.append(sitemap_url_textbox) # Appended later with its group
193
-
194
- use_sitemap_checkbox.change(
195
- fn=lambda x: gr.update(visible=x),
196
- inputs=[use_sitemap_checkbox],
197
- outputs=[sitemap_url_textbox],
198
- )
199
- # Add sitemap components to the main list for return
200
- # sitemap_elements_for_return = [use_sitemap_checkbox, sitemap_url_textbox] # Unused variable
201
-
202
- with gr.Accordion(
203
- "Advanced Prompt Options",
204
- open=False,
205
- elem_id="crawler_advanced_options_accordion",
206
- ): # Removed assignment to advanced_options_accordion_component
207
- custom_system_prompt = gr.Textbox(
208
- label="Custom System Prompt (Optional)",
209
- placeholder="Leave empty to use the default system prompt for card generation.",
210
- lines=5,
211
- info="Define the overall role and instructions for the AI.",
212
- elem_id="crawler_custom_system_prompt",
213
- )
214
- # ui_components.append(custom_system_prompt) # Appended later
215
-
216
- custom_user_prompt_template = gr.Textbox(
217
- label="Custom User Prompt Template (Optional)",
218
- placeholder="Leave empty to use default. Available placeholders: {url}, {content}",
219
- lines=5,
220
- info="Define how the page URL and content are presented to the AI.",
221
- elem_id="crawler_custom_user_prompt_template",
222
- )
223
- # ui_components.append(custom_user_prompt_template) # Appended later
224
- # Add prompt components to the main list for return
225
- # prompt_elements_for_return = [custom_system_prompt, custom_user_prompt_template] # Unused variable
226
-
227
- # Crawl button (will trigger crawl_and_generate, results populate main DataFrame)
228
- crawl_button = gr.Button(
229
- "Crawl Content & Prepare Cards", # Changed button text
230
- variant="secondary", # Differentiate from main generate button
231
- elem_id="crawler_crawl_content_button",
232
- )
233
- # ui_components.append(crawl_button) # Returned separately
234
-
235
- # Progress bar and status for the crawling process
236
- progress_bar = (
237
- gr.Progress()
238
- ) # Removed elem_id as gr.Progress might not support it directly
239
- progress_status_textbox = gr.Textbox(
240
- label="Crawl Status",
241
- interactive=False,
242
- lines=3, # Reduced lines
243
- placeholder="Crawling process status will appear here...",
244
- elem_id="crawler_status_textbox",
245
- )
246
- # ui_components.append(progress_status_textbox) # Returned separately
247
-
248
- # REMOVED UI elements:
249
- # - export_format_radio (no longer needed here)
250
- # - All preview related: preview_row_component, preview_dataframe_component, update_cards_button_component
251
- # - All preview export related: export_format_preview_component, deck_name_preview_component, export_button_preview_component
252
- # - All direct file download related: download_row_group, generated_file_output, download_button
253
-
254
- # The main ui_components list should contain all elements whose values are needed as inputs to the crawl/generation
255
- # or whose visibility might be managed together.
256
- # For clarity, specific components like buttons or progress bars are returned separately if they have specific event handlers
257
- # or are managed distinctly.
258
-
259
- # Add all input fields to ui_components for easier management if needed, or return them individually.
260
- # For now, returning them grouped for clarity.
261
-
262
- return (
263
- ui_components,
264
- crawl_button,
265
- progress_bar,
266
- progress_status_textbox,
267
- custom_system_prompt,
268
- custom_user_prompt_template,
269
- use_sitemap_checkbox,
270
- sitemap_url_textbox,
271
- )
272
-
273
-
274
- # --- Crawl and Generate Logic (Task 7) ---
275
-
276
- # MODIFIED: Get model values from AVAILABLE_MODELS for validation
277
- CRAWLER_AVAILABLE_MODELS_VALUES = [m["value"] for m in AVAILABLE_MODELS]
278
-
279
-
280
- def _basic_sanitize_filename(name: str) -> str:
281
- """Basic filename sanitization by replacing non-alphanumeric characters with underscores."""
282
- return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)
283
 
284
 
285
- def _validate_crawl_url(url: str) -> bool:
286
- """Validate URL for crawling."""
287
- if not url or not url.startswith(("http://", "https://")):
288
- gr.Warning("Invalid URL provided. Please enter a valid http/https URL.")
289
- return False
290
- try:
291
- urlparse(url)
292
- return True
293
- except Exception:
294
- return False
295
-
296
-
297
- def _create_web_crawler(
298
- url: str,
299
- max_depth: int,
300
- include_patterns: str,
301
- exclude_patterns: str,
302
- use_sitemap: bool,
303
- sitemap_url_str: str,
304
- ) -> WebCrawler:
305
- """Create configured WebCrawler instance."""
306
- include_list = [p.strip() for p in include_patterns.split(",") if p.strip()]
307
- exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()]
308
-
309
- return WebCrawler(
310
- start_url=url,
311
- max_depth=max_depth,
312
- include_patterns=include_list,
313
- exclude_patterns=exclude_list,
314
- use_sitemap=use_sitemap,
315
- sitemap_url=sitemap_url_str
316
- if use_sitemap and sitemap_url_str.strip()
317
- else None,
318
- )
319
-
320
-
321
- def _create_crawl_progress_callback(
322
- progress: gr.Progress,
323
- ) -> Tuple[Callable[[int, int, str], None], List[int]]:
324
- """Create progress callback for crawler with mutable state container."""
325
- total_urls_container = [0] # Mutable container for nonlocal-like behavior
326
-
327
- def callback(processed_count: int, total_urls: int, current_url: str):
328
- total_urls_container[0] = total_urls
329
- if total_urls_container[0] > 0:
330
- progress(
331
- 0.1 + (processed_count / total_urls_container[0]) * 0.4,
332
- desc=f"Crawling: {processed_count}/{total_urls_container[0]} URLs. Current: {current_url}",
333
- )
334
- else:
335
- progress(
336
- 0.1 + processed_count * 0.01,
337
- desc=f"Crawling: {processed_count} URLs discovered. Current: {current_url}",
338
- )
339
-
340
- return callback, total_urls_container
341
-
342
-
343
- async def _perform_web_crawl(
344
- crawler: WebCrawler,
345
- progress: gr.Progress,
346
- url: str,
347
- ) -> Optional[List[CrawledPage]]:
348
- """Execute web crawl and return pages or None if empty."""
349
- callback, _ = _create_crawl_progress_callback(progress)
350
-
351
- crawler_ui_logger.info(f"Starting crawl for {url}...")
352
- progress(0.15, desc=f"Starting crawl for {url}...")
353
-
354
- crawled_pages = await asyncio.to_thread(crawler.crawl, progress_callback=callback)
355
-
356
- crawler_ui_logger.info(f"Crawling finished. Found {len(crawled_pages)} pages.")
357
- progress(0.5, desc=f"Crawling finished. Found {len(crawled_pages)} pages.")
358
-
359
- return crawled_pages if crawled_pages else None
360
-
361
-
362
- async def _process_crawled_with_agents(
363
- crawled_pages: List[CrawledPage],
364
- client_manager: OpenAIClientManager,
365
- url: str,
366
- progress: gr.Progress,
367
- ) -> Tuple[List[Card], str]:
368
- """Process crawled content with agent system."""
369
- crawler_ui_logger.info("Using agent system for web crawling card generation")
370
-
371
- orchestrator = AgentOrchestrator(client_manager)
372
- # API key is already configured in client_manager, pass empty string as placeholder
373
- await orchestrator.initialize("")
374
-
375
- combined_content = "\n\n--- PAGE BREAK ---\n\n".join(
376
- [
377
- f"URL: {page.url}\nTitle: {page.title}\nContent: {page.text_content[:2000]}..."
378
- for page in crawled_pages[:10]
379
- ]
380
- )
381
-
382
- context = {
383
- "source_text": combined_content,
384
- "crawl_source": url,
385
- "pages_crawled": len(crawled_pages),
386
- }
387
-
388
- progress(0.6, desc="Processing with agent system...")
389
-
390
- agent_cards, _ = await orchestrator.generate_cards_with_agents(
391
- topic=f"Content from {url}",
392
- subject="web_content",
393
- num_cards=min(len(crawled_pages) * 3, 50),
394
- difficulty="intermediate",
395
- enable_quality_pipeline=True,
396
- context=context,
397
- )
398
-
399
- if agent_cards:
400
- progress(0.9, desc=f"Agent system generated {len(agent_cards)} cards")
401
- final_message = (
402
- f"Agent system processed content from {len(crawled_pages)} pages. "
403
- f"Generated {len(agent_cards)} high-quality cards."
404
- )
405
- else:
406
- final_message = "Agent system returned no cards"
407
-
408
- return agent_cards or [], final_message
409
-
410
-
411
- async def crawl_and_generate(
412
- url: str,
413
- max_depth: int,
414
- crawler_requests_per_second: float,
415
- include_patterns: str,
416
- exclude_patterns: str,
417
- model: str,
418
- export_format_ui: str,
419
- custom_system_prompt: str,
420
- custom_user_prompt_template: str,
421
- use_sitemap: bool,
422
- sitemap_url_str: str,
423
- client_manager: OpenAIClientManager,
424
- progress: gr.Progress,
425
- status_textbox: gr.Textbox,
426
- ) -> Tuple[str, List[dict], List[Card]]:
427
- """Crawls a website, generates Anki cards, and prepares them for export/display."""
428
- crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
429
-
430
- if not _validate_crawl_url(url):
431
- return "Invalid URL", [], []
432
-
433
- try:
434
- crawler = _create_web_crawler(
435
- url,
436
- max_depth,
437
- include_patterns,
438
- exclude_patterns,
439
- use_sitemap,
440
- sitemap_url_str,
441
- )
442
-
443
- crawled_pages = await _perform_web_crawl(crawler, progress, url)
444
- if not crawled_pages:
445
- progress(1.0, desc="No pages were crawled. Check URL and patterns.")
446
- return (
447
- "No pages were crawled. Check URL and patterns.",
448
- pd.DataFrame().to_dict(orient="records"),
449
- [],
450
- )
451
-
452
- agent_cards, final_message = await _process_crawled_with_agents(
453
- crawled_pages,
454
- client_manager,
455
- url,
456
- progress,
457
- )
458
-
459
- if agent_cards:
460
- cards_for_dataframe_export = generate_cards_from_crawled_content(
461
- agent_cards
462
- )
463
- progress(1.0, desc=final_message)
464
- return final_message, cards_for_dataframe_export, agent_cards
465
- else:
466
- progress(1.0, desc=final_message)
467
- return final_message, pd.DataFrame().to_dict(orient="records"), []
468
-
469
- except ConnectionError as e:
470
- crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
471
- progress(1.0, desc=f"Connection error: {e}")
472
- return f"Connection error: {e}", pd.DataFrame().to_dict(orient="records"), []
473
- except ValueError as e:
474
- crawler_ui_logger.error(f"Value error: {e}", exc_info=True)
475
- progress(1.0, desc=f"Input error: {e}")
476
- return f"Input error: {e}", pd.DataFrame().to_dict(orient="records"), []
477
- except RuntimeError as e: # Catch RuntimeError from client_manager.get_client()
478
- crawler_ui_logger.error(
479
- f"Runtime error (e.g., OpenAI client not init): {e}", exc_info=True
480
- )
481
- progress(1.0, desc=f"Runtime error: {e}")
482
- return f"Runtime error: {e}", pd.DataFrame().to_dict(orient="records"), []
483
- except Exception as e:
484
- crawler_ui_logger.error(
485
- f"Unexpected error in crawl_and_generate: {e}", exc_info=True
486
- )
487
- progress(1.0, desc=f"Unexpected error: {e}")
488
- return (
489
- f"An unexpected error occurred: {e}",
490
- pd.DataFrame().to_dict(orient="records"),
491
- [],
492
- )
493
-
494
-
495
- # --- Card Preview and Editing Utilities (Task 13.3) ---
496
 
497
 
498
  def cards_to_dataframe(cards: List[Card]) -> pd.DataFrame:
@@ -509,16 +61,16 @@ def cards_to_dataframe(cards: List[Card]) -> pd.DataFrame:
509
  data_for_df.append(
510
  {
511
  "ID": i + 1, # 1-indexed ID for display
512
- "Topic": topic_str, # Added Topic
513
  "Front": card.front.question,
514
  "Back": card.back.answer,
515
  "Tags": tags_str,
516
- "Card Type": card.card_type or "Basic", # Mapped from note_type
517
- "Explanation": card.back.explanation or "", # Added Explanation
518
- "Example": card.back.example or "", # Added Example
519
  "Source_URL": card.metadata.get("source_url", "")
520
  if card.metadata
521
- else "", # Added Source URL
522
  }
523
  )
524
  # Define all columns explicitly for consistent DataFrame structure
@@ -546,7 +98,7 @@ def dataframe_to_cards(df: pd.DataFrame, original_cards: List[Card]) -> List[Car
546
  if df.empty and not original_cards:
547
  return []
548
  if df.empty and original_cards:
549
- return [] # Or original_cards if no change is intended on empty df
550
 
551
  for index, row in df.iterrows():
552
  try:
@@ -556,8 +108,6 @@ def dataframe_to_cards(df: pd.DataFrame, original_cards: List[Card]) -> List[Car
556
  if 0 <= original_card_index < len(original_cards):
557
  card_to_update = original_cards[original_card_index]
558
 
559
- # Create new CardFront and CardBack objects for immutability if preferred,
560
- # or update existing ones since Pydantic models are mutable.
561
  new_front = card_to_update.front.copy(
562
  update={
563
  "question": str(row.get("Front", card_to_update.front.question))
@@ -592,7 +142,6 @@ def dataframe_to_cards(df: pd.DataFrame, original_cards: List[Card]) -> List[Car
592
  new_metadata["topic"] = str(
593
  row.get("Topic", new_metadata.get("topic", "N/A"))
594
  )
595
- # Source URL is generally not editable from this simple table
596
 
597
  updated_card = card_to_update.copy(
598
  update={
@@ -606,16 +155,14 @@ def dataframe_to_cards(df: pd.DataFrame, original_cards: List[Card]) -> List[Car
606
  )
607
  updated_cards.append(updated_card)
608
  else:
609
- crawler_ui_logger.warning(
610
  f"Card ID {card_id} from DataFrame is out of bounds for original_cards list."
611
  )
612
  except (ValueError, KeyError, AttributeError) as e:
613
- crawler_ui_logger.error(
614
  f"Error processing row {index} from DataFrame: {row}. Error: {e}"
615
  )
616
  if 0 <= original_card_index < len(original_cards):
617
- updated_cards.append(
618
- original_cards[original_card_index]
619
- ) # Re-add original on error
620
  continue
621
  return updated_cards
 
2
 
3
  import gradio as gr
4
  import pandas as pd
5
+ from typing import List
 
 
 
 
 
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from ankigen_core.utils import get_logger
8
+ from ankigen_core.models import Card
9
 
10
+ logger = get_logger()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
 
 
12
 
13
+ def update_mode_visibility(mode: str, current_subject: str):
14
+ """Updates visibility and values of UI elements based on generation mode.
15
 
16
+ Currently only 'subject' mode is supported. This function is kept for
17
+ future extensibility.
18
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # Define standard columns for empty DataFrames
20
  main_output_df_columns = [
21
  "Index",
 
31
  ]
32
 
33
  return (
34
+ gr.update(visible=True), # subject_mode (Group) - always visible
35
+ gr.update(visible=True), # cards_output - always visible
36
+ gr.update(value=current_subject), # subject textbox value
 
 
 
 
37
  gr.update(
38
  value=pd.DataFrame(columns=main_output_df_columns)
39
+ ), # output DataFrame
40
  gr.update(
41
  value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
42
  visible=False,
43
+ ), # total_cards_html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
+ # --- Card Preview and Editing Utilities ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
  def cards_to_dataframe(cards: List[Card]) -> pd.DataFrame:
 
61
  data_for_df.append(
62
  {
63
  "ID": i + 1, # 1-indexed ID for display
64
+ "Topic": topic_str,
65
  "Front": card.front.question,
66
  "Back": card.back.answer,
67
  "Tags": tags_str,
68
+ "Card Type": card.card_type or "Basic",
69
+ "Explanation": card.back.explanation or "",
70
+ "Example": card.back.example or "",
71
  "Source_URL": card.metadata.get("source_url", "")
72
  if card.metadata
73
+ else "",
74
  }
75
  )
76
  # Define all columns explicitly for consistent DataFrame structure
 
98
  if df.empty and not original_cards:
99
  return []
100
  if df.empty and original_cards:
101
+ return []
102
 
103
  for index, row in df.iterrows():
104
  try:
 
108
  if 0 <= original_card_index < len(original_cards):
109
  card_to_update = original_cards[original_card_index]
110
 
 
 
111
  new_front = card_to_update.front.copy(
112
  update={
113
  "question": str(row.get("Front", card_to_update.front.question))
 
142
  new_metadata["topic"] = str(
143
  row.get("Topic", new_metadata.get("topic", "N/A"))
144
  )
 
145
 
146
  updated_card = card_to_update.copy(
147
  update={
 
155
  )
156
  updated_cards.append(updated_card)
157
  else:
158
+ logger.warning(
159
  f"Card ID {card_id} from DataFrame is out of bounds for original_cards list."
160
  )
161
  except (ValueError, KeyError, AttributeError) as e:
162
+ logger.error(
163
  f"Error processing row {index} from DataFrame: {row}. Error: {e}"
164
  )
165
  if 0 <= original_card_index < len(original_cards):
166
+ updated_cards.append(original_cards[original_card_index])
 
 
167
  continue
168
  return updated_cards
app.py CHANGED
@@ -18,11 +18,7 @@ from ankigen_core.exporters import (
18
  from ankigen_core.llm_interface import (
19
  OpenAIClientManager,
20
  ) # structured_output_completion is internal to core modules
21
- from ankigen_core.ui_logic import (
22
- crawl_and_generate,
23
- create_crawler_main_mode_elements,
24
- update_mode_visibility,
25
- )
26
  from ankigen_core.utils import (
27
  ResponseCache,
28
  get_logger,
@@ -159,13 +155,11 @@ def create_ankigen_interface():
159
  generation_mode = gr.Radio(
160
  choices=[
161
  ("Single Subject", "subject"),
162
- ("Learning Path", "path"),
163
- ("From Text", "text"),
164
- ("From Web", "web"),
165
  ],
166
  value="subject",
167
  label="Generation Mode",
168
  info="Choose how you want to generate content",
 
169
  )
170
  with gr.Group() as subject_mode:
171
  subject = gr.Textbox(
@@ -176,41 +170,6 @@ def create_ankigen_interface():
176
  "Auto-fill",
177
  variant="secondary",
178
  )
179
- with gr.Group(visible=False) as text_mode:
180
- source_text = gr.Textbox(
181
- label="Source Text",
182
- placeholder="Paste text here...",
183
- lines=15,
184
- )
185
- with gr.Group(visible=False) as web_mode:
186
- # --- BEGIN INTEGRATED CRAWLER UI (Task 16) ---
187
- logger.info(
188
- "Setting up integrated Web Crawler UI elements...",
189
- )
190
- (
191
- crawler_input_ui_elements, # List of inputs like URL, depth, model, patterns
192
- web_crawl_button, # Specific button to trigger crawl
193
- web_crawl_progress_bar,
194
- web_crawl_status_textbox,
195
- web_crawl_custom_system_prompt,
196
- web_crawl_custom_user_prompt_template,
197
- web_crawl_use_sitemap_checkbox,
198
- web_crawl_sitemap_url_textbox,
199
- ) = create_crawler_main_mode_elements()
200
-
201
- # Unpack crawler_input_ui_elements for clarity and use
202
- web_crawl_url_input = crawler_input_ui_elements[0]
203
- web_crawl_max_depth_slider = crawler_input_ui_elements[1]
204
- web_crawl_req_per_sec_slider = crawler_input_ui_elements[2]
205
- web_crawl_model_dropdown = crawler_input_ui_elements[3]
206
- web_crawl_include_patterns_textbox = (
207
- crawler_input_ui_elements[4]
208
- )
209
- web_crawl_exclude_patterns_textbox = (
210
- crawler_input_ui_elements[5]
211
- )
212
- # --- END INTEGRATED CRAWLER UI ---
213
-
214
  api_key_input = gr.Textbox(
215
  label="OpenAI API Key",
216
  type="password",
@@ -364,29 +323,21 @@ def create_ankigen_interface():
364
  inputs=[
365
  generation_mode,
366
  subject,
367
- source_text,
368
- web_crawl_url_input,
369
  ],
370
  outputs=[
371
  subject_mode,
372
- text_mode,
373
- web_mode,
374
  cards_output,
375
  subject,
376
- source_text,
377
- web_crawl_url_input,
378
  output,
379
  total_cards_html,
380
  ],
381
  )
382
 
383
- # Define an async wrapper for the orchestrate_card_generation partial
384
  async def handle_generate_click(
385
  api_key_input_val,
386
  subject_val,
387
  generation_mode_val,
388
- source_text_val,
389
- url_input_val,
390
  model_choice_val,
391
  topic_number_val,
392
  cards_per_topic_val,
@@ -394,20 +345,16 @@ def create_ankigen_interface():
394
  generate_cloze_checkbox_val,
395
  library_name_val,
396
  library_topic_val,
397
- progress=gr.Progress(track_tqdm=True), # Added progress tracker
398
  ):
399
- # Recreate the partial function call, but now it can be awaited
400
- # The actual orchestrate_card_generation is already partially applied with client_manager and response_cache
401
- # So, we need to get that specific partial object if it's stored, or redefine the partial logic here.
402
- # For simplicity and clarity, let's assume direct call to orchestrate_card_generation directly here
403
  return await orchestrate_card_generation(
404
- client_manager, # from global scope
405
- response_cache, # from global scope
406
  api_key_input_val,
407
  subject_val,
408
  generation_mode_val,
409
- source_text_val,
410
- url_input_val,
411
  model_choice_val,
412
  topic_number_val,
413
  cards_per_topic_val,
@@ -416,16 +363,13 @@ def create_ankigen_interface():
416
  library_name=library_name_val if library_name_val else None,
417
  library_topic=library_topic_val if library_topic_val else None,
418
  )
419
- # Expect 3-tuple return (dataframe, total_cards_html, token_usage_html)
420
 
421
  generate_button.click(
422
- fn=handle_generate_click, # MODIFIED: Use the new async handler
423
  inputs=[
424
  api_key_input,
425
  subject,
426
  generation_mode,
427
- source_text,
428
- web_crawl_url_input,
429
  model_choice,
430
  topic_number,
431
  cards_per_topic,
@@ -629,150 +573,8 @@ def create_ankigen_interface():
629
  preference_prompt,
630
  generate_cloze_checkbox,
631
  model_choice,
632
- library_accordion, # Reference to the accordion component
633
- ],
634
- )
635
-
636
- async def handle_web_crawl_click(
637
- api_key_val: str,
638
- url: str,
639
- max_depth: int,
640
- req_per_sec: float,
641
- model: str, # This is the model for LLM processing of crawled content
642
- include_patterns: str,
643
- exclude_patterns: str,
644
- custom_system_prompt: str,
645
- custom_user_prompt_template: str,
646
- use_sitemap: bool,
647
- sitemap_url: str,
648
- progress=gr.Progress(track_tqdm=True),
649
- ):
650
- progress(0, desc="Initializing web crawl...")
651
- yield {
652
- web_crawl_status_textbox: gr.update(
653
- value="Initializing web crawl...",
654
- ),
655
- output: gr.update(value=None), # Clear main output table
656
- total_cards_html: gr.update(
657
- visible=False,
658
- value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
659
- ),
660
- }
661
-
662
- if not api_key_val:
663
- logger.error("API Key is missing for web crawler operation.")
664
- yield {
665
- web_crawl_status_textbox: gr.update(
666
- value="Error: OpenAI API Key is required.",
667
- ),
668
- }
669
- return
670
- try:
671
- await client_manager.initialize_client(api_key_val)
672
- except Exception as e:
673
- logger.error(
674
- f"Failed to initialize OpenAI client for crawler: {e}",
675
- exc_info=True,
676
- )
677
- yield {
678
- web_crawl_status_textbox: gr.update(
679
- value=f"Error: Client init failed: {e!s}",
680
- ),
681
- }
682
- return
683
-
684
- message, cards_list_of_dicts, _ = await crawl_and_generate(
685
- url=url,
686
- max_depth=max_depth,
687
- crawler_requests_per_second=req_per_sec,
688
- include_patterns=include_patterns,
689
- exclude_patterns=exclude_patterns,
690
- model=model,
691
- export_format_ui="", # No longer used for direct export from crawl_and_generate
692
- custom_system_prompt=custom_system_prompt,
693
- custom_user_prompt_template=custom_user_prompt_template,
694
- use_sitemap=use_sitemap,
695
- sitemap_url_str=sitemap_url,
696
- client_manager=client_manager, # Passed from global scope
697
- progress=progress, # Gradio progress object
698
- status_textbox=web_crawl_status_textbox, # Specific status textbox for crawl
699
- )
700
-
701
- if cards_list_of_dicts:
702
- try:
703
- # Convert List[Dict] to Pandas DataFrame for the main output component
704
- preview_df_value = pd.DataFrame(cards_list_of_dicts)
705
- # Ensure columns match the main output dataframe
706
- # The `generate_cards_from_crawled_content` which produces `cards_list_of_dicts`
707
- # should already format it correctly. If not, mapping is needed here.
708
- # For now, assume it matches the main table structure expected by `gr.Dataframe(value=example_data)`
709
-
710
- # Check if columns match example_data, if not, reorder/rename or log warning
711
- if not preview_df_value.empty:
712
- expected_cols = example_data.columns.tolist()
713
- # Basic check, might need more robust mapping if structures differ significantly
714
- if not all(
715
- col in preview_df_value.columns for col in expected_cols
716
- ):
717
- logger.warning(
718
- "Crawled card data columns mismatch main output, attempting to use available data.",
719
- )
720
- # Potentially select only common columns or reindex if necessary
721
- # For now, we'll pass it as is, Gradio might handle extra/missing cols gracefully or error.
722
-
723
- num_cards = len(preview_df_value)
724
- total_cards_update = f"<div><b>Total Cards Prepared from Crawl:</b> <span id='total-cards-count'>{num_cards}</span></div>"
725
-
726
- yield {
727
- web_crawl_status_textbox: gr.update(value=message),
728
- output: gr.update(value=preview_df_value),
729
- total_cards_html: gr.update(
730
- visible=True,
731
- value=total_cards_update,
732
- ),
733
- }
734
- except Exception as e:
735
- logger.error(
736
- f"Error converting crawled cards to DataFrame: {e}",
737
- exc_info=True,
738
- )
739
- yield {
740
- web_crawl_status_textbox: gr.update(
741
- value=f"{message} (Error displaying cards: {e!s})",
742
- ),
743
- output: gr.update(value=None),
744
- total_cards_html: gr.update(visible=False),
745
- }
746
- else:
747
- yield {
748
- web_crawl_status_textbox: gr.update(
749
- value=message,
750
- ), # Message from crawl_and_generate (e.g. no cards)
751
- output: gr.update(value=None),
752
- total_cards_html: gr.update(visible=False),
753
- }
754
-
755
- web_crawl_button.click(
756
- fn=handle_web_crawl_click,
757
- inputs=[
758
- api_key_input,
759
- web_crawl_url_input,
760
- web_crawl_max_depth_slider,
761
- web_crawl_req_per_sec_slider,
762
- web_crawl_model_dropdown, # Model for LLM processing of content
763
- web_crawl_include_patterns_textbox,
764
- web_crawl_exclude_patterns_textbox,
765
- web_crawl_custom_system_prompt,
766
- web_crawl_custom_user_prompt_template,
767
- web_crawl_use_sitemap_checkbox,
768
- web_crawl_sitemap_url_textbox,
769
- ],
770
- outputs=[
771
- web_crawl_status_textbox, # Specific status for crawl
772
- output, # Main output DataFrame
773
- total_cards_html, # Main total cards display
774
  ],
775
- # Removed progress_bar from outputs as it's handled by gr.Progress(track_tqdm=True)
776
  )
777
 
778
  logger.info("AnkiGen Gradio interface creation complete.")
 
18
  from ankigen_core.llm_interface import (
19
  OpenAIClientManager,
20
  ) # structured_output_completion is internal to core modules
21
+ from ankigen_core.ui_logic import update_mode_visibility
 
 
 
 
22
  from ankigen_core.utils import (
23
  ResponseCache,
24
  get_logger,
 
155
  generation_mode = gr.Radio(
156
  choices=[
157
  ("Single Subject", "subject"),
 
 
 
158
  ],
159
  value="subject",
160
  label="Generation Mode",
161
  info="Choose how you want to generate content",
162
+ visible=False, # Hidden since only one mode exists
163
  )
164
  with gr.Group() as subject_mode:
165
  subject = gr.Textbox(
 
170
  "Auto-fill",
171
  variant="secondary",
172
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  api_key_input = gr.Textbox(
174
  label="OpenAI API Key",
175
  type="password",
 
323
  inputs=[
324
  generation_mode,
325
  subject,
 
 
326
  ],
327
  outputs=[
328
  subject_mode,
 
 
329
  cards_output,
330
  subject,
 
 
331
  output,
332
  total_cards_html,
333
  ],
334
  )
335
 
336
+ # Define an async wrapper for the orchestrate_card_generation
337
  async def handle_generate_click(
338
  api_key_input_val,
339
  subject_val,
340
  generation_mode_val,
 
 
341
  model_choice_val,
342
  topic_number_val,
343
  cards_per_topic_val,
 
345
  generate_cloze_checkbox_val,
346
  library_name_val,
347
  library_topic_val,
348
+ progress=gr.Progress(track_tqdm=True),
349
  ):
 
 
 
 
350
  return await orchestrate_card_generation(
351
+ client_manager,
352
+ response_cache,
353
  api_key_input_val,
354
  subject_val,
355
  generation_mode_val,
356
+ "", # source_text - deprecated
357
+ "", # url_input - deprecated
358
  model_choice_val,
359
  topic_number_val,
360
  cards_per_topic_val,
 
363
  library_name=library_name_val if library_name_val else None,
364
  library_topic=library_topic_val if library_topic_val else None,
365
  )
 
366
 
367
  generate_button.click(
368
+ fn=handle_generate_click,
369
  inputs=[
370
  api_key_input,
371
  subject,
372
  generation_mode,
 
 
373
  model_choice,
374
  topic_number,
375
  cards_per_topic,
 
573
  preference_prompt,
574
  generate_cloze_checkbox,
575
  model_choice,
576
+ library_accordion,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  ],
 
578
  )
579
 
580
  logger.info("AnkiGen Gradio interface creation complete.")