# ankigen / ankigen_core/card_generator.py
# (Uploaded via huggingface_hub by brickfrog — commit 88b029f, verified)
# Module for core card generation logic
from typing import Any, Dict, List, Optional

import gradio as gr
import pandas as pd

# Imports from our core modules
from ankigen_core.utils import (
    get_logger,
    ResponseCache,
    strip_html_tags,
)
from ankigen_core.llm_interface import OpenAIClientManager
from ankigen_core.models import (
    Card,
)  # Import necessary Pydantic models

# Import agent system - required
from ankigen_core.agents.integration import AgentOrchestrator
from agents import set_tracing_disabled
logger = get_logger()
# Disable tracing to prevent metrics persistence issues
set_tracing_disabled(True)
AGENTS_AVAILABLE = True
logger.info("Agent system loaded successfully")
# --- Constants --- (Moved from app.py)
AVAILABLE_MODELS = [
{
"value": "gpt-4.1",
"label": "GPT-4.1 (Best Quality)",
"description": "Highest quality, large context window",
},
{
"value": "gpt-4.1-nano",
"label": "GPT-4.1 Nano (Ultra Fast)",
"description": "Ultra-fast and cost-effective",
},
]
GENERATION_MODES = [
{
"value": "subject",
"label": "Single Subject",
"description": "Generate cards for a specific topic",
},
{
"value": "path",
"label": "Learning Path",
"description": "Break down a job description or learning goal into subjects",
},
{
"value": "text",
"label": "From Text",
"description": "Generate cards from provided text",
},
{
"value": "web",
"label": "From Web",
"description": "Generate cards from a web page URL",
},
]
# --- Core Functions --- (Moved and adapted from app.py)
# Legacy functions removed - all card generation now handled by agent system
async def orchestrate_card_generation( # MODIFIED: Added async
client_manager: OpenAIClientManager, # Expect the manager
cache: ResponseCache, # Expect the cache instance
# --- UI Inputs --- (These will be passed from app.py handler)
api_key_input: str,
subject: str,
generation_mode: str,
source_text: str,
url_input: str,
model_name: str,
topic_number: int,
cards_per_topic: int,
preference_prompt: str,
generate_cloze: bool,
use_llm_judge: bool = False,
library_name: str = None,
library_topic: str = None,
):
"""Orchestrates the card generation process based on UI inputs."""
logger.info(f"Starting card generation orchestration in {generation_mode} mode")
logger.debug(
f"Parameters: mode={generation_mode}, topics={topic_number}, cards_per_topic={cards_per_topic}, cloze={generate_cloze}"
)
# --- AGENT SYSTEM INTEGRATION ---
if AGENTS_AVAILABLE:
logger.info("🤖 Using agent system for card generation")
try:
from ankigen_core.agents.token_tracker import get_token_tracker
token_tracker = get_token_tracker()
orchestrator = AgentOrchestrator(client_manager)
logger.info(f"Using {model_name} for SubjectExpertAgent")
await orchestrator.initialize(api_key_input, {"subject_expert": model_name})
# Map generation mode to subject
agent_subject = "general"
if generation_mode == "subject":
agent_subject = subject if subject else "general"
elif generation_mode == "path":
agent_subject = "curriculum_design"
elif generation_mode == "text":
agent_subject = "content_analysis"
total_cards_needed = topic_number * cards_per_topic
context = {}
if generation_mode == "text" and source_text:
context["source_text"] = source_text
agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
topic=subject if subject else "Mixed Topics",
subject=agent_subject,
num_cards=total_cards_needed,
difficulty="intermediate",
context=context,
library_name=library_name,
library_topic=library_topic,
generate_cloze=generate_cloze,
)
# Get token usage from session
try:
# Try both method names for compatibility
if hasattr(token_tracker, "get_session_summary"):
token_usage = token_tracker.get_session_summary()
elif hasattr(token_tracker, "get_session_usage"):
token_usage = token_tracker.get_session_usage()
else:
raise AttributeError("TokenTracker has no session summary method")
token_usage_html = f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_usage['total_tokens']} tokens</div>"
except Exception as e:
logger.error(f"Token usage collection failed: {e}")
token_usage_html = "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"
# Convert agent cards to dataframe format
if agent_cards:
formatted_cards = format_cards_for_dataframe(
agent_cards,
topic_name=subject if subject else "General",
start_index=1,
)
output_df = pd.DataFrame(
formatted_cards, columns=get_dataframe_columns()
)
total_cards_message = f"<div><b>Cards Generated:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"
logger.info(
f"Agent system generated {len(output_df)} cards successfully"
)
return output_df, total_cards_message, token_usage_html
else:
logger.error("Agent system returned no cards")
gr.Error("🤖 Agent system returned no cards")
return (
pd.DataFrame(columns=get_dataframe_columns()),
"Agent system returned no cards.",
"",
)
except Exception as e:
logger.error(f"Agent system failed: {e}")
gr.Error(f"🤖 Agent system error: {str(e)}")
return (
pd.DataFrame(columns=get_dataframe_columns()),
f"Agent system error: {str(e)}",
"",
)
# Agent system is required and should never fail to be available
logger.error("Agent system failed but is required - this should not happen")
gr.Error("Agent system is required but not available")
return (
pd.DataFrame(columns=get_dataframe_columns()),
"Agent system error",
"",
)
# Legacy helper functions removed - all processing now handled by agent system
# --- Formatting and Utility Functions --- (Moved and adapted)
def format_cards_for_dataframe(
cards: list[Card], topic_name: str, topic_index: int = 0, start_index: int = 1
) -> list:
"""Formats a list of Card objects into a list of dictionaries for DataFrame display.
Ensures all data is plain text.
"""
formatted_cards = []
for i, card_obj in enumerate(cards):
actual_index = start_index + i
card_type = card_obj.card_type or "basic"
question = card_obj.front.question or ""
answer = card_obj.back.answer or ""
explanation = card_obj.back.explanation or ""
example = card_obj.back.example or ""
# Metadata processing
metadata = card_obj.metadata or {}
prerequisites = metadata.get("prerequisites", [])
learning_outcomes = metadata.get("learning_outcomes", [])
difficulty = metadata.get("difficulty", "N/A")
# Ensure list-based metadata are joined as plain strings for DataFrame
prerequisites_str = strip_html_tags(
", ".join(prerequisites)
if isinstance(prerequisites, list)
else str(prerequisites)
)
learning_outcomes_str = strip_html_tags(
", ".join(learning_outcomes)
if isinstance(learning_outcomes, list)
else str(learning_outcomes)
)
difficulty_str = strip_html_tags(str(difficulty))
formatted_card = {
"Index": (
f"{topic_index}.{actual_index}"
if topic_index > 0
else str(actual_index)
),
"Topic": strip_html_tags(topic_name), # Ensure topic is also plain
"Card_Type": strip_html_tags(card_type),
"Question": question, # Already stripped during Card object creation
"Answer": answer, # Already stripped
"Explanation": explanation, # Already stripped
"Example": example, # Already stripped
"Prerequisites": prerequisites_str,
"Learning_Outcomes": learning_outcomes_str,
"Difficulty": difficulty_str, # Ensure difficulty is plain text
"Source_URL": strip_html_tags(
metadata.get("source_url", "")
), # Ensure Source_URL is plain
}
formatted_cards.append(formatted_card)
return formatted_cards
def get_dataframe_columns() -> list[str]:
"""Returns the standard list of columns for the Anki card DataFrame."""
return [
"Index",
"Topic",
"Card_Type",
"Question",
"Answer",
"Explanation",
"Example",
"Prerequisites",
"Learning_Outcomes",
"Difficulty",
"Source_URL",
]
# This function might be specific to the old crawler flow if AnkiCardData is only from there.
# If orchestrate_card_generation now also produces something convertible to AnkiCardData, it might be useful.
# For now, it's used by generate_cards_from_crawled_content.
def deduplicate_cards(cards: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Deduplicates a list of card dictionaries based on the 'Question' field."""
seen_questions = set()
unique_cards = []
for card_dict in cards:
question = card_dict.get("Question")
if question is None: # Should not happen if cards are well-formed
logger.warning(f"Card dictionary missing 'Question' key: {card_dict}")
unique_cards.append(card_dict) # Keep it if no question to dedupe on
continue
# Normalize whitespace and case for deduplication
normalized_question = " ".join(str(question).strip().lower().split())
if normalized_question not in seen_questions:
seen_questions.add(normalized_question)
unique_cards.append(card_dict)
else:
logger.info(f"Deduplicated card with question: {question}")
return unique_cards
# --- Modification for generate_cards_from_crawled_content ---
def generate_cards_from_crawled_content(
all_cards: List[Card],
) -> List[Dict[str, Any]]: # Changed AnkiCardData to Card
"""
Processes a list of Card objects (expected to have plain text fields after generate_cards_batch)
and formats them into a list of dictionaries suitable for the DataFrame.
"""
if not all_cards:
return []
data_for_dataframe = []
for i, card_obj in enumerate(all_cards):
# Extract data, assuming it's already plain text from Card object creation
topic = (
card_obj.metadata.get("topic", f"Crawled Content - Card {i+1}")
if card_obj.metadata
else f"Crawled Content - Card {i+1}"
)
# Ensure list-based metadata are joined as plain strings for DataFrame
prerequisites = (
card_obj.metadata.get("prerequisites", []) if card_obj.metadata else []
)
learning_outcomes = (
card_obj.metadata.get("learning_outcomes", []) if card_obj.metadata else []
)
prerequisites_str = strip_html_tags(
", ".join(prerequisites)
if isinstance(prerequisites, list)
else str(prerequisites)
)
learning_outcomes_str = strip_html_tags(
", ".join(learning_outcomes)
if isinstance(learning_outcomes, list)
else str(learning_outcomes)
)
difficulty_str = strip_html_tags(
str(
card_obj.metadata.get("difficulty", "N/A")
if card_obj.metadata
else "N/A"
)
)
card_dict = {
"Index": str(i + 1),
"Topic": strip_html_tags(topic),
"Card_Type": strip_html_tags(card_obj.card_type or "basic"),
"Question": card_obj.front.question or "", # Should be plain
"Answer": card_obj.back.answer or "", # Should be plain
"Explanation": card_obj.back.explanation or "", # Should be plain
"Example": card_obj.back.example or "", # Should be plain
"Prerequisites": prerequisites_str,
"Learning_Outcomes": learning_outcomes_str,
"Difficulty": difficulty_str,
"Source_URL": strip_html_tags(
card_obj.metadata.get("source_url", "") if card_obj.metadata else ""
),
}
data_for_dataframe.append(card_dict)
return data_for_dataframe
def generate_token_usage_html(token_usage=None):
"""Generate HTML for token usage display"""
if token_usage and isinstance(token_usage, dict):
total_tokens = token_usage.get("total_tokens", 0)
return f"<div style='margin-top: 8px;'><b>Token Usage:</b> {total_tokens} tokens</div>"
else:
return "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"