# Module for functions that build or manage UI sections/logic

import gradio as gr
import pandas as pd  # Needed for use_selected_subjects type hinting
from typing import (
    List,
    Tuple,
)
from urllib.parse import urlparse

# --- Imports moved from later in the file (Task 7, etc.) ---
import re  # For URL validation and filename sanitization
import asyncio

from ankigen_core.crawler import WebCrawler
from ankigen_core.llm_interface import (
    OpenAIClientManager,
    process_crawled_pages,
)
from ankigen_core.card_generator import (
    generate_cards_from_crawled_content,
    AVAILABLE_MODELS,
)
from ankigen_core.utils import get_logger

# Only import models that are actually used in this file
from ankigen_core.models import (
    Card,
    # ModelSettings,  # Removed
    # LearningPathInput,  # Removed
    # LearningPath,  # Removed
    # GeneratedPath,  # Removed
    # SubjectAnalysis,  # Removed
    # SubjectCardRequest,  # Removed
    # TextCardRequest,  # Removed
    # LearningPathRequest,  # Removed
)
# --- End moved imports ---

# Get an instance of the logger for this module
crawler_ui_logger = get_logger()

# Standard column sets used to reset the main output and subjects-list tables
MAIN_OUTPUT_DF_COLUMNS = [
    "Index",
    "Topic",
    "Card_Type",
    "Question",
    "Answer",
    "Explanation",
    "Example",
    "Prerequisites",
    "Learning_Outcomes",
    "Common_Misconceptions",
    "Difficulty",
]
SUBJECTS_LIST_DF_COLUMNS = ["Subject", "Prerequisites", "Time Estimate"]


# Keep this definition
def update_mode_visibility(
    mode: str,
    current_subject: str,
    current_description: str,
    current_text: str,
    current_url: str,
):
    """Updates visibility and values of UI elements based on the generation mode."""
    is_subject = mode == "subject"
    is_path = mode == "path"
    is_text = mode == "text"
    is_web = mode == "web"

    # Keep the value that belongs to the active mode; clear the others
    subject_val = current_subject if is_subject else ""
    description_val = current_description if is_path else ""
    text_val = current_text if is_text else ""
    url_val = current_url if is_web else ""

    cards_output_visible = is_subject or is_text or is_web

    return (
        gr.update(visible=is_subject),  # 1 subject_mode (Group)
        gr.update(visible=is_path),  # 2 path_mode (Group)
        gr.update(visible=is_text),  # 3 text_mode (Group)
        gr.update(visible=is_web),  # 4 web_mode (Group for crawler UI)
        gr.update(visible=is_path),  # 5 path_results (Group)
        gr.update(visible=cards_output_visible),  # 6 cards_output (Group for main table)
        gr.update(value=subject_val),  # 7 subject value
        gr.update(value=description_val),  # 8 description value
        gr.update(value=text_val),  # 9 source text value
        gr.update(value=url_val),  # 10 web crawl URL value
        gr.update(value=pd.DataFrame(columns=MAIN_OUTPUT_DF_COLUMNS)),  # 11 main output DataFrame
        gr.update(value=pd.DataFrame(columns=SUBJECTS_LIST_DF_COLUMNS)),  # 12 subjects list DataFrame
        gr.update(value=""),  # 13 learning order
        gr.update(value=""),  # 14 projects
        gr.update(
            value="<div>Total Cards Generated: 0</div>",
            visible=False,
        ),  # 15 total_cards_html
    )


def use_selected_subjects(subjects_df: pd.DataFrame | None):
    """Updates the UI to generate cards for the subjects from a learning path analysis."""
    if subjects_df is None or subjects_df.empty:
        gr.Warning("No subjects available to copy from Learning Path analysis.")
        # Return updates that change nothing for all 18 outputs
        return (
            gr.update(),  # 1 generation_mode
            gr.update(),  # 2 subject_mode
            gr.update(),  # 3 path_mode
            gr.update(),  # 4 text_mode
            gr.update(),  # 5 web_mode
            gr.update(),  # 6 path_results
            gr.update(),  # 7 cards_output
            gr.update(),  # 8 subject
            gr.update(),  # 9 description
            gr.update(),  # 10 source_text
            gr.update(),  # 11 web_crawl_url_input
            gr.update(),  # 12 topic_number
            gr.update(),  # 13 preference_prompt
            gr.update(value=pd.DataFrame(columns=MAIN_OUTPUT_DF_COLUMNS)),  # 14 output (DataFrame)
            gr.update(value=pd.DataFrame(columns=SUBJECTS_LIST_DF_COLUMNS)),  # 15 subjects_list (DataFrame)
            gr.update(),  # 16 learning_order
            gr.update(),  # 17 projects
            gr.update(visible=False),  # 18 total_cards_html
        )

    try:
        subjects = subjects_df["Subject"].tolist()
        combined_subject = ", ".join(subjects)
        # Ensure suggested_topics is an int; Gradio sliders expect an int/float value
        suggested_topics = int(min(len(subjects) + 1, 20))
    except KeyError:
        gr.Warning("Learning path analysis result is missing the 'Subject' column.")
        # Return no-change updates for all 18 outputs
        return (
            gr.update(),  # 1 generation_mode
            gr.update(),  # 2 subject_mode
            gr.update(),  # 3 path_mode
            gr.update(),  # 4 text_mode
            gr.update(),  # 5 web_mode
            gr.update(),  # 6 path_results
            gr.update(),  # 7 cards_output
            gr.update(),  # 8 subject
            gr.update(),  # 9 description
            gr.update(),  # 10 source_text
            gr.update(),  # 11 web_crawl_url_input
            gr.update(),  # 12 topic_number
            gr.update(),  # 13 preference_prompt
            gr.update(value=pd.DataFrame(columns=MAIN_OUTPUT_DF_COLUMNS)),  # 14 output (DataFrame)
            gr.update(value=pd.DataFrame(columns=SUBJECTS_LIST_DF_COLUMNS)),  # 15 subjects_list (DataFrame)
            gr.update(),  # 16 learning_order
            gr.update(),  # 17 projects
            gr.update(visible=False),  # 18 total_cards_html
        )

    # Corresponds to the outputs wired in app.py for use_subjects.click:
    # [generation_mode, subject_mode, path_mode, text_mode, web_mode, path_results, cards_output,
    #  subject, description, source_text, web_crawl_url_input, topic_number, preference_prompt,
    #  output, subjects_list, learning_order, projects, total_cards_html]
    return (
        gr.update(value="subject"),  # 1 generation_mode (Radio)
        gr.update(visible=True),  # 2 subject_mode (Group)
        gr.update(visible=False),  # 3 path_mode (Group)
        gr.update(visible=False),  # 4 text_mode (Group)
        gr.update(visible=False),  # 5 web_mode (Group)
        gr.update(visible=False),  # 6 path_results (Group)
        gr.update(visible=True),  # 7 cards_output (Group)
        gr.update(value=combined_subject),  # 8 subject (Textbox)
        gr.update(value=""),  # 9 description (Textbox)
        gr.update(value=""),  # 10 source_text (Textbox)
        gr.update(value=""),  # 11 web_crawl_url_input (Textbox)
        gr.update(value=suggested_topics),  # 12 topic_number (Slider)
        gr.update(
            value="Focus on connections between these subjects and their practical applications."
        ),  # 13 preference_prompt (Textbox)
        gr.update(value=pd.DataFrame(columns=MAIN_OUTPUT_DF_COLUMNS)),  # 14 output (DataFrame) - clear it
        gr.update(value=subjects_df),  # 15 subjects_list (DataFrame) - keep the value that triggered this
        gr.update(value=""),  # 16 learning_order (Markdown) - clear it
        gr.update(value=""),  # 17 projects (Markdown) - clear it
        gr.update(visible=False),  # 18 total_cards_html (HTML)
    )


def create_crawler_main_mode_elements() -> (
    Tuple[
        List[gr.components.Component],  # ui_components (url_input, max_depth, etc.)
        gr.Button,  # crawl_button
        gr.Progress,  # progress_bar
        gr.Textbox,  # progress_status_textbox
        gr.Textbox,  # custom_system_prompt
        gr.Textbox,  # custom_user_prompt_template
        gr.Checkbox,  # use_sitemap_checkbox
        gr.Textbox,  # sitemap_url_textbox
    ]
):
    """Creates the UI components for the Web Crawler mode integrated into the main tab."""
    ui_components: List[gr.components.Component] = []

    # URL input
    url_input = gr.Textbox(
        label="Start URL",
        placeholder="Enter the full URL to start crawling (e.g., https://example.com/docs)",
        elem_id="crawler_url_input",
    )
    ui_components.append(url_input)

    with gr.Row():
        max_depth_slider = gr.Slider(
            minimum=0,
            maximum=5,
            value=1,
            step=1,
            label="Max Crawl Depth",
            elem_id="crawler_max_depth_slider",
        )
        ui_components.append(max_depth_slider)

        crawler_req_per_sec_slider = gr.Slider(
            minimum=0.1,
            maximum=10,
            value=2,
            step=0.1,
            label="Requests per Second (Crawler)",
            elem_id="crawler_req_per_sec_slider",
        )
        ui_components.append(crawler_req_per_sec_slider)

    model_choices_ui_crawler = [(m["label"], m["value"]) for m in AVAILABLE_MODELS]
    default_model_value_crawler = next(
        (m["value"] for m in AVAILABLE_MODELS if "nano" in m["value"].lower()),
        AVAILABLE_MODELS[0]["value"] if AVAILABLE_MODELS else "",
    )
    model_dropdown = gr.Dropdown(
        choices=model_choices_ui_crawler,
        label="AI Model for Content Processing",
        value=default_model_value_crawler,
        elem_id="crawler_model_dropdown",
    )
    ui_components.append(model_dropdown)

    with gr.Row():
        include_patterns_textbox = gr.Textbox(
            label="Include URL Patterns (one per line, regex compatible)",
            placeholder="e.g., /blog/.*\nexample.com/articles/.*",
            lines=3,
            elem_id="crawler_include_patterns",
            scale=1,
        )
        ui_components.append(include_patterns_textbox)

        exclude_patterns_textbox = gr.Textbox(
            label="Exclude URL Patterns (one per line, regex compatible)",
            placeholder="e.g., /category/.*\n.*/login",
            lines=3,
            elem_id="crawler_exclude_patterns",
            scale=1,
        )
        ui_components.append(exclude_patterns_textbox)

    with gr.Accordion(
        "Sitemap Options", open=False, elem_id="crawler_sitemap_options_accordion"
    ):
        use_sitemap_checkbox = gr.Checkbox(
            label="Use Sitemap?",
            value=False,
            elem_id="crawler_use_sitemap_checkbox",
        )
        sitemap_url_textbox = gr.Textbox(
            label="Sitemap URL (e.g., /sitemap.xml or full URL)",
            placeholder="Enter sitemap URL relative to start URL or full path",
            visible=False,
            elem_id="crawler_sitemap_url_textbox",
        )
        # Sitemap components are returned separately rather than appended to ui_components
        use_sitemap_checkbox.change(
            fn=lambda x: gr.update(visible=x),
            inputs=[use_sitemap_checkbox],
            outputs=[sitemap_url_textbox],
        )

    with gr.Accordion(
        "Advanced Prompt Options",
        open=False,
        elem_id="crawler_advanced_options_accordion",
    ):
        custom_system_prompt = gr.Textbox(
            label="Custom System Prompt (Optional)",
            placeholder="Leave empty to use the default system prompt for card generation.",
            lines=5,
            info="Define the overall role and instructions for the AI.",
            elem_id="crawler_custom_system_prompt",
        )
        custom_user_prompt_template = gr.Textbox(
            label="Custom User Prompt Template (Optional)",
            placeholder="Leave empty to use default. Available placeholders: {url}, {content}",
            lines=5,
            info="Define how the page URL and content are presented to the AI.",
            elem_id="crawler_custom_user_prompt_template",
        )
        # Prompt components are returned separately rather than appended to ui_components

    # Crawl button (triggers crawl_and_generate; results populate the main DataFrame)
    crawl_button = gr.Button(
        "Crawl Content & Prepare Cards",
        variant="secondary",  # Differentiate from the main generate button
        elem_id="crawler_crawl_content_button",
    )

    # Progress bar and status textbox for the crawling process
    progress_bar = gr.Progress()  # elem_id omitted; gr.Progress may not support it directly
    progress_status_textbox = gr.Textbox(
        label="Crawl Status",
        interactive=False,
        lines=3,
        placeholder="Crawling process status will appear here...",
        elem_id="crawler_status_textbox",
    )

    # The export-format radio, preview table, and direct file-download widgets that used
    # to live here were removed; the crawler now feeds the main card table instead of
    # exporting directly.
    #
    # ui_components holds the input fields whose values feed crawl_and_generate.
    # Components with their own event handlers (crawl button, progress widgets,
    # prompt overrides, sitemap controls) are returned individually; see the wiring
    # sketch following crawl_and_generate below.
    return (
        ui_components,
        crawl_button,
        progress_bar,
        progress_status_textbox,
        custom_system_prompt,
        custom_user_prompt_template,
        use_sitemap_checkbox,
        sitemap_url_textbox,
    )


# --- Crawl and Generate Logic (Task 7) ---

# Model values from AVAILABLE_MODELS, kept for validating the crawler's model selection
CRAWLER_AVAILABLE_MODELS_VALUES = [m["value"] for m in AVAILABLE_MODELS]


def _basic_sanitize_filename(name: str) -> str:
    """Basic filename sanitization: replaces characters outside [a-zA-Z0-9_.-] with underscores."""
    return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)


async def crawl_and_generate(
    url: str,
    max_depth: int,
    crawler_requests_per_second: float,
    include_patterns: str,
    exclude_patterns: str,
    model: str,
    export_format_ui: str,
    custom_system_prompt: str,
    custom_user_prompt_template: str,
    use_sitemap: bool,
    sitemap_url_str: str,
    client_manager: OpenAIClientManager,
    progress: gr.Progress,
    status_textbox: gr.Textbox,
) -> Tuple[str, List[dict], List[Card]]:
    """Crawls a website, generates Anki cards, and prepares them for export/display.

    Note: `export_format_ui` and `status_textbox` are accepted to mirror the UI wiring
    but are not used directly here; progress feedback goes through `progress` and
    gr.Info/gr.Warning.
    """
    crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
    if not url or not url.startswith(("http://", "https://")):
        gr.Warning("Invalid URL provided. Please enter a valid http/https URL.")
        return "Invalid URL", [], []

    try:
        urlparse(url)  # May raise ValueError for malformed URLs (handled below);
        # domain-based filtering (allowed_domains) was removed from the WebCrawler call.

        # The UI labels these fields "one per line", so split on newlines as well as commas.
        include_list = [
            p.strip() for p in re.split(r"[,\n]", include_patterns) if p.strip()
        ]
        exclude_list = [
            p.strip() for p in re.split(r"[,\n]", exclude_patterns) if p.strip()
        ]

        # WebCrawler is constructed with only the parameters its current constructor
        # accepts; rate limiting, max_pages, and allowed_domains are left to its defaults.
        crawler = WebCrawler(
            start_url=url,
            max_depth=max_depth,
            include_patterns=include_list,
            exclude_patterns=exclude_list,
            use_sitemap=use_sitemap,
            sitemap_url=sitemap_url_str
            if use_sitemap and sitemap_url_str and sitemap_url_str.strip()
            else None,
        )

        total_urls_for_progress = 0

        def crawler_progress_callback(
            processed_count: int, total_urls: int, current_url_processing: str
        ):
            nonlocal total_urls_for_progress
            total_urls_for_progress = total_urls
            if total_urls_for_progress > 0:
                progress(
                    0.1 + (processed_count / total_urls_for_progress) * 0.4,
                    desc=f"Crawling: {processed_count}/{total_urls_for_progress} URLs. Current: {current_url_processing}",
                )
            else:
                progress(
                    0.1 + processed_count * 0.01,
                    desc=f"Crawling: {processed_count} URLs discovered. Current: {current_url_processing}",
                )

        crawler_ui_logger.info(f"Starting crawl for {url}...")
        progress(0.15, desc=f"Starting crawl for {url}...")
        crawled_pages = await asyncio.to_thread(
            crawler.crawl, progress_callback=crawler_progress_callback
        )
        crawler_ui_logger.info(f"Crawling finished. Found {len(crawled_pages)} pages.")
        progress(0.5, desc=f"Crawling finished. Found {len(crawled_pages)} pages.")

        if not crawled_pages:
            progress(1.0, desc="No pages were crawled. Check URL and patterns.")
            # Return structure: (status_message, df_data, raw_cards_data)
            return (
                "No pages were crawled. Check URL and patterns.",
                pd.DataFrame().to_dict(orient="records"),
                [],
            )

        openai_client = client_manager.get_client()
        processed_llm_pages = 0

        def llm_progress_callback(completed_count: int, total_count: int):
            nonlocal processed_llm_pages
            processed_llm_pages = completed_count
            progress(
                0.5 + (completed_count / total_count) * 0.4,
                desc=f"Processing content: {completed_count}/{total_count} pages processed by LLM.",
            )

        crawler_ui_logger.info(
            f"Starting LLM processing for {len(crawled_pages)} pages..."
        )
        progress(
            0.55, desc=f"Processing {len(crawled_pages)} pages with LLM ({model})..."
        )
        all_cards = await process_crawled_pages(  # Returns List[Card]
            openai_client=openai_client,
            pages=crawled_pages,
            model=model,
            max_prompt_content_tokens=6000,
            max_concurrent_requests=5,
            custom_system_prompt=custom_system_prompt
            if custom_system_prompt and custom_system_prompt.strip()
            else None,
            custom_user_prompt_template=custom_user_prompt_template
            if custom_user_prompt_template and custom_user_prompt_template.strip()
            else None,
            progress_callback=llm_progress_callback,
        )
        crawler_ui_logger.info(
            f"LLM processing finished. Generated {len(all_cards)} Card objects."
        )
        progress(
            0.9,
            desc=f"LLM processing finished. Generated {len(all_cards)} Anki cards.",
        )

        if not all_cards:
            progress(
                1.0, desc="LLM processing complete, but no Anki cards were generated."
            )
            return (
                "LLM processing complete, but no Anki cards were generated.",
                pd.DataFrame().to_dict(orient="records"),  # Empty DataFrame data
                [],  # Empty list of raw cards
            )

        cards_for_dataframe_export = generate_cards_from_crawled_content(
            all_cards
        )  # Expects List[Card]
        if not cards_for_dataframe_export:
            progress(
                1.0, desc="Card processing (formatting, etc.) resulted in no cards."
            )
            return (
                "Card processing resulted in no cards.",
                pd.DataFrame().to_dict(orient="records"),
                [],
            )

    except ConnectionError as e:
        crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
        progress(1.0, desc=f"Connection error: {e}")
        return f"Connection error: {e}", pd.DataFrame().to_dict(orient="records"), []
    except ValueError as e:
        crawler_ui_logger.error(f"Value error: {e}", exc_info=True)
        progress(1.0, desc=f"Input error: {e}")
        return f"Input error: {e}", pd.DataFrame().to_dict(orient="records"), []
    except RuntimeError as e:  # e.g., OpenAI client not initialized in client_manager
        crawler_ui_logger.error(
            f"Runtime error (e.g., OpenAI client not init): {e}", exc_info=True
        )
        progress(1.0, desc=f"Runtime error: {e}")
        return f"Runtime error: {e}", pd.DataFrame().to_dict(orient="records"), []
    except Exception as e:
        crawler_ui_logger.error(
            f"Unexpected error in crawl_and_generate: {e}", exc_info=True
        )
        progress(1.0, desc=f"Unexpected error: {e}")
        return (
            f"An unexpected error occurred: {e}",
            pd.DataFrame().to_dict(orient="records"),
            [],
        )

    final_message = (
        "Content crawled and processed. "
        f"{len(cards_for_dataframe_export) if cards_for_dataframe_export else 0} potential cards prepared. "
        "Load them into the main table for review and export."
    )
    progress(1.0, desc=final_message)
    return (
        final_message,
        cards_for_dataframe_export,
        all_cards,  # List[Card]
    )
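

# Illustrative sketch (an assumption, not the actual app.py wiring): shows how the pieces
# returned by create_crawler_main_mode_elements might be connected to crawl_and_generate.
# The wrapper function, the "csv" placeholder passed for the unused export_format_ui
# argument, and the choice to output only the status text are example decisions, not
# project API.
def _example_crawler_wiring_sketch(client_manager: OpenAIClientManager) -> gr.Blocks:
    with gr.Blocks() as demo:
        (
            ui_components,
            crawl_button,
            _progress_bar,
            progress_status_textbox,
            custom_system_prompt,
            custom_user_prompt_template,
            use_sitemap_checkbox,
            sitemap_url_textbox,
        ) = create_crawler_main_mode_elements()
        # ui_components follows the append order used inside create_crawler_main_mode_elements
        (
            url_input,
            max_depth_slider,
            req_per_sec_slider,
            model_dropdown,
            include_patterns_textbox,
            exclude_patterns_textbox,
        ) = ui_components

        async def _run_crawl(
            url,
            max_depth,
            req_per_sec,
            include_patterns,
            exclude_patterns,
            model,
            system_prompt,
            user_prompt_template,
            use_sitemap,
            sitemap_url,
            progress=gr.Progress(),  # injected by Gradio for async handlers too
        ):
            status, _df_rows, _cards = await crawl_and_generate(
                url,
                max_depth,
                req_per_sec,
                include_patterns,
                exclude_patterns,
                model,
                "csv",  # export_format_ui is currently unused by crawl_and_generate
                system_prompt,
                user_prompt_template,
                use_sitemap,
                sitemap_url,
                client_manager=client_manager,
                progress=progress,
                status_textbox=progress_status_textbox,
            )
            return status

        crawl_button.click(
            fn=_run_crawl,
            inputs=[
                url_input,
                max_depth_slider,
                req_per_sec_slider,
                include_patterns_textbox,
                exclude_patterns_textbox,
                model_dropdown,
                custom_system_prompt,
                custom_user_prompt_template,
                use_sitemap_checkbox,
                sitemap_url_textbox,
            ],
            outputs=[progress_status_textbox],
        )
    return demo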
) return ( "Card processing resulted in no cards.", pd.DataFrame().to_dict(orient="records"), [], ) except ConnectionError as e: crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True) progress(1.0, desc=f"Connection error: {e}") return f"Connection error: {e}", pd.DataFrame().to_dict(orient="records"), [] except ValueError as e: crawler_ui_logger.error(f"Value error: {e}", exc_info=True) progress(1.0, desc=f"Input error: {e}") return f"Input error: {e}", pd.DataFrame().to_dict(orient="records"), [] except RuntimeError as e: # Catch RuntimeError from client_manager.get_client() crawler_ui_logger.error( f"Runtime error (e.g., OpenAI client not init): {e}", exc_info=True ) progress(1.0, desc=f"Runtime error: {e}") return f"Runtime error: {e}", pd.DataFrame().to_dict(orient="records"), [] except Exception as e: crawler_ui_logger.error( f"Unexpected error in crawl_and_generate: {e}", exc_info=True ) progress(1.0, desc=f"Unexpected error: {e}") return ( f"An unexpected error occurred: {e}", pd.DataFrame().to_dict(orient="records"), [], ) final_message = f"Content crawled and processed. {len(cards_for_dataframe_export) if cards_for_dataframe_export else 0} potential cards prepared. Load them into the main table for review and export." progress(1.0, desc=final_message) return ( final_message, cards_for_dataframe_export, all_cards, ) # all_cards is List[Card] # --- Card Preview and Editing Utilities (Task 13.3) --- def cards_to_dataframe(cards: List[Card]) -> pd.DataFrame: """Converts a list of Card objects to a Pandas DataFrame for UI display.""" data_for_df = [] for i, card in enumerate(cards): # Extract tags from metadata if they exist tags_list = card.metadata.get("tags", []) if card.metadata else [] tags_str = ", ".join(tags_list) if tags_list else "" # Topic from metadata or a default topic_str = card.metadata.get("topic", "N/A") if card.metadata else "N/A" data_for_df.append( { "ID": i + 1, # 1-indexed ID for display "Topic": topic_str, # Added Topic "Front": card.front.question, "Back": card.back.answer, "Tags": tags_str, "Card Type": card.card_type or "Basic", # Mapped from note_type "Explanation": card.back.explanation or "", # Added Explanation "Example": card.back.example or "", # Added Example "Source_URL": card.metadata.get("source_url", "") if card.metadata else "", # Added Source URL } ) # Define all columns explicitly for consistent DataFrame structure df_columns = [ "ID", "Topic", "Front", "Back", "Tags", "Card Type", "Explanation", "Example", "Source_URL", ] df = pd.DataFrame(data_for_df, columns=df_columns) return df def dataframe_to_cards(df: pd.DataFrame, original_cards: List[Card]) -> List[Card]: """ Updates a list of Card objects based on edits from a Pandas DataFrame. Assumes the DataFrame 'ID' column corresponds to the 1-based index of original_cards. """ updated_cards: List[Card] = [] if df.empty and not original_cards: return [] if df.empty and original_cards: return [] # Or original_cards if no change is intended on empty df for index, row in df.iterrows(): try: card_id = int(row["ID"]) # DataFrame ID is 1-indexed original_card_index = card_id - 1 if 0 <= original_card_index < len(original_cards): card_to_update = original_cards[original_card_index] # Create new CardFront and CardBack objects for immutability if preferred, # or update existing ones since Pydantic models are mutable. 
                new_front = card_to_update.front.copy(
                    update={
                        "question": str(row.get("Front", card_to_update.front.question))
                    }
                )
                new_back = card_to_update.back.copy(
                    update={
                        "answer": str(row.get("Back", card_to_update.back.answer)),
                        "explanation": str(
                            row.get("Explanation", card_to_update.back.explanation)
                        ),
                        "example": str(row.get("Example", card_to_update.back.example)),
                    }
                )

                tags_str = str(
                    row.get(
                        "Tags",
                        ",".join(
                            card_to_update.metadata.get("tags", [])
                            if card_to_update.metadata
                            else []
                        ),
                    )
                )
                new_tags = [t.strip() for t in tags_str.split(",") if t.strip()]

                new_metadata = (
                    card_to_update.metadata.copy() if card_to_update.metadata else {}
                )
                new_metadata["tags"] = new_tags
                new_metadata["topic"] = str(
                    row.get("Topic", new_metadata.get("topic", "N/A"))
                )
                # Source URL is generally not editable from this simple table

                updated_card = card_to_update.copy(
                    update={
                        "front": new_front,
                        "back": new_back,
                        "card_type": str(
                            row.get("Card Type", card_to_update.card_type or "Basic")
                        ),
                        "metadata": new_metadata,
                    }
                )
                updated_cards.append(updated_card)
            else:
                crawler_ui_logger.warning(
                    f"Card ID {card_id} from DataFrame is out of bounds for the original_cards list."
                )
        except (ValueError, KeyError, AttributeError) as e:
            crawler_ui_logger.error(
                f"Error processing row {index} from DataFrame: {row}. Error: {e}"
            )
            # If the row's position in original_cards is known, keep the original card unchanged
            if 0 <= original_card_index < len(original_cards):
                updated_cards.append(original_cards[original_card_index])
            continue

    return updated_cards
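

# Illustrative sketch (an assumption, not the actual app.py layout) of the round trip these
# helpers are designed for: the List[Card] returned as the third element of crawl_and_generate
# is held in gr.State, rendered with cards_to_dataframe for editing, and folded back into Card
# objects with dataframe_to_cards when the user applies their edits.
def _example_preview_wiring_sketch() -> gr.Blocks:
    with gr.Blocks() as demo:
        cards_state = gr.State([])  # holds the List[Card] produced by crawl_and_generate
        preview_df = gr.Dataframe(label="Card Preview", interactive=True)
        apply_button = gr.Button("Apply Edits")

        def _apply_edits(df: pd.DataFrame, cards: List[Card]):
            updated = dataframe_to_cards(df, cards)
            # Re-render from the updated cards so the table shows the normalized values
            return cards_to_dataframe(updated), updated

        apply_button.click(
            fn=_apply_edits,
            inputs=[preview_df, cards_state],
            outputs=[preview_df, cards_state],
        )
    return demo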