import re
import asyncio
from typing import List, Tuple
from urllib.parse import urlparse

import gradio as gr
import pandas as pd

from ankigen_core.crawler import WebCrawler
from ankigen_core.llm_interface import OpenAIClientManager, process_crawled_pages
from ankigen_core.card_generator import (
    generate_cards_from_crawled_content,
    AVAILABLE_MODELS,
)
from ankigen_core.utils import get_logger
from ankigen_core.models import Card

crawler_ui_logger = get_logger()

# Column schemas shared by the main card table and the learning-path subjects table.
MAIN_OUTPUT_DF_COLUMNS = [
    "Index",
    "Topic",
    "Card_Type",
    "Question",
    "Answer",
    "Explanation",
    "Example",
    "Prerequisites",
    "Learning_Outcomes",
    "Common_Misconceptions",
    "Difficulty",
]
SUBJECTS_LIST_DF_COLUMNS = ["Subject", "Prerequisites", "Time Estimate"]


def update_mode_visibility(
    mode: str,
    current_subject: str,
    current_description: str,
    current_text: str,
    current_url: str,
):
    """Updates visibility and values of UI elements based on the generation mode."""
    is_subject = mode == "subject"
    is_path = mode == "path"
    is_text = mode == "text"
    is_web = mode == "web"

    # Preserve the current value only for the active mode; clear the rest.
    subject_val = current_subject if is_subject else ""
    description_val = current_description if is_path else ""
    text_val = current_text if is_text else ""
    url_val = current_url if is_web else ""

    cards_output_visible = is_subject or is_text or is_web

    return (
        gr.update(visible=is_subject),
        gr.update(visible=is_path),
        gr.update(visible=is_text),
        gr.update(visible=is_web),
        gr.update(visible=is_path),
        gr.update(visible=cards_output_visible),
        gr.update(value=subject_val),
        gr.update(value=description_val),
        gr.update(value=text_val),
        gr.update(value=url_val),
        gr.update(value=pd.DataFrame(columns=MAIN_OUTPUT_DF_COLUMNS)),
        gr.update(value=pd.DataFrame(columns=SUBJECTS_LIST_DF_COLUMNS)),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(
            value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
            visible=False,
        ),
    )
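
# Illustrative wiring sketch (component names below are assumptions, not the
# app's actual variables): the outputs list must match the 15-tuple returned
# above, in order.
#
# mode.change(
#     fn=update_mode_visibility,
#     inputs=[mode, subject_input, description_input, text_input, url_input],
#     outputs=[
#         subject_group, path_group, text_group, web_group,
#         path_results_group, cards_output_group,
#         subject_input, description_input, text_input, url_input,
#         main_output_df, subjects_list_df,
#         learning_order_md, projects_md, total_cards_html,
#     ],
# )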


def use_selected_subjects(subjects_df: pd.DataFrame | None):
    """Updates the UI to generate cards for the subjects from a learning path analysis."""
    # An 18-tuple of no-op/reset updates returned when we cannot proceed.
    noop_updates = (
        *(gr.update() for _ in range(13)),
        gr.update(value=pd.DataFrame(columns=MAIN_OUTPUT_DF_COLUMNS)),
        gr.update(value=pd.DataFrame(columns=SUBJECTS_LIST_DF_COLUMNS)),
        gr.update(),
        gr.update(),
        gr.update(visible=False),
    )

    if subjects_df is None or subjects_df.empty:
        gr.Warning("No subjects available to copy from Learning Path analysis.")
        return noop_updates

    try:
        subjects = subjects_df["Subject"].tolist()
        combined_subject = ", ".join(subjects)
        suggested_topics = min(len(subjects) + 1, 20)
    except KeyError:
        # gr.Error is usually raised; it is constructed (not raised) here so
        # the handler can still return no-op updates instead of aborting.
        gr.Error("Learning path analysis result is missing the 'Subject' column.")
        return noop_updates

    return (
        gr.update(value="subject"),
        gr.update(visible=True),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=True),
        gr.update(value=combined_subject),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(value=suggested_topics),
        gr.update(
            value="Focus on connections between these subjects and their practical applications."
        ),
        gr.update(value=pd.DataFrame(columns=MAIN_OUTPUT_DF_COLUMNS)),
        gr.update(value=subjects_df),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(visible=False),
    )


def create_crawler_main_mode_elements() -> (
    Tuple[
        List[gr.components.Component],
        gr.Button,
        gr.Progress,
        gr.Textbox,
        gr.Textbox,
        gr.Textbox,
        gr.Checkbox,
        gr.Textbox,
    ]
):
    """Creates the UI components for the Web Crawler mode integrated into the main tab."""
    ui_components: List[gr.components.Component] = []

    url_input = gr.Textbox(
        label="Start URL",
        placeholder="Enter the full URL to start crawling (e.g., https://example.com/docs)",
        elem_id="crawler_url_input",
    )
    ui_components.append(url_input)

    with gr.Row():
        max_depth_slider = gr.Slider(
            minimum=0,
            maximum=5,
            value=1,
            step=1,
            label="Max Crawl Depth",
            elem_id="crawler_max_depth_slider",
        )
        ui_components.append(max_depth_slider)

        crawler_req_per_sec_slider = gr.Slider(
            minimum=0.1,
            maximum=10,
            value=2,
            step=0.1,
            label="Requests per Second (Crawler)",
            elem_id="crawler_req_per_sec_slider",
        )
        ui_components.append(crawler_req_per_sec_slider)

    model_choices_ui_crawler = [(m["label"], m["value"]) for m in AVAILABLE_MODELS]
    default_model_value_crawler = next(
        (m["value"] for m in AVAILABLE_MODELS if "nano" in m["value"].lower()),
        AVAILABLE_MODELS[0]["value"] if AVAILABLE_MODELS else "",
    )
    model_dropdown = gr.Dropdown(
        choices=model_choices_ui_crawler,
        label="AI Model for Content Processing",
        value=default_model_value_crawler,
        elem_id="crawler_model_dropdown",
    )
    ui_components.append(model_dropdown)

    with gr.Row():
        include_patterns_textbox = gr.Textbox(
            label="Include URL Patterns (one per line, regex compatible)",
            placeholder="e.g., /blog/.*\nexample.com/articles/.*",
            lines=3,
            elem_id="crawler_include_patterns",
            scale=1,
        )
        ui_components.append(include_patterns_textbox)

        exclude_patterns_textbox = gr.Textbox(
            label="Exclude URL Patterns (one per line, regex compatible)",
            placeholder="e.g., /category/.*\n.*/login",
            lines=3,
            elem_id="crawler_exclude_patterns",
            scale=1,
        )
        ui_components.append(exclude_patterns_textbox)

    with gr.Accordion(
        "Sitemap Options", open=False, elem_id="crawler_sitemap_options_accordion"
    ):
        use_sitemap_checkbox = gr.Checkbox(
            label="Use Sitemap?",
            value=False,
            elem_id="crawler_use_sitemap_checkbox",
        )
        sitemap_url_textbox = gr.Textbox(
            label="Sitemap URL (e.g., /sitemap.xml or full URL)",
            placeholder="Enter sitemap URL relative to start URL or full path",
            visible=False,
            elem_id="crawler_sitemap_url_textbox",
        )
        # Reveal the sitemap URL field only when the checkbox is ticked.
        use_sitemap_checkbox.change(
            fn=lambda x: gr.update(visible=x),
            inputs=[use_sitemap_checkbox],
            outputs=[sitemap_url_textbox],
        )

    with gr.Accordion(
        "Advanced Prompt Options",
        open=False,
        elem_id="crawler_advanced_options_accordion",
    ):
        custom_system_prompt = gr.Textbox(
            label="Custom System Prompt (Optional)",
            placeholder="Leave empty to use the default system prompt for card generation.",
            lines=5,
            info="Define the overall role and instructions for the AI.",
            elem_id="crawler_custom_system_prompt",
        )
        custom_user_prompt_template = gr.Textbox(
            label="Custom User Prompt Template (Optional)",
            placeholder="Leave empty to use default. Available placeholders: {url}, {content}",
            lines=5,
            info="Define how the page URL and content are presented to the AI.",
            elem_id="crawler_custom_user_prompt_template",
        )

    crawl_button = gr.Button(
        "Crawl Content & Prepare Cards",
        variant="secondary",
        elem_id="crawler_crawl_content_button",
    )

    # gr.Progress is not a rendered component; it is passed to event handlers
    # for progress reporting, so it is returned separately rather than
    # appended to ui_components.
    progress_bar = gr.Progress()
    progress_status_textbox = gr.Textbox(
        label="Crawl Status",
        interactive=False,
        lines=3,
        placeholder="Crawling process status will appear here...",
        elem_id="crawler_status_textbox",
    )

    return (
        ui_components,
        crawl_button,
        progress_bar,
        progress_status_textbox,
        custom_system_prompt,
        custom_user_prompt_template,
        use_sitemap_checkbox,
        sitemap_url_textbox,
    )
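
# Illustrative usage sketch (the surrounding Blocks layout is an assumption;
# only create_crawler_main_mode_elements itself is defined in this module):
#
# with gr.Blocks() as demo:
#     (
#         crawler_inputs, crawl_btn, progress, status_tb,
#         sys_prompt_tb, user_prompt_tb, sitemap_cb, sitemap_url_tb,
#     ) = create_crawler_main_mode_elements()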


CRAWLER_AVAILABLE_MODELS_VALUES = [m["value"] for m in AVAILABLE_MODELS]


def _basic_sanitize_filename(name: str) -> str:
    """Basic filename sanitization: replaces characters outside [a-zA-Z0-9_.-] with underscores."""
    return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)
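
# Example (illustrative): _basic_sanitize_filename("my deck: v2!") -> "my_deck__v2_"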


async def crawl_and_generate(
    url: str,
    max_depth: int,
    crawler_requests_per_second: float,
    include_patterns: str,
    exclude_patterns: str,
    model: str,
    export_format_ui: str,
    custom_system_prompt: str,
    custom_user_prompt_template: str,
    use_sitemap: bool,
    sitemap_url_str: str,
    client_manager: OpenAIClientManager,
    progress: gr.Progress,
    status_textbox: gr.Textbox,
) -> Tuple[str, List[dict], List[Card]]:
    """Crawls a website, generates Anki cards, and prepares them for export/display."""
    crawler_ui_logger.info(f"Crawl and generate called for URL: {url}")
    if not url or not url.startswith(("http://", "https://")):
        gr.Warning("Invalid URL provided. Please enter a valid http/https URL.")
        return "Invalid URL", [], []

    try:
        urlparse(url)  # Basic syntactic validation of the URL.

        # The UI asks for one pattern per line, so split on newlines; commas
        # are accepted as well for convenience.
        include_list = [
            p.strip() for p in re.split(r"[\n,]", include_patterns) if p.strip()
        ]
        exclude_list = [
            p.strip() for p in re.split(r"[\n,]", exclude_patterns) if p.strip()
        ]

        # NOTE: crawler_requests_per_second is collected by the UI but not yet
        # forwarded here; wire it into WebCrawler once the crawler exposes a
        # rate-limit option.
        crawler = WebCrawler(
            start_url=url,
            max_depth=max_depth,
            include_patterns=include_list,
            exclude_patterns=exclude_list,
            use_sitemap=use_sitemap,
            sitemap_url=sitemap_url_str
            if use_sitemap and sitemap_url_str and sitemap_url_str.strip()
            else None,
        )

        total_urls_for_progress = 0

        def crawler_progress_callback(
            processed_count: int, total_urls: int, current_url_processing: str
        ):
            nonlocal total_urls_for_progress
            total_urls_for_progress = total_urls
            if total_urls_for_progress > 0:
                # Crawling occupies the 0.1-0.5 band of the overall progress bar.
                progress(
                    0.1 + (processed_count / total_urls_for_progress) * 0.4,
                    desc=f"Crawling: {processed_count}/{total_urls_for_progress} URLs. Current: {current_url_processing}",
                )
            else:
                progress(
                    0.1 + processed_count * 0.01,
                    desc=f"Crawling: {processed_count} URLs discovered. Current: {current_url_processing}",
                )

        crawler_ui_logger.info(f"Starting crawl for {url}...")
        progress(0.15, desc=f"Starting crawl for {url}...")
        # The crawler is synchronous, so run it in a worker thread to keep the
        # event loop responsive.
        crawled_pages = await asyncio.to_thread(
            crawler.crawl, progress_callback=crawler_progress_callback
        )
        crawler_ui_logger.info(f"Crawling finished. Found {len(crawled_pages)} pages.")
        progress(0.5, desc=f"Crawling finished. Found {len(crawled_pages)} pages.")

        if not crawled_pages:
            progress(1.0, desc="No pages were crawled. Check URL and patterns.")
            return (
                "No pages were crawled. Check URL and patterns.",
                pd.DataFrame().to_dict(orient="records"),
                [],
            )

        openai_client = client_manager.get_client()
        processed_llm_pages = 0

        def llm_progress_callback(completed_count: int, total_count: int):
            nonlocal processed_llm_pages
            processed_llm_pages = completed_count
            # LLM processing occupies the 0.5-0.9 band.
            progress(
                0.5 + (completed_count / total_count) * 0.4,
                desc=f"Processing content: {completed_count}/{total_count} pages processed by LLM.",
            )

        crawler_ui_logger.info(
            f"Starting LLM processing for {len(crawled_pages)} pages..."
        )
        progress(
            0.55, desc=f"Processing {len(crawled_pages)} pages with LLM ({model})..."
        )
        all_cards = await process_crawled_pages(
            openai_client=openai_client,
            pages=crawled_pages,
            model=model,
            max_prompt_content_tokens=6000,
            max_concurrent_requests=5,
            custom_system_prompt=custom_system_prompt
            if custom_system_prompt and custom_system_prompt.strip()
            else None,
            custom_user_prompt_template=custom_user_prompt_template
            if custom_user_prompt_template and custom_user_prompt_template.strip()
            else None,
            progress_callback=llm_progress_callback,
        )
        crawler_ui_logger.info(
            f"LLM processing finished. Generated {len(all_cards)} Card objects."
        )
        progress(
            0.9,
            desc=f"LLM processing finished. Generated {len(all_cards)} Anki cards.",
        )

        if not all_cards:
            progress(
                1.0, desc="LLM processing complete, but no Anki cards were generated."
            )
            return (
                "LLM processing complete, but no Anki cards were generated.",
                pd.DataFrame().to_dict(orient="records"),
                [],
            )

        cards_for_dataframe_export = generate_cards_from_crawled_content(all_cards)
        if not cards_for_dataframe_export:
            progress(
                1.0, desc="Card processing (formatting, etc.) resulted in no cards."
            )
            return (
                "Card processing resulted in no cards.",
                pd.DataFrame().to_dict(orient="records"),
                [],
            )

    except ConnectionError as e:
        crawler_ui_logger.error(f"Connection error during crawl: {e}", exc_info=True)
        progress(1.0, desc=f"Connection error: {e}")
        return f"Connection error: {e}", pd.DataFrame().to_dict(orient="records"), []
    except ValueError as e:
        crawler_ui_logger.error(f"Value error: {e}", exc_info=True)
        progress(1.0, desc=f"Input error: {e}")
        return f"Input error: {e}", pd.DataFrame().to_dict(orient="records"), []
    except RuntimeError as e:
        crawler_ui_logger.error(
            f"Runtime error (e.g., OpenAI client not init): {e}", exc_info=True
        )
        progress(1.0, desc=f"Runtime error: {e}")
        return f"Runtime error: {e}", pd.DataFrame().to_dict(orient="records"), []
    except Exception as e:
        crawler_ui_logger.error(
            f"Unexpected error in crawl_and_generate: {e}", exc_info=True
        )
        progress(1.0, desc=f"Unexpected error: {e}")
        return (
            f"An unexpected error occurred: {e}",
            pd.DataFrame().to_dict(orient="records"),
            [],
        )

    final_message = (
        f"Content crawled and processed. {len(cards_for_dataframe_export)} potential "
        "cards prepared. Load them into the main table for review and export."
    )
    progress(1.0, desc=final_message)
    return (
        final_message,
        cards_for_dataframe_export,
        all_cards,
    )
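
# Illustrative wiring sketch (names are assumptions): crawl_and_generate takes
# non-component arguments (client_manager, progress, status_textbox), so it is
# typically wrapped, e.g. with functools.partial, before being bound to the
# crawl button.
#
# from functools import partial
# crawl_btn.click(
#     fn=partial(
#         crawl_and_generate,
#         client_manager=manager,
#         progress=gr.Progress(),
#         status_textbox=status_tb,
#     ),
#     inputs=[url_input, max_depth_slider, crawler_req_per_sec_slider, ...],
#     outputs=[status_tb, cards_table, cards_state],
# )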


def cards_to_dataframe(cards: List[Card]) -> pd.DataFrame:
    """Converts a list of Card objects to a Pandas DataFrame for UI display."""
    data_for_df = []
    for i, card in enumerate(cards):
        tags_list = card.metadata.get("tags", []) if card.metadata else []
        tags_str = ", ".join(tags_list) if tags_list else ""
        topic_str = card.metadata.get("topic", "N/A") if card.metadata else "N/A"

        data_for_df.append(
            {
                "ID": i + 1,  # 1-based ID; dataframe_to_cards relies on this.
                "Topic": topic_str,
                "Front": card.front.question,
                "Back": card.back.answer,
                "Tags": tags_str,
                "Card Type": card.card_type or "Basic",
                "Explanation": card.back.explanation or "",
                "Example": card.back.example or "",
                "Source_URL": card.metadata.get("source_url", "")
                if card.metadata
                else "",
            }
        )

    df_columns = [
        "ID",
        "Topic",
        "Front",
        "Back",
        "Tags",
        "Card Type",
        "Explanation",
        "Example",
        "Source_URL",
    ]
    return pd.DataFrame(data_for_df, columns=df_columns)
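
# Example (illustrative; assumes the Card model's nested front/back fields):
# df = cards_to_dataframe(all_cards)
# df.loc[0, "Front"]  # -> question text of the first card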


def dataframe_to_cards(df: pd.DataFrame, original_cards: List[Card]) -> List[Card]:
    """
    Updates a list of Card objects based on edits from a Pandas DataFrame.
    Assumes the DataFrame 'ID' column corresponds to the 1-based index of original_cards.
    """
    if df.empty:
        return []

    updated_cards: List[Card] = []
    for index, row in df.iterrows():
        # Sentinel so the except block is safe even if int(row["ID"]) fails.
        original_card_index = -1
        try:
            card_id = int(row["ID"])
            original_card_index = card_id - 1

            if 0 <= original_card_index < len(original_cards):
                card_to_update = original_cards[original_card_index]

                new_front = card_to_update.front.copy(
                    update={
                        "question": str(row.get("Front", card_to_update.front.question))
                    }
                )
                new_back = card_to_update.back.copy(
                    update={
                        "answer": str(row.get("Back", card_to_update.back.answer)),
                        "explanation": str(
                            row.get("Explanation", card_to_update.back.explanation)
                        ),
                        "example": str(row.get("Example", card_to_update.back.example)),
                    }
                )

                tags_str = str(
                    row.get(
                        "Tags",
                        ",".join(
                            card_to_update.metadata.get("tags", [])
                            if card_to_update.metadata
                            else []
                        ),
                    )
                )
                new_tags = [t.strip() for t in tags_str.split(",") if t.strip()]

                new_metadata = (
                    card_to_update.metadata.copy() if card_to_update.metadata else {}
                )
                new_metadata["tags"] = new_tags
                new_metadata["topic"] = str(
                    row.get("Topic", new_metadata.get("topic", "N/A"))
                )

                updated_card = card_to_update.copy(
                    update={
                        "front": new_front,
                        "back": new_back,
                        "card_type": str(
                            row.get("Card Type", card_to_update.card_type or "Basic")
                        ),
                        "metadata": new_metadata,
                    }
                )
                updated_cards.append(updated_card)
            else:
                crawler_ui_logger.warning(
                    f"Card ID {card_id} from DataFrame is out of bounds for original_cards list."
                )
        except (ValueError, KeyError, AttributeError) as e:
            crawler_ui_logger.error(
                f"Error processing row {index} from DataFrame: {row}. Error: {e}"
            )
            # Fall back to the unedited card when the row could still be mapped.
            if 0 <= original_card_index < len(original_cards):
                updated_cards.append(original_cards[original_card_index])
            continue
    return updated_cards
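
# Round-trip sketch (illustrative): UI edits flow DataFrame -> Cards.
#
# df = cards_to_dataframe(all_cards)        # render cards for editing
# df.loc[0, "Front"] = "Edited question"    # user edit in the gr.Dataframe
# edited_cards = dataframe_to_cards(df, all_cards)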