brickfrog committed · Commit 93bd7fb · verified · 1 Parent(s): 3acb91e

Upload folder using huggingface_hub

.gitignore CHANGED
@@ -197,4 +197,5 @@ tasks/
 scripts/
 
 .taskmasterconfig
-.cursor
+.cursor
+.serena/
.pre-commit-config.yaml CHANGED
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.7 # Use a recent ruff version
+    rev: v0.13.1 # Updated to match pyproject.toml version
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
ankigen_core/agents/generators.py CHANGED
@@ -108,9 +108,10 @@ class SubjectExpertAgent(BaseAgentWrapper):
                 f"Generating batch {batch_num}: {cards_in_this_batch} cards"
             )
 
-            # Reset agent for each batch to avoid conversation history accumulation
-            self.agent = None
-            await self.initialize()
+            # Initialize agent only once - Runner.run() creates fresh context each time
+            # No conversation history accumulation across batches (significant performance gain)
+            if not self.agent:
+                await self.initialize()
 
             user_input = (
                 f"Generate {cards_in_this_batch} flashcards for the topic: {topic}"
@@ -158,13 +159,13 @@ class SubjectExpertAgent(BaseAgentWrapper):
             batch_num += 1
 
             logger.info(
-                f"Batch {batch_num-1} generated {len(batch_cards)} cards. {cards_remaining} cards remaining."
+                f"Batch {batch_num - 1} generated {len(batch_cards)} cards. {cards_remaining} cards remaining."
             )
 
             # Safety check to prevent infinite loops
             if len(batch_cards) == 0:
                 logger.warning(
-                    f"No cards generated in batch {batch_num-1}, stopping generation"
+                    f"No cards generated in batch {batch_num - 1}, stopping generation"
                 )
                 break
 
@@ -175,7 +176,7 @@ class SubjectExpertAgent(BaseAgentWrapper):
         )
 
         logger.info(
-            f"✅ Generated {len(all_cards)} cards total across {batch_num-1} batches for topic '{topic}'"
+            f"✅ Generated {len(all_cards)} cards total across {batch_num - 1} batches for topic '{topic}'"
         )
         return all_cards
 
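
The batching change above swaps a per-batch agent reset for a lazy-initialization guard, so the expensive agent setup runs once per topic instead of once per batch. A minimal sketch of that guard pattern, using an illustrative AgentWrapper class rather than the project's SubjectExpertAgent:

    import asyncio


    class AgentWrapper:
        """Illustrative stand-in for a wrapper whose agent is costly to build."""

        def __init__(self) -> None:
            self.agent = None

        async def initialize(self) -> None:
            await asyncio.sleep(0.1)  # placeholder for expensive setup
            self.agent = object()

        async def generate_batches(self, num_batches: int) -> None:
            for batch_num in range(1, num_batches + 1):
                # Lazy init: build the agent once and reuse it; each run starts
                # from a fresh context, so no history accumulates across batches.
                if not self.agent:
                    await self.initialize()
                print(f"Generating batch {batch_num} with agent id {id(self.agent)}")


    asyncio.run(AgentWrapper().generate_batches(3))
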
ankigen_core/card_generator.py CHANGED
@@ -312,9 +312,9 @@ def generate_cards_from_crawled_content(
     for i, card_obj in enumerate(all_cards):
         # Extract data, assuming it's already plain text from Card object creation
         topic = (
-            card_obj.metadata.get("topic", f"Crawled Content - Card {i+1}")
+            card_obj.metadata.get("topic", f"Crawled Content - Card {i + 1}")
             if card_obj.metadata
-            else f"Crawled Content - Card {i+1}"
+            else f"Crawled Content - Card {i + 1}"
         )
 
         # Ensure list-based metadata are joined as plain strings for DataFrame
ankigen_core/context7.py CHANGED
@@ -4,19 +4,37 @@ import asyncio
 import subprocess
 import json
 from typing import Optional, Dict, Any
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+)
 from ankigen_core.logging import logger
+from ankigen_core.exceptions import (
+    ValidationError,
+)
+
+MAX_STRING_LENGTH = 200  # Prevent excessively long inputs
+SUBPROCESS_TIMEOUT = 60.0  # 60 second timeout for Context7 calls
 
 
 class Context7Client:
     """Context7 MCP client for fetching library documentation"""
 
     def __init__(self):
-        self.server_process = None
-
+        pass  # No state needed - each call creates fresh subprocess
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=10),
+        retry=retry_if_exception_type((TimeoutError, ConnectionError)),
+        reraise=True,
+    )
     async def call_context7_tool(
         self, tool_name: str, args: Dict[str, Any]
     ) -> Optional[Dict[str, Any]]:
-        """Call a Context7 tool via direct JSONRPC"""
+        """Call a Context7 tool via direct JSONRPC with retry logic"""
        try:
            # Build the JSONRPC request
            request = {
@@ -47,9 +65,35 @@
                },
            }
 
-            # Send both requests
-            input_data = json.dumps(init_request) + "\n" + json.dumps(request) + "\n"
-            stdout, stderr = await process.communicate(input=input_data.encode())
+            # Send both requests with timeout protection
+            # Optimize: Use list join for string concatenation
+            input_data = "\n".join([json.dumps(init_request), json.dumps(request), ""])
+            try:
+                stdout, stderr = await asyncio.wait_for(
+                    process.communicate(input=input_data.encode()),
+                    timeout=SUBPROCESS_TIMEOUT,
+                )
+            except asyncio.TimeoutError:
+                # Proper process cleanup on timeout
+                try:
+                    if process.returncode is None:  # Process still running
+                        process.kill()
+                        # Wait for process to actually terminate
+                        await asyncio.wait_for(process.wait(), timeout=5.0)
+                except Exception as cleanup_error:
+                    logger.error(f"Error during process cleanup: {cleanup_error}")
+                raise TimeoutError(
+                    f"Context7 subprocess timed out after {SUBPROCESS_TIMEOUT}s"
+                )
+            except Exception:
+                # Clean up process on any other error
+                try:
+                    if process.returncode is None:
+                        process.kill()
+                        await asyncio.wait_for(process.wait(), timeout=5.0)
+                except Exception:
+                    pass  # Best effort cleanup
+                raise
 
            # Parse responses
            responses = stdout.decode().strip().split("\n")
@@ -204,6 +248,15 @@
         self, library_id: str, topic: Optional[str] = None, tokens: int = 5000
     ) -> Optional[str]:
         """Get documentation for a library"""
+        # Security: Validate library_id (should start with /)
+        if (
+            not library_id
+            or not library_id.startswith("/")
+            or len(library_id) > MAX_STRING_LENGTH
+        ):
+            logger.error(f"Invalid library ID format (security): '{library_id}'")
+            raise ValidationError("Invalid library ID format")
+
         logger.info(
             f"Fetching docs for: {library_id}" + (f" (topic: {topic})" if topic else "")
         )
@@ -233,7 +286,7 @@
         return await self.get_library_docs(library_id, topic, tokens)
 
 
-async def test_context7():
+async def test_context7() -> None:
     """Test the Context7 integration"""
     client = Context7Client()
 
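
The call_context7_tool changes combine two layers: tenacity retries (up to three attempts with exponential backoff on TimeoutError and ConnectionError) around an asyncio.wait_for timeout that turns a hung subprocess into a TimeoutError the retry predicate can match. A self-contained sketch of that composition; flaky_call and the 5-second timeout are illustrative, and only the decorator arguments mirror the diff:

    import asyncio

    from tenacity import (
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    CALL_TIMEOUT = 5.0  # illustrative; the diff uses SUBPROCESS_TIMEOUT = 60.0


    async def flaky_call(delay: float) -> str:
        await asyncio.sleep(delay)
        return "ok"


    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type((TimeoutError, ConnectionError)),
        reraise=True,
    )
    async def call_with_timeout(delay: float) -> str:
        try:
            # Bound the awaited call; on expiry, re-raise as the built-in
            # TimeoutError so the retry predicate above matches it.
            return await asyncio.wait_for(flaky_call(delay), timeout=CALL_TIMEOUT)
        except asyncio.TimeoutError:
            raise TimeoutError(f"call timed out after {CALL_TIMEOUT}s")


    print(asyncio.run(call_with_timeout(0.1)))
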
ankigen_core/crawler.py CHANGED
@@ -1,13 +1,61 @@
 import requests
+from requests.adapters import HTTPAdapter
 from bs4 import BeautifulSoup, Tag
 from urllib.parse import urljoin, urlparse
 import re
+import ipaddress
+import socket
 from typing import List, Set, Optional, Callable, Tuple
 import xml.etree.ElementTree as ET  # Added for Sitemap parsing
 
 from ankigen_core.models import CrawledPage
 from ankigen_core.utils import RateLimiter, get_logger
 from ankigen_core.logging import logger  # Added
+from ankigen_core.exceptions import (
+    SecurityError,
+)
+
+# Security: Maximum URL length to prevent abuse
+MAX_URL_LENGTH = 2048
+
+
+class SSRFProtectionAdapter(HTTPAdapter):
+    """
+    Custom HTTP adapter that prevents SSRF attacks by validating
+    IP addresses at connection time (prevents DNS rebinding attacks).
+    """
+
+    def send(self, request, **kwargs) -> requests.Response:
+        """Override send to validate IP before making request."""
+        # Parse the URL to get hostname
+        parsed = urlparse(request.url)
+        hostname = parsed.hostname
+
+        if hostname:
+            try:
+                # Resolve hostname to IP at request time (prevents DNS rebinding)
+                ip_str = socket.gethostbyname(hostname)
+                ip = ipaddress.ip_address(ip_str)
+
+                # Block private, loopback, link-local, and reserved addresses
+                if (
+                    ip.is_private
+                    or ip.is_loopback
+                    or ip.is_link_local
+                    or ip.is_reserved
+                ):
+                    msg = f"SSRF protection: Blocked request to private IP {ip_str} for hostname {hostname}"
+                    logger.error(msg)
+                    raise SecurityError(msg)
+            except (socket.gaierror, ValueError) as e:
+                logger.error(
+                    f"SSRF protection: DNS resolution failed for {hostname}: {e}"
+                )
+                raise requests.exceptions.ConnectionError(
+                    f"DNS resolution failed for {hostname}"
+                )
+
+        return super().send(request, **kwargs)
 
 
 class WebCrawler:
@@ -41,20 +89,83 @@ class WebCrawler:
         self.logger = get_logger()
         self.session = requests.Session()
         self.session.headers.update({"User-Agent": self.user_agent})
+
+        # Security: Add SSRF protection adapter to prevent DNS rebinding attacks
+        # Performance: Configure connection pooling (10 connections per host, 20 total)
+        ssrf_adapter = SSRFProtectionAdapter(pool_connections=10, pool_maxsize=20)
+        self.session.mount("http://", ssrf_adapter)
+        self.session.mount("https://", ssrf_adapter)
+
         self.rate_limiter = RateLimiter(self.requests_per_second)
 
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - cleanup resources."""
+        self.close()
+        return False
+
+    def close(self) -> None:
+        """Close the requests session and cleanup resources."""
+        if hasattr(self, "session") and self.session:
+            self.session.close()
+            self.logger.debug("WebCrawler session closed")
+
     def _is_valid_url(self, url: str) -> bool:
         """
-        Checks if the URL is valid for crawling (same domain, scheme, matches patterns).
+        Checks if the URL is valid for crawling with SSRF protection.
+        Validates scheme, domain, patterns, and blocks private IP ranges.
         """
         try:
+            # Security: URL length check
+            if len(url) > MAX_URL_LENGTH:
+                logger.warning(
+                    f"URL exceeds maximum length ({MAX_URL_LENGTH}): {url[:100]}..."
+                )
+                return False
+
             parsed_url = urlparse(url)
+
+            # Security: Protocol whitelist (http/https only)
             if not parsed_url.scheme or parsed_url.scheme.lower() not in [
                 "http",
                 "https",
             ]:
                 logger.debug(f"Invalid scheme for URL: {url}")
                 return False
+
+            # Security: SSRF protection - block private IP ranges
+            hostname = parsed_url.hostname
+            if not hostname:
+                logger.warning(f"URL missing hostname: {url}")
+                return False
+
+            # Resolve hostname to IP and check if it's private
+            try:
+                # Get IP address for hostname
+                ip_str = socket.gethostbyname(hostname)
+                ip = ipaddress.ip_address(ip_str)
+
+                # Block private, loopback, link-local, and reserved addresses
+                if (
+                    ip.is_private
+                    or ip.is_loopback
+                    or ip.is_link_local
+                    or ip.is_reserved
+                ):
+                    logger.error(
+                        f"SSRF protection: Blocked private/internal IP {ip_str} for hostname {hostname}"
+                    )
+                    return False
+
+            except (socket.gaierror, ValueError, OSError) as e:
+                # DNS resolution failed or invalid IP
+                logger.warning(f"Could not resolve hostname {hostname}: {e}")
+                return False
+
+            # Domain check
             if parsed_url.netloc != self.base_domain:
                 logger.debug(f"URL {url} not in base domain {self.base_domain}")
                 return False
@@ -76,6 +187,10 @@
         except ValueError:  # Handle potential errors from urlparse on malformed URLs
             logger.warning(f"ValueError when parsing URL: {url}", exc_info=True)
             return False
+        except Exception as e:
+            logger.error(f"Unexpected error validating URL {url}: {e}", exc_info=True)
+            return False
+
         return True
 
     def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
@@ -194,40 +309,122 @@
 
     # --- End Sitemap Processing Methods ---
 
-    def crawl(
-        self, progress_callback: Optional[Callable[[int, int, str], None]] = None
-    ) -> List[CrawledPage]:
+    def _initialize_crawl_queue(self) -> List[Tuple[str, int, Optional[str]]]:
+        """Initialize the crawl queue from sitemap or start URL.
+
+        Returns:
+            List of tuples (url, depth, parent_url) to visit
+        """
         urls_to_visit: List[Tuple[str, int, Optional[str]]] = []
-        crawled_pages: List[CrawledPage] = []
-        initial_total_for_progress = 0
 
         if self.use_sitemap and self.sitemap_url:
             self.logger.info(f"Attempting to use sitemap: {self.sitemap_url}")
             sitemap_extracted_urls = self._get_urls_from_sitemap()
             if sitemap_extracted_urls:
                 for url in sitemap_extracted_urls:
-                    if self._is_valid_url(
-                        url
-                    ):  # Checks domain, include/exclude patterns
-                        urls_to_visit.append(
-                            (url, 0, None)
-                        )  # Add with depth 0 and None parent
+                    if self._is_valid_url(url):
+                        urls_to_visit.append((url, 0, None))
                 self.logger.info(
                     f"Initialized {len(urls_to_visit)} URLs to visit from sitemap after validation."
                 )
-                initial_total_for_progress = len(urls_to_visit)
             else:
                 self.logger.warning(
-                    "Sitemap processing yielded no URLs, or sitemap_url not set. Falling back to start_url if provided."
+                    "Sitemap processing yielded no URLs. Falling back to start_url."
                 )
-                # Fallback to start_url if sitemap is empty or fails
                 if self._is_valid_url(self.start_url):
-                    urls_to_visit.append((self.start_url, 0, None))  # None parent
-                    initial_total_for_progress = len(urls_to_visit)
+                    urls_to_visit.append((self.start_url, 0, None))
         else:
             if self._is_valid_url(self.start_url):
-                urls_to_visit.append((self.start_url, 0, None))  # None parent
-                initial_total_for_progress = len(urls_to_visit)
+                urls_to_visit.append((self.start_url, 0, None))
+
+        return urls_to_visit
+
+    def _extract_page_metadata(
+        self, soup: BeautifulSoup, url: str
+    ) -> Tuple[Optional[str], Optional[str], List[str]]:
+        """Extract title, meta description, and meta keywords from page.
+
+        Args:
+            soup: BeautifulSoup object of the page
+            url: URL being processed (for logging)
+
+        Returns:
+            Tuple of (title, meta_description, meta_keywords_list)
+        """
+        # Extract title
+        page_title_tag = soup.find("title")
+        page_title: Optional[str] = None
+        if isinstance(page_title_tag, Tag) and page_title_tag.string:
+            page_title = page_title_tag.string.strip()
+        else:
+            self.logger.debug(f"No title tag found for {url}")
+
+        # Extract meta description
+        meta_desc_tag = soup.find("meta", attrs={"name": "description"})
+        meta_description: Optional[str] = None
+        if isinstance(meta_desc_tag, Tag):
+            content = meta_desc_tag.get("content")
+            if isinstance(content, str):
+                meta_description = content.strip()
+            elif isinstance(content, list):
+                meta_description = " ".join(str(item) for item in content).strip()
+                self.logger.debug(
+                    f"Meta description for {url} was a list, joined: {meta_description}"
+                )
+        else:
+            self.logger.debug(f"No meta description found for {url}")
+
+        # Extract meta keywords
+        meta_keywords_tag = soup.find("meta", attrs={"name": "keywords"})
+        meta_keywords: List[str] = []
+        if isinstance(meta_keywords_tag, Tag):
+            content_kw = meta_keywords_tag.get("content")
+            raw_keywords_content: str = ""
+            if isinstance(content_kw, str):
+                raw_keywords_content = content_kw
+            elif isinstance(content_kw, list):
+                raw_keywords_content = " ".join(str(item) for item in content_kw)
+                self.logger.debug(
+                    f"Meta keywords for {url} was a list, joined: {raw_keywords_content}"
+                )
+
+            if raw_keywords_content:
+                meta_keywords = [
+                    k.strip() for k in raw_keywords_content.split(",") if k.strip()
+                ]
+        else:
+            self.logger.debug(f"No meta keywords found for {url}")
+
+        return page_title, meta_description, meta_keywords
+
+    def _should_skip_url(self, url: str, depth: int) -> Tuple[bool, Optional[str]]:
+        """Check if URL should be skipped.
+
+        Args:
+            url: URL to check
+            depth: Current depth of URL
+
+        Returns:
+            Tuple of (should_skip, skip_reason)
+        """
+        if url in self.visited_urls:
+            return True, f"Skipped (visited): {url}"
+
+        if depth > self.max_depth:
+            logger.debug(
+                f"Skipping URL {url} due to depth {depth} > max_depth {self.max_depth}"
+            )
+            return True, f"Skipped (max depth): {url}"
+
+        return False, None
+
+    def crawl(
+        self, progress_callback: Optional[Callable[[int, int, str], None]] = None
+    ) -> List[CrawledPage]:
+        # Initialize URLs using helper method
+        urls_to_visit = self._initialize_crawl_queue()
+        crawled_pages: List[CrawledPage] = []
+        initial_total_for_progress = len(urls_to_visit)
 
         processed_count = 0
         while urls_to_visit:
@@ -246,28 +443,16 @@
                 current_url,
             )
 
-            if current_url in self.visited_urls:
-                self.logger.debug(f"URL already visited: {current_url}. Skipping.")
-                if progress_callback:
-                    # When skipping, processed_count doesn't increment, but one item is removed from effective queue for this iteration.
-                    # current_total_for_progress should reflect this for accuracy if it's dynamic.
-                    # If sitemap, it remains initial_total_for_progress.
+            # Check if URL should be skipped using helper method
+            should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
+            if should_skip:
+                if progress_callback and skip_reason:
                     dynamic_total = (
                         initial_total_for_progress
                         if self.use_sitemap
                         else processed_count + len(urls_to_visit) + 1
                     )
-                    progress_callback(
-                        processed_count,
-                        dynamic_total,
-                        f"Skipped (visited): {current_url}",
-                    )
-                continue
-
-            if current_depth > self.max_depth:
-                logger.debug(
-                    f"Skipping URL {current_url} due to depth {current_depth} > max_depth {self.max_depth}"
-                )
+                    progress_callback(processed_count, dynamic_total, skip_reason)
                 continue
 
             self.logger.info(
@@ -289,52 +474,10 @@
             html_content = response.text
             soup = BeautifulSoup(html_content, "html.parser")
 
-            # Revert to original BeautifulSoup parsing logic for title, meta_description, meta_keywords
-            page_title_tag = soup.find("title")
-            page_title: Optional[str] = None
-            if isinstance(page_title_tag, Tag) and page_title_tag.string:
-                page_title = page_title_tag.string.strip()
-            else:
-                self.logger.debug(f"No title tag found for {current_url}")
-
-            meta_desc_tag = soup.find("meta", attrs={"name": "description"})
-            meta_description: Optional[str] = None
-            if isinstance(meta_desc_tag, Tag):
-                content = meta_desc_tag.get("content")
-                if isinstance(content, str):
-                    meta_description = content.strip()
-                elif isinstance(content, list):
-                    meta_description = " ".join(
-                        str(item) for item in content
-                    ).strip()
-                    self.logger.debug(
-                        f"Meta description for {current_url} was a list, joined: {meta_description}"
-                    )
-            else:
-                self.logger.debug(f"No meta description found for {current_url}")
-
-            meta_keywords_tag = soup.find("meta", attrs={"name": "keywords"})
-            meta_keywords: List[str] = []
-            if isinstance(meta_keywords_tag, Tag):
-                content = meta_keywords_tag.get("content")
-                raw_keywords_content: str = ""
-                if isinstance(content, str):
-                    raw_keywords_content = content
-                elif isinstance(content, list):
-                    raw_keywords_content = " ".join(str(item) for item in content)
-                    self.logger.debug(
-                        f"Meta keywords for {current_url} was a list, joined: {raw_keywords_content}"
-                    )
-
-                if raw_keywords_content:
-                    meta_keywords = [
-                        k.strip()
-                        for k in raw_keywords_content.split(",")
-                        if k.strip()
-                    ]
-                else:
-                    self.logger.debug(f"No meta keywords found for {current_url}")
-            # End reverted section
+            # Extract metadata using helper method
+            page_title, meta_description, meta_keywords = (
+                self._extract_page_metadata(soup, current_url)
+            )
 
             text_content = self._extract_text(soup)
 
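
Both the SSRFProtectionAdapter.send() override and the new checks in _is_valid_url reduce to the same core test: resolve the hostname, then reject private, loopback, link-local, and reserved addresses. A standalone sketch of that check (the helper name is_public_host is illustrative, not part of the codebase); the adapter repeats the check at request time so a changed DNS answer cannot slip past the earlier validation:

    import ipaddress
    import socket


    def is_public_host(hostname: str) -> bool:
        """Return True only if the hostname resolves to a public, routable IP."""
        try:
            ip = ipaddress.ip_address(socket.gethostbyname(hostname))
        except (socket.gaierror, ValueError, OSError):
            return False  # unresolvable or malformed hosts are treated as unsafe
        return not (
            ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved
        )


    print(is_public_host("localhost"))  # False - loopback
    print(is_public_host("10.0.0.1"))   # False - private range
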
ankigen_core/exceptions.py ADDED
@@ -0,0 +1,104 @@
+"""Custom exceptions for AnkiGen application.
+
+This module provides a hierarchy of custom exceptions to standardize
+error handling across the codebase.
+"""
+
+
+class AnkigenError(Exception):
+    """Base exception for all AnkiGen errors."""
+
+    pass
+
+
+class ValidationError(AnkigenError):
+    """Raised when input validation fails."""
+
+    pass
+
+
+class SecurityError(AnkigenError):
+    """Raised when a security check fails (SSRF, command injection, etc.)."""
+
+    pass
+
+
+class APIError(AnkigenError):
+    """Base exception for API-related errors."""
+
+    pass
+
+
+class OpenAIAPIError(APIError):
+    """Raised when OpenAI API calls fail."""
+
+    pass
+
+
+class Context7APIError(APIError):
+    """Raised when Context7 API calls fail."""
+
+    pass
+
+
+class CrawlerError(AnkigenError):
+    """Base exception for web crawler errors."""
+
+    pass
+
+
+class URLValidationError(CrawlerError):
+    """Raised when URL validation fails."""
+
+    pass
+
+
+class ContentExtractionError(CrawlerError):
+    """Raised when content extraction from web page fails."""
+
+    pass
+
+
+class ExportError(AnkigenError):
+    """Base exception for export-related errors."""
+
+    pass
+
+
+class CardGenerationError(AnkigenError):
+    """Raised when card generation fails."""
+
+    pass
+
+
+class ConfigurationError(AnkigenError):
+    """Raised when configuration is invalid or missing."""
+
+    pass
+
+
+def handle_exception(
+    exc: Exception,
+    logger,
+    message: str,
+    reraise: bool = True,
+    reraise_as: type[Exception] | None = None,
+) -> None:
+    """Standardized exception handler.
+
+    Args:
+        exc: The exception to handle
+        logger: Logger instance to use
+        message: Error message to log
+        reraise: Whether to re-raise the exception
+        reraise_as: Optional exception type to wrap and re-raise as
+
+    Raises:
+        The original exception or wrapped exception if reraise is True
+    """
+    logger.error(f"{message}: {exc}", exc_info=True)
+
+    if reraise:
+        if reraise_as:
+            raise reraise_as(f"{message}: {exc}") from exc
+        raise
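
A short usage sketch for the new handle_exception helper; parse_config and the logger setup are illustrative only:

    import logging

    from ankigen_core.exceptions import ConfigurationError, handle_exception

    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger("ankigen.example")


    def parse_config(raw: str) -> dict:
        try:
            key, value = raw.split("=", 1)
            return {key: value}
        except ValueError as exc:
            # Logs once with traceback, then re-raises wrapped in a domain error.
            handle_exception(
                exc,
                logger,
                "Could not parse config entry",
                reraise_as=ConfigurationError,
            )
            raise  # not reached; handle_exception re-raises above


    try:
        parse_config("missing-equals-sign")
    except ConfigurationError as err:
        print(f"Caught: {err}")
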
ankigen_core/exporters.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
 import pandas as pd
 import genanki
 import random
+import html
 from typing import List, Dict, Any, Optional
 import csv
 from datetime import datetime
@@ -23,6 +24,57 @@ def _format_field_as_string(value: Any) -> str:
     return str(value).strip()
 
 
+def _generate_timestamped_filename(
+    base_name: str, extension: str, include_timestamp: bool = True
+) -> str:
+    """Generate a filename with optional timestamp.
+
+    Args:
+        base_name: The base name for the file (without extension)
+        extension: File extension (e.g., 'csv', 'apkg')
+        include_timestamp: Whether to include timestamp in filename
+
+    Returns:
+        Generated filename with extension
+    """
+    if include_timestamp:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        return f"{base_name}_{timestamp}.{extension}"
+    return f"{base_name}.{extension}"
+
+
+def _ensure_output_directory(filepath: str) -> None:
+    """Ensure the output directory exists for the given filepath.
+
+    Args:
+        filepath: Full path to the file
+
+    Creates the directory if it doesn't exist.
+    """
+    output_dir = os.path.dirname(filepath)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        logger.info(f"Created output directory: {output_dir}")
+
+
+def _validate_non_empty_data(data: Any, data_type: str) -> None:
+    """Validate that data is not empty.
+
+    Args:
+        data: The data to validate (list, DataFrame, etc.)
+        data_type: Description of data type for error messages
+
+    Raises:
+        ValueError: If data is empty or None
+    """
+    if data is None:
+        raise ValueError(f"No {data_type} provided to export.")
+    if isinstance(data, list) and not data:
+        raise ValueError(f"No {data_type} provided to export.")
+    if isinstance(data, pd.DataFrame) and data.empty:
+        raise ValueError(f"No {data_type} available to export.")
+
+
 # --- Constants for APKG Generation (Subtask 10) ---
 ANKI_BASIC_MODEL_NAME = "AnkiGen Basic"
 ANKI_CLOZE_MODEL_NAME = "AnkiGen Cloze"
@@ -587,19 +639,18 @@ def export_cards_to_csv(
         KeyError: If a card dictionary is missing essential keys like 'front' or 'back'.
         ValueError: If the cards list is empty or not provided.
     """
-    if not cards:
-        logger.warning("export_cards_to_csv called with an empty list of cards.")
-        raise ValueError("No cards provided to export.")
+    # Validation using helper
+    _validate_non_empty_data(cards, "cards")
 
+    # Filename generation using helper
     if not filename:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        # Ensure filename is just the name, not a path if not intended
-        # For simplicity, this example saves in the current working directory if no path is specified.
-        filename = f"ankigen_cards_{timestamp}.csv"
+        filename = _generate_timestamped_filename("ankigen_cards", "csv")
         logger.info(f"No filename provided, generated: {filename}")
 
+    # Ensure output directory exists using helper
+    _ensure_output_directory(filename)
+
     # Define the fieldnames expected in the CSV.
-    # 'front' and 'back' are mandatory.
     fieldnames = ["front", "back", "tags", "note_type"]
 
     try:
@@ -611,7 +662,7 @@
             writer.writeheader()
             for i, card in enumerate(cards):
                 try:
-                    # Ensure mandatory fields exist, others are optional via card.get in row_to_write
+                    # Ensure mandatory fields exist
                     if "front" not in card or "back" not in card:
                         raise KeyError(
                             f"Card at index {i} is missing 'front' or 'back' key."
@@ -628,16 +679,13 @@
                     logger.error(
                         f"Skipping card due to KeyError: {e_inner}. Card data: {card}"
                     )
-                    # Optionally re-raise if one bad card should stop the whole export,
-                    # or continue to export valid cards.
-                    # For this implementation, we log and continue.
                     continue
             logger.info(f"Successfully exported cards to {filename}")
             return filename
     except IOError as e_io:
         logger.error(f"IOError during CSV export to {filename}: {e_io}", exc_info=True)
-        raise  # Re-raise the IOError
-    except Exception as e_general:  # Catch any other unexpected errors
+        raise
+    except Exception as e_general:
         logger.error(
             f"Unexpected error during CSV export to {filename}: {e_general}",
             exc_info=True,
@@ -664,16 +712,18 @@
         The path to the exported file.
     """
     logger.info(f"Starting APKG export for {len(cards)} cards to deck '{deck_name}'.")
+
+    # Validation using helper - note this now raises ValueError instead of gr.Error
+    _validate_non_empty_data(cards, "cards")
+
+    # Filename generation using helper
     if not filename:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"ankigen_deck_{timestamp}.apkg"
+        filename = _generate_timestamped_filename("ankigen_deck", "apkg")
     elif not filename.lower().endswith(".apkg"):
         filename += ".apkg"
 
-    output_dir = os.path.dirname(filename)
-    if output_dir and not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-        logger.info(f"Created output directory for APKG: {output_dir}")
+    # Ensure output directory exists using helper
+    _ensure_output_directory(filename)
 
     anki_basic_model = BASIC_MODEL
     anki_cloze_model = CLOZE_MODEL
@@ -687,20 +737,17 @@
         tags_for_note_object = card_dict.get("tags_for_note_object", [])
 
         # Extract all potential fields, defaulting to empty strings
-        question = card_dict.get("Question", "")
-        answer = card_dict.get("Answer", "")
-        explanation = card_dict.get("Explanation", "")
-        example = card_dict.get("Example", "")
-        prerequisites = card_dict.get("Prerequisites", "")
-        learning_outcomes = card_dict.get("Learning_Outcomes", "")
-        difficulty = card_dict.get("Difficulty", "")
-        source_url = card_dict.get("SourceURL", "")
-        tags_str_field = card_dict.get(
-            "TagsStr", ""
-        )  # This is the string for the model's TagsStr field
-
-        # The 'Question' field from card_dict is used as the main text for both basic and cloze.
-        # For cloze, this 'Question' field should contain the cloze-formatted text (e.g., "The capital of {{c1::France}} is Paris.")
+        # Security: Sanitize HTML to prevent XSS when viewing cards in Anki
+        question = html.escape(card_dict.get("Question", ""))
+        answer = html.escape(card_dict.get("Answer", ""))
+        explanation = html.escape(card_dict.get("Explanation", ""))
+        example = html.escape(card_dict.get("Example", ""))
+        prerequisites = html.escape(card_dict.get("Prerequisites", ""))
+        learning_outcomes = html.escape(card_dict.get("Learning_Outcomes", ""))
+        difficulty = html.escape(card_dict.get("Difficulty", ""))
+        source_url = html.escape(card_dict.get("SourceURL", ""))
+        tags_str_field = html.escape(card_dict.get("TagsStr", ""))
+
         if not question:
             logger.error(
                 f"SKIPPING CARD DUE TO EMPTY 'Question' (front/text) field. Card data: {card_dict}"
@@ -709,11 +756,10 @@
 
         try:
             if note_type.lower() == "cloze":
-                # CLOZE_MODEL fields: Text, Back Extra, Explanation, Example, Prerequisites,
-                # Learning_Outcomes, Difficulty, SourceURL, TagsStr
+                # CLOZE_MODEL fields
                 note_fields = [
-                    question,  # Text (this is the card_dict['Question'] which should be cloze-formatted)
-                    answer,  # Back Extra (this is card_dict['Answer'])
+                    question,  # Text
+                    answer,  # Back Extra
                     explanation,
                     example,
                     prerequisites,
@@ -728,8 +774,7 @@
                     tags=tags_for_note_object,
                 )
             else:  # Basic
-                # BASIC_MODEL fields: Question, Answer, Explanation, Example, Prerequisites,
-                # Learning_Outcomes, Difficulty, SourceURL, TagsStr
+                # BASIC_MODEL fields
                 note_fields = [
                     question,
                     answer,
@@ -755,24 +800,17 @@
             )
             logger.warning(f"Skipping card due to error: Question='{question[:50]}...'")
 
-    if notes_added_count == 0 and cards:  # Some cards were provided but none were added
-        logger.error(  # Changed to error for more visibility
+    if notes_added_count == 0:
+        logger.error(
            "No valid notes could be created from the provided cards. APKG generation aborted."
         )
-        # This error should be caught by the calling function in app.py to inform the user
         raise gr.Error("Failed to create any valid Anki notes from the input.")
-    elif not cards:  # No cards provided initially
-        logger.info("No cards provided to export to APKG. APKG generation skipped.")
-        # Depending on desired behavior, could raise or return a specific status/filename
-        # For now, let's assume an empty/default filename or None indicates no action if no cards
-        # However, the function is typed to return str, so raising is more consistent if no file is made.
-        raise gr.Error("No cards were provided to generate an APKG file.")
-    else:  # notes_added_count > 0
-        logger.info(
-            f"Added {notes_added_count} notes to deck '{deck_name}'. Proceeding to package."
-        )
 
-    # Only proceed to package and write if notes were successfully added
+    logger.info(
+        f"Added {notes_added_count} notes to deck '{deck_name}'. Proceeding to package."
+    )
+
+    # Package and write
     package = genanki.Package(anki_deck)
     try:
         package.write_to_file(filename)
@@ -846,18 +884,18 @@
     logger.info(
         f"Attempting to export DataFrame to CSV. Suggested filename: {filename_suggestion}"
     )
-    if data is None or data.empty:
+
+    # Validation using helper
+    try:
+        _validate_non_empty_data(data, "card data")
+    except ValueError:
         logger.warning(
             "No data provided to export_dataframe_to_csv. Skipping CSV export."
         )
-        raise gr.Error(
-            "No card data available"
-        )  # Notify user via Gradio with Error instead of Info
-        # return None  # This line is now unreachable due to the raise
+        raise gr.Error("No card data available")
 
     try:
-        # Create a specific filename using both suggestion and timestamp
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        # Generate filename from suggestion
         base_name_from_suggestion = "ankigen_cards"  # Default base part
 
         # Sanitize and use the suggestion (e.g., subject name) if provided
@@ -867,28 +905,23 @@
             safe_suggestion = (
                 processed_suggestion.replace(" ", "_")
                 .replace("/", "-")
-                .replace("\\\\", "-")
+                .replace("\\", "-")
             )
-            if (
-                safe_suggestion
-            ):  # If suggestion wasn't just '.csv' or empty after processing
+            if safe_suggestion:
                 base_name_from_suggestion = f"ankigen_{safe_suggestion[:50]}"
-            # If suggestion was empty or only '.csv', default base_name_from_suggestion remains 'ankigen_cards'
 
-        final_filename = f"{base_name_from_suggestion}_{timestamp}.csv"
+        # Generate timestamped filename using helper
+        final_filename = _generate_timestamped_filename(
+            base_name_from_suggestion, "csv"
+        )
 
-        # Ensure output directory exists if filename contains path
-        output_dir = os.path.dirname(final_filename)
-        if output_dir and not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-            logger.info(f"Created output directory for CSV: {output_dir}")
+        # Ensure output directory exists using helper
+        _ensure_output_directory(final_filename)
 
-        data.to_csv(final_filename, index=False)  # MODIFIED: Write to final_filename
+        data.to_csv(final_filename, index=False)
         logger.info(f"Successfully exported DataFrame to CSV: {final_filename}")
-        gr.Info(
-            f"CSV ready for download: {os.path.basename(final_filename)}"
-        )  # User-friendly message
-        return final_filename  # MODIFIED: Return final_filename
+        gr.Info(f"CSV ready for download: {os.path.basename(final_filename)}")
+        return final_filename
     except Exception as e:
         logger.error(f"Error exporting DataFrame to CSV: {e}", exc_info=True)
         gr.Error(f"Error exporting DataFrame to CSV: {e}")
@@ -902,9 +935,8 @@
     deck_name: str,
 ) -> str:
     """Exports a DataFrame of cards to an Anki .apkg file."""
-    if df.empty:
-        logger.warning("export_dataframe_to_apkg called with an empty DataFrame.")
-        raise ValueError("No cards in DataFrame to export.")
+    # Validation using helper
+    _validate_non_empty_data(df, "cards in DataFrame")
 
     logger.info(
         f"Starting APKG export for DataFrame with {len(df)} rows to deck '{deck_name}'. Output: {output_path}"
@@ -918,25 +950,17 @@
         )
         topic = _format_field_as_string(row.get("Topic", ""))
         difficulty_raw = _format_field_as_string(row.get("Difficulty", ""))
-        difficulty_plain_for_tag = strip_html_tags(
-            difficulty_raw
-        )  # Strip HTML for the tag
+        difficulty_plain_for_tag = strip_html_tags(difficulty_raw)
 
-        tags_list_for_note_obj = []  # For genanki.Note(tags=...)
+        tags_list_for_note_obj = []
         if topic:
             tags_list_for_note_obj.append(topic.replace(" ", "_").replace(",", "_"))
-        if difficulty_plain_for_tag:  # Use the plain text version for the tag
-            # Further sanitize for Anki tags: replace spaces with underscores, remove other invalid chars if any.
-            # Anki tags also often don't like colons or other special chars except underscore/hyphen.
-            # For now, just replacing space, as that's the error seen.
+        if difficulty_plain_for_tag:
            safe_difficulty_tag = difficulty_plain_for_tag.replace(" ", "_")
            tags_list_for_note_obj.append(safe_difficulty_tag)
 
-        tags_str_for_field = " ".join(
-            tags_list_for_note_obj
-        )  # For the 'TagsStr' model field
+        tags_str_for_field = " ".join(tags_list_for_note_obj)
 
-        # Prepare a dictionary that contains all possible fields our models might need.
         card_data_for_note = {
             "note_type": note_type_val,
             "tags_for_note_object": tags_list_for_note_obj,
@@ -949,7 +973,7 @@
             "Learning_Outcomes": _format_field_as_string(
                 row.get("Learning_Outcomes", "")
             ),
-            "Difficulty": difficulty_raw,  # Keep the original HTML for the 'Difficulty' field itself
+            "Difficulty": difficulty_raw,
             "SourceURL": _format_field_as_string(row.get("Source_URL", "")),
         }
         cards_for_apkg.append(card_data_for_note)
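
The three new module-private helpers centralise validation, timestamped filenames, and directory creation that the export functions previously duplicated, and html.escape now covers the card fields written into APKG notes. A hedged sketch of how they fit together (calling the underscore-prefixed helpers directly is for illustration only):

    import html

    from ankigen_core.exporters import (
        _ensure_output_directory,
        _generate_timestamped_filename,
        _validate_non_empty_data,
    )

    cards = [{"front": "What is 2 + 2?", "back": "<b>4</b>"}]

    # Raises ValueError on None or an empty list/DataFrame.
    _validate_non_empty_data(cards, "cards")

    # e.g. "exports/ankigen_cards_20250101_120000.csv"
    filename = "exports/" + _generate_timestamped_filename("ankigen_cards", "csv")
    _ensure_output_directory(filename)  # creates exports/ if it does not exist

    # Mirrors the APKG path: escape HTML so field content renders as literal text.
    print(html.escape(cards[0]["back"]))  # -> &lt;b&gt;4&lt;/b&gt;
    print(filename)
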
ankigen_core/llm_interface.py CHANGED
@@ -74,6 +74,52 @@ class OpenAIClientManager:
         )
         return self._client
 
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - cleanup resources."""
+        self.close()
+        return False
+
+    async def __aenter__(self):
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit - cleanup resources."""
+        await self.aclose()
+        return False
+
+    def close(self) -> None:
+        """Close the OpenAI client synchronously."""
+        if self._client:
+            try:
+                # OpenAI client has a close method for cleanup
+                if hasattr(self._client, "close"):
+                    self._client.close()
+                    logger.debug("OpenAI client closed")
+            except Exception as e:
+                logger.warning(f"Error closing OpenAI client: {e}")
+            finally:
+                self._client = None
+
+    async def aclose(self) -> None:
+        """Close the OpenAI client asynchronously."""
+        if self._client:
+            try:
+                # OpenAI async client has an aclose method
+                if hasattr(self._client, "aclose"):
+                    await self._client.aclose()
+                elif hasattr(self._client, "close"):
+                    self._client.close()
+                logger.debug("OpenAI client closed (async)")
+            except Exception as e:
+                logger.warning(f"Error closing OpenAI client: {e}")
+            finally:
+                self._client = None
+
 
 # Retry decorator for API calls - kept similar to original
 @retry(
@@ -114,6 +160,7 @@ async def structured_output_completion(
 ):
     effective_system_prompt = f"{system_prompt}\nProvide your response as a JSON object matching the specified schema."
 
+    # Security: Add timeout to prevent indefinite hanging
     completion = await openai_client.chat.completions.create(
         model=model,
         messages=[
@@ -122,6 +169,7 @@
         ],
         response_format=response_format,  # Pass the dict directly
         temperature=0.7,  # Consider making this configurable
+        timeout=120.0,  # 120 second timeout
     )
 
     if not hasattr(completion, "choices") or not completion.choices:
@@ -252,8 +300,30 @@ async def process_crawled_page(
     custom_system_prompt: Optional[str] = None,
     custom_user_prompt_template: Optional[str] = None,
     max_prompt_content_tokens: int = 6000,
+    cache: Optional[ResponseCache] = None,
 ) -> List[Card]:
-    """Process a crawled page and extract structured Card objects using OpenAI."""
+    """Process a crawled page and extract structured Card objects using OpenAI.
+
+    Args:
+        openai_client: The OpenAI client instance
+        page: The crawled page to process
+        model: The model to use for generation
+        custom_system_prompt: Optional custom system prompt
+        custom_user_prompt_template: Optional custom user prompt template
+        max_prompt_content_tokens: Maximum tokens for content
+        cache: Optional ResponseCache for page-level caching
+
+    Returns:
+        List of generated Card objects
+    """
+    # Check page-level cache first
+    if cache:
+        cache_key = f"{page.url}:{model}"
+        cached_cards = cache.get(cache_key, "page_cache")
+        if cached_cards is not None:
+            logger.info(f"Using cached cards for page: {page.url}")
+            return cached_cards
+
     logger.info(
         f"Processing page: {page.url} with model {model}, max_prompt_content_tokens: {max_prompt_content_tokens}"
     )
@@ -362,6 +432,7 @@ Generate a few high-quality Anki cards from this content.
         f"Attempting to generate cards for {page.url} using model {model}."
     )
     response_format_param = {"type": "json_object"}
+    # Security: Add timeout to prevent indefinite hanging
     response_data = await openai_client.chat.completions.create(
         model=model,
         messages=[
@@ -370,6 +441,7 @@
         ],
         response_format=response_format_param,
         temperature=0.5,
+        timeout=120.0,  # 120 second timeout
     )
 
     if (
@@ -466,6 +538,12 @@
         logger.info(
             f"Successfully generated {len(validated_cards)} Cards from {page.url}."
         )
+        # Cache successful results for page-level caching
+        if cache:
+            cache_key = f"{page.url}:{model}"
+            cache.set(cache_key, "page_cache", validated_cards)
+            logger.debug(f"Cached {len(validated_cards)} cards for {page.url}")
+
         return validated_cards
 
     except json.JSONDecodeError as e:
@@ -509,6 +587,7 @@ async def process_crawled_pages(
     custom_system_prompt: Optional[str] = None,
     custom_user_prompt_template: Optional[str] = None,
     progress_callback: Optional[Callable[[int, int], None]] = None,
+    cache: Optional[ResponseCache] = None,
 ) -> List[Card]:
     if not pages:
         logger.info("No pages provided to process_crawled_pages.")
@@ -536,6 +615,7 @@
             custom_system_prompt=custom_system_prompt,
             custom_user_prompt_template=custom_user_prompt_template,
             max_prompt_content_tokens=max_prompt_content_tokens,
+            cache=cache,
         )
         if page_cards is None:
             logger.warning(
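
OpenAIClientManager now works as a synchronous or asynchronous context manager, and the page-processing functions take an optional ResponseCache so repeat URLs can skip the model call. A rough usage sketch; it assumes the manager's constructor takes no arguments, which this diff does not show:

    import asyncio

    from ankigen_core.llm_interface import OpenAIClientManager
    from ankigen_core.utils import ResponseCache


    async def main() -> None:
        # A single cache instance can be threaded through
        # process_crawled_pages(..., cache=cache) across batches of pages.
        cache = ResponseCache(maxsize=256)

        # __aexit__ awaits aclose(), so the underlying OpenAI client is
        # released even if card generation raises.
        async with OpenAIClientManager() as manager:
            print("manager active:", manager is not None)
            # ... initialize the client and process crawled pages here ...

        # The synchronous form is also available:
        with OpenAIClientManager() as manager:
            pass


    asyncio.run(main())
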
ankigen_core/ui_logic.py CHANGED
@@ -250,18 +250,16 @@ def use_selected_subjects(subjects_df: pd.DataFrame | None):
     )
 
 
-def create_crawler_main_mode_elements() -> (
-    Tuple[
-        List[gr.components.Component],  # ui_components (url_input, max_depth, etc.)
-        gr.Button,  # crawl_button
-        gr.Progress,  # progress_bar
-        gr.Textbox,  # progress_status_textbox
-        gr.Textbox,  # custom_system_prompt
-        gr.Textbox,  # custom_user_prompt_template
-        gr.Checkbox,  # use_sitemap_checkbox
-        gr.Textbox,  # sitemap_url_textbox
-    ]
-):
+def create_crawler_main_mode_elements() -> Tuple[
+    List[gr.components.Component],  # ui_components (url_input, max_depth, etc.)
+    gr.Button,  # crawl_button
+    gr.Progress,  # progress_bar
+    gr.Textbox,  # progress_status_textbox
+    gr.Textbox,  # custom_system_prompt
+    gr.Textbox,  # custom_user_prompt_template
+    gr.Checkbox,  # use_sitemap_checkbox
+    gr.Textbox,  # sitemap_url_textbox
+]:
     """Creates the UI components for the Web Crawler mode integrated into the main tab."""
     ui_components: List[gr.components.Component] = []
 
ankigen_core/utils.py CHANGED
@@ -6,7 +6,6 @@ import sys
 import hashlib
 import requests
 from bs4 import BeautifulSoup
-from functools import lru_cache
 from typing import Any, Optional
 import time
 
@@ -14,7 +13,7 @@ import time
 _logger_instance = None
 
 
-def setup_logging():
+def setup_logging() -> logging.Logger:
     """Configure logging to both file and console"""
     global _logger_instance
     if _logger_instance:
@@ -49,7 +48,7 @@ def setup_logging():
     return logger
 
 
-def get_logger():
+def get_logger() -> logging.Logger:
     """Returns the initialized logger instance."""
     if _logger_instance is None:
         return setup_logging()
@@ -62,39 +61,65 @@ logger = get_logger()
 
 # --- Caching ---
 class ResponseCache:
-    """A simple cache for API responses using LRU for get operations."""
+    """Simple and efficient LRU cache for API responses with proper eviction."""
 
-    def __init__(self, maxsize=128):
-        # This internal method will be decorated by lru_cache
-        self._internal_get_from_dict = self._get_from_dict_actual
-        self._lru_cached_get = lru_cache(maxsize=maxsize)(self._internal_get_from_dict)
-        self._dict_cache = {}  # Main store for set operations
-
-    def _get_from_dict_actual(self, cache_key: str):
-        """Actual dictionary lookup, intended to be wrapped by lru_cache."""
-        logger.debug(f"Cache DICT GET: key={cache_key}")
-        return self._dict_cache.get(cache_key)
+    def __init__(self, maxsize: int = 128):
+        self.maxsize = maxsize
+        self._cache = {}  # {key: response}
+        self._access_order = []  # Track access order for LRU eviction
+        self.hits = 0
+        self.misses = 0
 
     def get(self, prompt: str, model: str) -> Optional[Any]:
-        """Retrieves an item from the cache. Uses LRU for this get path."""
+        """Retrieve item from cache, updating LRU order."""
         cache_key = self._create_key(prompt, model)
-        # Use the LRU cached getter which looks up in _dict_cache
-        return self._lru_cached_get(cache_key)
+
+        if cache_key in self._cache:
+            # Move to end (most recently used)
+            self._access_order.remove(cache_key)
+            self._access_order.append(cache_key)
+            self.hits += 1
+            logger.debug(
+                f"Cache HIT: {cache_key[:16]}... (hits={self.hits}, misses={self.misses})"
+            )
+            return self._cache[cache_key]
+
+        self.misses += 1
+        logger.debug(
+            f"Cache MISS: {cache_key[:16]}... (hits={self.hits}, misses={self.misses})"
+        )
+        return None
 
     def set(self, prompt: str, model: str, response: Any):
-        """Sets an item in the cache."""
+        """Store item in cache with LRU eviction when full."""
         cache_key = self._create_key(prompt, model)
-        logger.debug(f"Cache SET: key={cache_key}, type={type(response)}")
-        self._dict_cache[cache_key] = response
-        # To make the LRU cache aware of this new item for subsequent gets:
-        # We can call the LRU getter so it caches it, or clear specific lru entry if updating.
-        # For simplicity, if a new item is set, a subsequent get will fetch and cache it via LRU.
-        # Or, we can "prime" the lru_cache, but that's more complex.
-        # Current approach: set updates _dict_cache. Next get for this key will use _lru_cached_get,
-        # which will fetch from _dict_cache and then be LRU-managed.
+
+        # If key exists, update and move to end
+        if cache_key in self._cache:
+            self._access_order.remove(cache_key)
+        # If cache is full, evict least recently used
+        elif len(self._cache) >= self.maxsize:
+            evicted_key = self._access_order.pop(0)
+            del self._cache[evicted_key]
+            logger.debug(
+                f"Cache EVICT: {evicted_key[:16]}... (size={len(self._cache)})"
+            )
+
+        self._cache[cache_key] = response
+        self._access_order.append(cache_key)
+        logger.debug(f"Cache SET: {cache_key[:16]}... (size={len(self._cache)})")
+
+    def clear(self) -> None:
+        """Clear all cache entries and statistics."""
+        self._cache.clear()
+        self._access_order.clear()
+        self.hits = 0
+        self.misses = 0
+        logger.debug("Cache CLEARED")
 
     def _create_key(self, prompt: str, model: str) -> str:
-        """Creates a unique MD5 hash key for caching."""
+        """Create cache key from prompt and model (MD5 hash for size efficiency)."""
+        # Hash to keep keys manageable size while maintaining uniqueness
        return hashlib.md5(f"{model}:{prompt}".encode("utf-8")).hexdigest()
 
 
@@ -178,7 +203,7 @@ class RateLimiter:
         self.last_request_timestamp: float = 0.0
         # Use a lock if this were to be used by multiple threads, but for now assuming single thread access per instance
 
-    def wait(self):
+    def wait(self) -> None:
         """Blocks until it's safe to make the next request."""
         current_time = time.monotonic()  # Use monotonic clock for intervals
         time_since_last_request = current_time - self.last_request_timestamp
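
The rewritten ResponseCache tracks access order explicitly so set() can evict the least recently used entry once maxsize is reached, and it now exposes hit/miss counters plus clear(). A small usage sketch with a deliberately tiny cache (the prompt and model strings are arbitrary keys):

    from ankigen_core.utils import ResponseCache

    cache = ResponseCache(maxsize=2)

    cache.set("prompt-a", "model-x", {"cards": 3})
    cache.set("prompt-b", "model-x", {"cards": 5})

    cache.get("prompt-a", "model-x")  # hit: promotes prompt-a to most recent
    cache.set("prompt-c", "model-x", {"cards": 1})  # evicts prompt-b (the LRU entry)

    print(cache.get("prompt-b", "model-x"))  # None - evicted
    print(cache.get("prompt-a", "model-x"))  # {'cards': 3} - still cached
    print(f"hits={cache.hits} misses={cache.misses}")  # hits=2 misses=1

Each hit pays a list.remove on the access-order list, which is O(n) in the number of cached entries; at the default maxsize of 128 that cost is negligible.
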
pyproject.toml CHANGED
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "ankigen"
 version = "0.2.0"
-description = ""
+description = "AI-powered Anki flashcard generator using OpenAI GPT models with CLI and web interface"
 authors = [
     { name = "Justin", email = "[email protected]" },
 ]