Upload folder using huggingface_hub
Files changed:
- .gitignore +2 -1
- .pre-commit-config.yaml +1 -1
- ankigen_core/agents/generators.py +7 -6
- ankigen_core/card_generator.py +2 -2
- ankigen_core/context7.py +60 -7
- ankigen_core/crawler.py +225 -82
- ankigen_core/exceptions.py +104 -0
- ankigen_core/exporters.py +117 -93
- ankigen_core/llm_interface.py +81 -1
- ankigen_core/ui_logic.py +10 -12
- ankigen_core/utils.py +53 -28
- pyproject.toml +1 -1
.gitignore
CHANGED
@@ -197,4 +197,5 @@ tasks/
scripts/

.taskmasterconfig
-.cursor
+.cursor
+.serena/
.pre-commit-config.yaml
CHANGED
@@ -1,6 +1,6 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.
+    rev: v0.13.1  # Updated to match pyproject.toml version
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
ankigen_core/agents/generators.py
CHANGED
@@ -108,9 +108,10 @@ class SubjectExpertAgent(BaseAgentWrapper):
                f"Generating batch {batch_num}: {cards_in_this_batch} cards"
            )

-            #
-
-
+            # Initialize agent only once - Runner.run() creates fresh context each time
+            # No conversation history accumulation across batches (significant performance gain)
+            if not self.agent:
+                await self.initialize()

            user_input = (
                f"Generate {cards_in_this_batch} flashcards for the topic: {topic}"
@@ -158,13 +159,13 @@ class SubjectExpertAgent(BaseAgentWrapper):
            batch_num += 1

            logger.info(
-                f"Batch {batch_num-1} generated {len(batch_cards)} cards. {cards_remaining} cards remaining."
+                f"Batch {batch_num - 1} generated {len(batch_cards)} cards. {cards_remaining} cards remaining."
            )

            # Safety check to prevent infinite loops
            if len(batch_cards) == 0:
                logger.warning(
-                    f"No cards generated in batch {batch_num-1}, stopping generation"
+                    f"No cards generated in batch {batch_num - 1}, stopping generation"
                )
                break

@@ -175,7 +176,7 @@ class SubjectExpertAgent(BaseAgentWrapper):
        )

        logger.info(
-            f"✅ Generated {len(all_cards)} cards total across {batch_num-1} batches for topic '{topic}'"
+            f"✅ Generated {len(all_cards)} cards total across {batch_num - 1} batches for topic '{topic}'"
        )
        return all_cards
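The batching change above initializes the agent once and then loops until the requested number of cards is produced, guarding against empty batches. A minimal sketch of that loop shape, where generate_batch is a hypothetical stand-in for the real agent call:

import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def generate_batch(topic: str, n: int) -> list[str]:
    # Stand-in for the real agent call; returns n placeholder cards.
    await asyncio.sleep(0)
    return [f"{topic} card" for _ in range(n)]


async def generate_all(topic: str, total: int, batch_size: int = 5) -> list[str]:
    all_cards: list[str] = []
    cards_remaining = total
    batch_num = 1
    while cards_remaining > 0:
        n = min(batch_size, cards_remaining)
        batch_cards = await generate_batch(topic, n)
        if len(batch_cards) == 0:
            # Safety check to prevent infinite loops when nothing comes back
            logger.warning("No cards generated in batch %d, stopping", batch_num)
            break
        all_cards.extend(batch_cards)
        cards_remaining -= len(batch_cards)
        logger.info("Batch %d generated %d cards, %d remaining",
                    batch_num, len(batch_cards), cards_remaining)
        batch_num += 1
    return all_cards


if __name__ == "__main__":
    print(len(asyncio.run(generate_all("python", 12))))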
ankigen_core/card_generator.py
CHANGED
@@ -312,9 +312,9 @@ def generate_cards_from_crawled_content(
    for i, card_obj in enumerate(all_cards):
        # Extract data, assuming it's already plain text from Card object creation
        topic = (
-            card_obj.metadata.get("topic", f"Crawled Content - Card {i+1}")
+            card_obj.metadata.get("topic", f"Crawled Content - Card {i + 1}")
            if card_obj.metadata
-            else f"Crawled Content - Card {i+1}"
+            else f"Crawled Content - Card {i + 1}"
        )

        # Ensure list-based metadata are joined as plain strings for DataFrame
ankigen_core/context7.py
CHANGED
@@ -4,19 +4,37 @@ import asyncio
import subprocess
import json
from typing import Optional, Dict, Any
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+)
from ankigen_core.logging import logger
+from ankigen_core.exceptions import (
+    ValidationError,
+)
+
+MAX_STRING_LENGTH = 200  # Prevent excessively long inputs
+SUBPROCESS_TIMEOUT = 60.0  # 60 second timeout for Context7 calls


class Context7Client:
    """Context7 MCP client for fetching library documentation"""

    def __init__(self):
-
-
+        pass  # No state needed - each call creates fresh subprocess
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=10),
+        retry=retry_if_exception_type((TimeoutError, ConnectionError)),
+        reraise=True,
+    )
    async def call_context7_tool(
        self, tool_name: str, args: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
-        """Call a Context7 tool via direct JSONRPC"""
+        """Call a Context7 tool via direct JSONRPC with retry logic"""
        try:
            # Build the JSONRPC request
            request = {
@@ -47,9 +65,35 @@ class Context7Client:
                },
            }

-            # Send both requests
-
-
+            # Send both requests with timeout protection
+            # Optimize: Use list join for string concatenation
+            input_data = "\n".join([json.dumps(init_request), json.dumps(request), ""])
+            try:
+                stdout, stderr = await asyncio.wait_for(
+                    process.communicate(input=input_data.encode()),
+                    timeout=SUBPROCESS_TIMEOUT,
+                )
+            except asyncio.TimeoutError:
+                # Proper process cleanup on timeout
+                try:
+                    if process.returncode is None:  # Process still running
+                        process.kill()
+                        # Wait for process to actually terminate
+                        await asyncio.wait_for(process.wait(), timeout=5.0)
+                except Exception as cleanup_error:
+                    logger.error(f"Error during process cleanup: {cleanup_error}")
+                raise TimeoutError(
+                    f"Context7 subprocess timed out after {SUBPROCESS_TIMEOUT}s"
+                )
+            except Exception:
+                # Clean up process on any other error
+                try:
+                    if process.returncode is None:
+                        process.kill()
+                        await asyncio.wait_for(process.wait(), timeout=5.0)
+                except Exception:
+                    pass  # Best effort cleanup
+                raise

            # Parse responses
            responses = stdout.decode().strip().split("\n")
@@ -204,6 +248,15 @@ class Context7Client:
        self, library_id: str, topic: Optional[str] = None, tokens: int = 5000
    ) -> Optional[str]:
        """Get documentation for a library"""
+        # Security: Validate library_id (should start with /)
+        if (
+            not library_id
+            or not library_id.startswith("/")
+            or len(library_id) > MAX_STRING_LENGTH
+        ):
+            logger.error(f"Invalid library ID format (security): '{library_id}'")
+            raise ValidationError("Invalid library ID format")
+
        logger.info(
            f"Fetching docs for: {library_id}" + (f" (topic: {topic})" if topic else "")
        )
@@ -233,7 +286,7 @@ class Context7Client:
        return await self.get_library_docs(library_id, topic, tokens)


-async def test_context7():
+async def test_context7() -> None:
    """Test the Context7 integration"""
    client = Context7Client()
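The context7.py changes layer two protections: tenacity retries around the whole tool call and an asyncio.wait_for timeout around the subprocess exchange. A minimal sketch of how those two pieces compose, with a dummy coroutine in place of the real JSONRPC round-trip (the names and the 5-second timeout are illustrative):

import asyncio
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

SUBPROCESS_TIMEOUT = 5.0  # far shorter than the 60s used in context7.py, for demonstration


async def flaky_call() -> str:
    # Stand-in for process.communicate(); the real call may hang or fail.
    await asyncio.sleep(0.1)
    return "ok"


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type((TimeoutError, ConnectionError)),
    reraise=True,
)
async def call_with_timeout() -> str:
    try:
        # wait_for cancels the awaited task once the timeout expires
        return await asyncio.wait_for(flaky_call(), timeout=SUBPROCESS_TIMEOUT)
    except asyncio.TimeoutError:
        # Re-raise as the builtin TimeoutError that the retry predicate looks for
        raise TimeoutError(f"call timed out after {SUBPROCESS_TIMEOUT}s")


if __name__ == "__main__":
    print(asyncio.run(call_with_timeout()))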
ankigen_core/crawler.py
CHANGED
@@ -1,13 +1,61 @@
import requests
+from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup, Tag
from urllib.parse import urljoin, urlparse
import re
+import ipaddress
+import socket
from typing import List, Set, Optional, Callable, Tuple
import xml.etree.ElementTree as ET  # Added for Sitemap parsing

from ankigen_core.models import CrawledPage
from ankigen_core.utils import RateLimiter, get_logger
from ankigen_core.logging import logger  # Added
+from ankigen_core.exceptions import (
+    SecurityError,
+)
+
+# Security: Maximum URL length to prevent abuse
+MAX_URL_LENGTH = 2048
+
+
+class SSRFProtectionAdapter(HTTPAdapter):
+    """
+    Custom HTTP adapter that prevents SSRF attacks by validating
+    IP addresses at connection time (prevents DNS rebinding attacks).
+    """
+
+    def send(self, request, **kwargs) -> requests.Response:
+        """Override send to validate IP before making request."""
+        # Parse the URL to get hostname
+        parsed = urlparse(request.url)
+        hostname = parsed.hostname
+
+        if hostname:
+            try:
+                # Resolve hostname to IP at request time (prevents DNS rebinding)
+                ip_str = socket.gethostbyname(hostname)
+                ip = ipaddress.ip_address(ip_str)
+
+                # Block private, loopback, link-local, and reserved addresses
+                if (
+                    ip.is_private
+                    or ip.is_loopback
+                    or ip.is_link_local
+                    or ip.is_reserved
+                ):
+                    msg = f"SSRF protection: Blocked request to private IP {ip_str} for hostname {hostname}"
+                    logger.error(msg)
+                    raise SecurityError(msg)
+            except (socket.gaierror, ValueError) as e:
+                logger.error(
+                    f"SSRF protection: DNS resolution failed for {hostname}: {e}"
+                )
+                raise requests.exceptions.ConnectionError(
+                    f"DNS resolution failed for {hostname}"
+                )
+
+        return super().send(request, **kwargs)


class WebCrawler:
@@ -41,20 +89,83 @@ class WebCrawler:
        self.logger = get_logger()
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": self.user_agent})
+
+        # Security: Add SSRF protection adapter to prevent DNS rebinding attacks
+        # Performance: Configure connection pooling (10 connections per host, 20 total)
+        ssrf_adapter = SSRFProtectionAdapter(pool_connections=10, pool_maxsize=20)
+        self.session.mount("http://", ssrf_adapter)
+        self.session.mount("https://", ssrf_adapter)
+
        self.rate_limiter = RateLimiter(self.requests_per_second)

+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - cleanup resources."""
+        self.close()
+        return False
+
+    def close(self) -> None:
+        """Close the requests session and cleanup resources."""
+        if hasattr(self, "session") and self.session:
+            self.session.close()
+            self.logger.debug("WebCrawler session closed")
+
    def _is_valid_url(self, url: str) -> bool:
        """
-        Checks if the URL is valid for crawling
+        Checks if the URL is valid for crawling with SSRF protection.
+        Validates scheme, domain, patterns, and blocks private IP ranges.
        """
        try:
+            # Security: URL length check
+            if len(url) > MAX_URL_LENGTH:
+                logger.warning(
+                    f"URL exceeds maximum length ({MAX_URL_LENGTH}): {url[:100]}..."
+                )
+                return False
+
            parsed_url = urlparse(url)
+
+            # Security: Protocol whitelist (http/https only)
            if not parsed_url.scheme or parsed_url.scheme.lower() not in [
                "http",
                "https",
            ]:
                logger.debug(f"Invalid scheme for URL: {url}")
                return False
+
+            # Security: SSRF protection - block private IP ranges
+            hostname = parsed_url.hostname
+            if not hostname:
+                logger.warning(f"URL missing hostname: {url}")
+                return False
+
+            # Resolve hostname to IP and check if it's private
+            try:
+                # Get IP address for hostname
+                ip_str = socket.gethostbyname(hostname)
+                ip = ipaddress.ip_address(ip_str)
+
+                # Block private, loopback, link-local, and reserved addresses
+                if (
+                    ip.is_private
+                    or ip.is_loopback
+                    or ip.is_link_local
+                    or ip.is_reserved
+                ):
+                    logger.error(
+                        f"SSRF protection: Blocked private/internal IP {ip_str} for hostname {hostname}"
+                    )
+                    return False
+
+            except (socket.gaierror, ValueError, OSError) as e:
+                # DNS resolution failed or invalid IP
+                logger.warning(f"Could not resolve hostname {hostname}: {e}")
+                return False
+
+            # Domain check
            if parsed_url.netloc != self.base_domain:
                logger.debug(f"URL {url} not in base domain {self.base_domain}")
                return False
@@ -76,6 +187,10 @@ class WebCrawler:
        except ValueError:  # Handle potential errors from urlparse on malformed URLs
            logger.warning(f"ValueError when parsing URL: {url}", exc_info=True)
            return False
+        except Exception as e:
+            logger.error(f"Unexpected error validating URL {url}: {e}", exc_info=True)
+            return False
+
        return True

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
@@ -194,40 +309,122 @@ class WebCrawler:

    # --- End Sitemap Processing Methods ---

-    def
-
-
+    def _initialize_crawl_queue(self) -> List[Tuple[str, int, Optional[str]]]:
+        """Initialize the crawl queue from sitemap or start URL.
+
+        Returns:
+            List of tuples (url, depth, parent_url) to visit
+        """
        urls_to_visit: List[Tuple[str, int, Optional[str]]] = []
-        crawled_pages: List[CrawledPage] = []
-        initial_total_for_progress = 0

        if self.use_sitemap and self.sitemap_url:
            self.logger.info(f"Attempting to use sitemap: {self.sitemap_url}")
            sitemap_extracted_urls = self._get_urls_from_sitemap()
            if sitemap_extracted_urls:
                for url in sitemap_extracted_urls:
-                    if self._is_valid_url(
-                        url
-                    ):  # Checks domain, include/exclude patterns
-                        urls_to_visit.append(
-                            (url, 0, None)
-                        )  # Add with depth 0 and None parent
+                    if self._is_valid_url(url):
+                        urls_to_visit.append((url, 0, None))
                self.logger.info(
                    f"Initialized {len(urls_to_visit)} URLs to visit from sitemap after validation."
                )
-                initial_total_for_progress = len(urls_to_visit)
            else:
                self.logger.warning(
-                    "Sitemap processing yielded no URLs
+                    "Sitemap processing yielded no URLs. Falling back to start_url."
                )
-                # Fallback to start_url if sitemap is empty or fails
                if self._is_valid_url(self.start_url):
-                    urls_to_visit.append((self.start_url, 0, None))
-                    initial_total_for_progress = len(urls_to_visit)
+                    urls_to_visit.append((self.start_url, 0, None))
        else:
            if self._is_valid_url(self.start_url):
-                urls_to_visit.append((self.start_url, 0, None))
-
+                urls_to_visit.append((self.start_url, 0, None))
+
+        return urls_to_visit
+
+    def _extract_page_metadata(
+        self, soup: BeautifulSoup, url: str
+    ) -> Tuple[Optional[str], Optional[str], List[str]]:
+        """Extract title, meta description, and meta keywords from page.
+
+        Args:
+            soup: BeautifulSoup object of the page
+            url: URL being processed (for logging)
+
+        Returns:
+            Tuple of (title, meta_description, meta_keywords_list)
+        """
+        # Extract title
+        page_title_tag = soup.find("title")
+        page_title: Optional[str] = None
+        if isinstance(page_title_tag, Tag) and page_title_tag.string:
+            page_title = page_title_tag.string.strip()
+        else:
+            self.logger.debug(f"No title tag found for {url}")
+
+        # Extract meta description
+        meta_desc_tag = soup.find("meta", attrs={"name": "description"})
+        meta_description: Optional[str] = None
+        if isinstance(meta_desc_tag, Tag):
+            content = meta_desc_tag.get("content")
+            if isinstance(content, str):
+                meta_description = content.strip()
+            elif isinstance(content, list):
+                meta_description = " ".join(str(item) for item in content).strip()
+                self.logger.debug(
+                    f"Meta description for {url} was a list, joined: {meta_description}"
+                )
+        else:
+            self.logger.debug(f"No meta description found for {url}")
+
+        # Extract meta keywords
+        meta_keywords_tag = soup.find("meta", attrs={"name": "keywords"})
+        meta_keywords: List[str] = []
+        if isinstance(meta_keywords_tag, Tag):
+            content_kw = meta_keywords_tag.get("content")
+            raw_keywords_content: str = ""
+            if isinstance(content_kw, str):
+                raw_keywords_content = content_kw
+            elif isinstance(content_kw, list):
+                raw_keywords_content = " ".join(str(item) for item in content_kw)
+                self.logger.debug(
+                    f"Meta keywords for {url} was a list, joined: {raw_keywords_content}"
+                )
+
+            if raw_keywords_content:
+                meta_keywords = [
+                    k.strip() for k in raw_keywords_content.split(",") if k.strip()
+                ]
+        else:
+            self.logger.debug(f"No meta keywords found for {url}")
+
+        return page_title, meta_description, meta_keywords
+
+    def _should_skip_url(self, url: str, depth: int) -> Tuple[bool, Optional[str]]:
+        """Check if URL should be skipped.
+
+        Args:
+            url: URL to check
+            depth: Current depth of URL
+
+        Returns:
+            Tuple of (should_skip, skip_reason)
+        """
+        if url in self.visited_urls:
+            return True, f"Skipped (visited): {url}"
+
+        if depth > self.max_depth:
+            logger.debug(
+                f"Skipping URL {url} due to depth {depth} > max_depth {self.max_depth}"
+            )
+            return True, f"Skipped (max depth): {url}"
+
+        return False, None
+
+    def crawl(
+        self, progress_callback: Optional[Callable[[int, int, str], None]] = None
+    ) -> List[CrawledPage]:
+        # Initialize URLs using helper method
+        urls_to_visit = self._initialize_crawl_queue()
+        crawled_pages: List[CrawledPage] = []
+        initial_total_for_progress = len(urls_to_visit)

        processed_count = 0
        while urls_to_visit:
@@ -246,28 +443,16 @@ class WebCrawler:
                current_url,
            )

-            if
-
-
-
-            # current_total_for_progress should reflect this for accuracy if it's dynamic.
-            # If sitemap, it remains initial_total_for_progress.
+            # Check if URL should be skipped using helper method
+            should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
+            if should_skip:
+                if progress_callback and skip_reason:
                    dynamic_total = (
                        initial_total_for_progress
                        if self.use_sitemap
                        else processed_count + len(urls_to_visit) + 1
                    )
-            progress_callback(
-                processed_count,
-                dynamic_total,
-                f"Skipped (visited): {current_url}",
-            )
-            continue
-
-        if current_depth > self.max_depth:
-            logger.debug(
-                f"Skipping URL {current_url} due to depth {current_depth} > max_depth {self.max_depth}"
-            )
+                    progress_callback(processed_count, dynamic_total, skip_reason)
                continue

            self.logger.info(
@@ -289,52 +474,10 @@ class WebCrawler:
            html_content = response.text
            soup = BeautifulSoup(html_content, "html.parser")

-            #
-
-
-
-                page_title = page_title_tag.string.strip()
-            else:
-                self.logger.debug(f"No title tag found for {current_url}")
-
-            meta_desc_tag = soup.find("meta", attrs={"name": "description"})
-            meta_description: Optional[str] = None
-            if isinstance(meta_desc_tag, Tag):
-                content = meta_desc_tag.get("content")
-                if isinstance(content, str):
-                    meta_description = content.strip()
-                elif isinstance(content, list):
-                    meta_description = " ".join(
-                        str(item) for item in content
-                    ).strip()
-                    self.logger.debug(
-                        f"Meta description for {current_url} was a list, joined: {meta_description}"
-                    )
-            else:
-                self.logger.debug(f"No meta description found for {current_url}")
-
-            meta_keywords_tag = soup.find("meta", attrs={"name": "keywords"})
-            meta_keywords: List[str] = []
-            if isinstance(meta_keywords_tag, Tag):
-                content = meta_keywords_tag.get("content")
-                raw_keywords_content: str = ""
-                if isinstance(content, str):
-                    raw_keywords_content = content
-                elif isinstance(content, list):
-                    raw_keywords_content = " ".join(str(item) for item in content)
-                    self.logger.debug(
-                        f"Meta keywords for {current_url} was a list, joined: {raw_keywords_content}"
-                    )
-
-                if raw_keywords_content:
-                    meta_keywords = [
-                        k.strip()
-                        for k in raw_keywords_content.split(",")
-                        if k.strip()
-                    ]
-                else:
-                    self.logger.debug(f"No meta keywords found for {current_url}")
-            # End reverted section
+            # Extract metadata using helper method
+            page_title, meta_description, meta_keywords = (
+                self._extract_page_metadata(soup, current_url)
+            )

            text_content = self._extract_text(soup)
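The SSRF defence above works by subclassing requests' HTTPAdapter, resolving the hostname at send time, and refusing private, loopback, link-local, or reserved addresses before delegating to the parent adapter. A stripped-down sketch of the same mounting pattern; the class and helper names here are illustrative, not the project's:

import ipaddress
import socket
from urllib.parse import urlparse

import requests
from requests.adapters import HTTPAdapter


def resolves_to_private_ip(hostname: str) -> bool:
    """Return True when the hostname resolves to a private/loopback/reserved IP."""
    ip = ipaddress.ip_address(socket.gethostbyname(hostname))
    return ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved


class BlockPrivateHostsAdapter(HTTPAdapter):
    def send(self, request, **kwargs):
        hostname = urlparse(request.url).hostname
        if hostname and resolves_to_private_ip(hostname):
            # Resolving at send time (rather than once up front) limits DNS-rebinding tricks
            raise requests.exceptions.ConnectionError(f"blocked private host: {hostname}")
        return super().send(request, **kwargs)


if __name__ == "__main__":
    session = requests.Session()
    adapter = BlockPrivateHostsAdapter(pool_connections=10, pool_maxsize=20)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    try:
        session.get("http://127.0.0.1/", timeout=2)
    except requests.exceptions.ConnectionError as exc:
        print(exc)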
ankigen_core/exceptions.py
ADDED
@@ -0,0 +1,104 @@
+"""Custom exceptions for AnkiGen application.
+
+This module provides a hierarchy of custom exceptions to standardize
+error handling across the codebase.
+"""
+
+
+class AnkigenError(Exception):
+    """Base exception for all AnkiGen errors."""
+
+    pass
+
+
+class ValidationError(AnkigenError):
+    """Raised when input validation fails."""
+
+    pass
+
+
+class SecurityError(AnkigenError):
+    """Raised when a security check fails (SSRF, command injection, etc.)."""
+
+    pass
+
+
+class APIError(AnkigenError):
+    """Base exception for API-related errors."""
+
+    pass
+
+
+class OpenAIAPIError(APIError):
+    """Raised when OpenAI API calls fail."""
+
+    pass
+
+
+class Context7APIError(APIError):
+    """Raised when Context7 API calls fail."""
+
+    pass
+
+
+class CrawlerError(AnkigenError):
+    """Base exception for web crawler errors."""
+
+    pass
+
+
+class URLValidationError(CrawlerError):
+    """Raised when URL validation fails."""
+
+    pass
+
+
+class ContentExtractionError(CrawlerError):
+    """Raised when content extraction from web page fails."""
+
+    pass
+
+
+class ExportError(AnkigenError):
+    """Base exception for export-related errors."""
+
+    pass
+
+
+class CardGenerationError(AnkigenError):
+    """Raised when card generation fails."""
+
+    pass
+
+
+class ConfigurationError(AnkigenError):
+    """Raised when configuration is invalid or missing."""
+
+    pass
+
+
+def handle_exception(
+    exc: Exception,
+    logger,
+    message: str,
+    reraise: bool = True,
+    reraise_as: type[Exception] | None = None,
+) -> None:
+    """Standardized exception handler.
+
+    Args:
+        exc: The exception to handle
+        logger: Logger instance to use
+        message: Error message to log
+        reraise: Whether to re-raise the exception
+        reraise_as: Optional exception type to wrap and re-raise as
+
+    Raises:
+        The original exception or wrapped exception if reraise is True
+    """
+    logger.error(f"{message}: {exc}", exc_info=True)
+
+    if reraise:
+        if reraise_as:
+            raise reraise_as(f"{message}: {exc}") from exc
+        raise
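A short usage sketch of the new hierarchy and the handle_exception helper, assuming ankigen_core.exceptions is importable as added above; the validate and crawl_one functions are hypothetical:

import logging

from ankigen_core.exceptions import CrawlerError, URLValidationError, handle_exception

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)


def validate(url: str) -> None:
    if not url.startswith(("http://", "https://")):
        raise ValueError(f"unsupported scheme: {url}")


def crawl_one(url: str) -> None:
    try:
        validate(url)
    except ValueError as exc:
        # Wrap the low-level error in a domain-specific one; the original cause stays chained
        handle_exception(exc, logger, "URL validation failed", reraise_as=URLValidationError)


try:
    crawl_one("ftp://example.com")
except CrawlerError as exc:
    # URLValidationError is a CrawlerError, which is an AnkigenError,
    # so callers can catch at whatever level of the hierarchy suits them.
    print("crawler error:", exc)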
ankigen_core/exporters.py
CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
import pandas as pd
import genanki
import random
+import html
from typing import List, Dict, Any, Optional
import csv
from datetime import datetime
@@ -23,6 +24,57 @@ def _format_field_as_string(value: Any) -> str:
    return str(value).strip()


+def _generate_timestamped_filename(
+    base_name: str, extension: str, include_timestamp: bool = True
+) -> str:
+    """Generate a filename with optional timestamp.
+
+    Args:
+        base_name: The base name for the file (without extension)
+        extension: File extension (e.g., 'csv', 'apkg')
+        include_timestamp: Whether to include timestamp in filename
+
+    Returns:
+        Generated filename with extension
+    """
+    if include_timestamp:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        return f"{base_name}_{timestamp}.{extension}"
+    return f"{base_name}.{extension}"
+
+
+def _ensure_output_directory(filepath: str) -> None:
+    """Ensure the output directory exists for the given filepath.
+
+    Args:
+        filepath: Full path to the file
+
+    Creates the directory if it doesn't exist.
+    """
+    output_dir = os.path.dirname(filepath)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        logger.info(f"Created output directory: {output_dir}")
+
+
+def _validate_non_empty_data(data: Any, data_type: str) -> None:
+    """Validate that data is not empty.
+
+    Args:
+        data: The data to validate (list, DataFrame, etc.)
+        data_type: Description of data type for error messages
+
+    Raises:
+        ValueError: If data is empty or None
+    """
+    if data is None:
+        raise ValueError(f"No {data_type} provided to export.")
+    if isinstance(data, list) and not data:
+        raise ValueError(f"No {data_type} provided to export.")
+    if isinstance(data, pd.DataFrame) and data.empty:
+        raise ValueError(f"No {data_type} available to export.")
+
+
# --- Constants for APKG Generation (Subtask 10) ---
ANKI_BASIC_MODEL_NAME = "AnkiGen Basic"
ANKI_CLOZE_MODEL_NAME = "AnkiGen Cloze"
@@ -587,19 +639,18 @@ def export_cards_to_csv(
        KeyError: If a card dictionary is missing essential keys like 'front' or 'back'.
        ValueError: If the cards list is empty or not provided.
    """
-
-
-        raise ValueError("No cards provided to export.")
+    # Validation using helper
+    _validate_non_empty_data(cards, "cards")

+    # Filename generation using helper
    if not filename:
-
-        # Ensure filename is just the name, not a path if not intended
-        # For simplicity, this example saves in the current working directory if no path is specified.
-        filename = f"ankigen_cards_{timestamp}.csv"
+        filename = _generate_timestamped_filename("ankigen_cards", "csv")
        logger.info(f"No filename provided, generated: {filename}")

+    # Ensure output directory exists using helper
+    _ensure_output_directory(filename)
+
    # Define the fieldnames expected in the CSV.
-    # 'front' and 'back' are mandatory.
    fieldnames = ["front", "back", "tags", "note_type"]

    try:
@@ -611,7 +662,7 @@ def export_cards_to_csv(
            writer.writeheader()
            for i, card in enumerate(cards):
                try:
-                    # Ensure mandatory fields exist
+                    # Ensure mandatory fields exist
                    if "front" not in card or "back" not in card:
                        raise KeyError(
                            f"Card at index {i} is missing 'front' or 'back' key."
@@ -628,16 +679,13 @@ def export_cards_to_csv(
                    logger.error(
                        f"Skipping card due to KeyError: {e_inner}. Card data: {card}"
                    )
-                    # Optionally re-raise if one bad card should stop the whole export,
-                    # or continue to export valid cards.
-                    # For this implementation, we log and continue.
                    continue
        logger.info(f"Successfully exported cards to {filename}")
        return filename
    except IOError as e_io:
        logger.error(f"IOError during CSV export to {filename}: {e_io}", exc_info=True)
-        raise
-    except Exception as e_general:
+        raise
+    except Exception as e_general:
        logger.error(
            f"Unexpected error during CSV export to {filename}: {e_general}",
            exc_info=True,
@@ -664,16 +712,18 @@ def export_cards_to_apkg(
        The path to the exported file.
    """
    logger.info(f"Starting APKG export for {len(cards)} cards to deck '{deck_name}'.")
+
+    # Validation using helper - note this now raises ValueError instead of gr.Error
+    _validate_non_empty_data(cards, "cards")
+
+    # Filename generation using helper
    if not filename:
-
-        filename = f"ankigen_deck_{timestamp}.apkg"
+        filename = _generate_timestamped_filename("ankigen_deck", "apkg")
    elif not filename.lower().endswith(".apkg"):
        filename += ".apkg"

-
-
-        os.makedirs(output_dir)
-        logger.info(f"Created output directory for APKG: {output_dir}")
+    # Ensure output directory exists using helper
+    _ensure_output_directory(filename)

    anki_basic_model = BASIC_MODEL
    anki_cloze_model = CLOZE_MODEL
@@ -687,20 +737,17 @@ def export_cards_to_apkg(
        tags_for_note_object = card_dict.get("tags_for_note_object", [])

        # Extract all potential fields, defaulting to empty strings
-
-
-
-
-
-
-
-
-
-
-        # The 'Question' field from card_dict is used as the main text for both basic and cloze.
-        # For cloze, this 'Question' field should contain the cloze-formatted text (e.g., "The capital of {{c1::France}} is Paris.")
+        # Security: Sanitize HTML to prevent XSS when viewing cards in Anki
+        question = html.escape(card_dict.get("Question", ""))
+        answer = html.escape(card_dict.get("Answer", ""))
+        explanation = html.escape(card_dict.get("Explanation", ""))
+        example = html.escape(card_dict.get("Example", ""))
+        prerequisites = html.escape(card_dict.get("Prerequisites", ""))
+        learning_outcomes = html.escape(card_dict.get("Learning_Outcomes", ""))
+        difficulty = html.escape(card_dict.get("Difficulty", ""))
+        source_url = html.escape(card_dict.get("SourceURL", ""))
+        tags_str_field = html.escape(card_dict.get("TagsStr", ""))
+
        if not question:
            logger.error(
                f"SKIPPING CARD DUE TO EMPTY 'Question' (front/text) field. Card data: {card_dict}"
@@ -709,11 +756,10 @@ def export_cards_to_apkg(

        try:
            if note_type.lower() == "cloze":
-                # CLOZE_MODEL fields
-                # Learning_Outcomes, Difficulty, SourceURL, TagsStr
+                # CLOZE_MODEL fields
                note_fields = [
-                    question,  # Text
-                    answer,  # Back Extra
+                    question,  # Text
+                    answer,  # Back Extra
                    explanation,
                    example,
                    prerequisites,
@@ -728,8 +774,7 @@ def export_cards_to_apkg(
                    tags=tags_for_note_object,
                )
            else:  # Basic
-                # BASIC_MODEL fields
-                # Learning_Outcomes, Difficulty, SourceURL, TagsStr
+                # BASIC_MODEL fields
                note_fields = [
                    question,
                    answer,
@@ -755,24 +800,17 @@ def export_cards_to_apkg(
        )
        logger.warning(f"Skipping card due to error: Question='{question[:50]}...'")

-    if notes_added_count == 0
-        logger.error(
+    if notes_added_count == 0:
+        logger.error(
            "No valid notes could be created from the provided cards. APKG generation aborted."
        )
-        # This error should be caught by the calling function in app.py to inform the user
        raise gr.Error("Failed to create any valid Anki notes from the input.")
-    elif not cards:  # No cards provided initially
-        logger.info("No cards provided to export to APKG. APKG generation skipped.")
-        # Depending on desired behavior, could raise or return a specific status/filename
-        # For now, let's assume an empty/default filename or None indicates no action if no cards
-        # However, the function is typed to return str, so raising is more consistent if no file is made.
-        raise gr.Error("No cards were provided to generate an APKG file.")
-    else:  # notes_added_count > 0
-        logger.info(
-            f"Added {notes_added_count} notes to deck '{deck_name}'. Proceeding to package."
-        )

-
+    logger.info(
+        f"Added {notes_added_count} notes to deck '{deck_name}'. Proceeding to package."
+    )
+
+    # Package and write
    package = genanki.Package(anki_deck)
    try:
        package.write_to_file(filename)
@@ -846,18 +884,18 @@ def export_dataframe_to_csv(
    logger.info(
        f"Attempting to export DataFrame to CSV. Suggested filename: {filename_suggestion}"
    )
-
+
+    # Validation using helper
+    try:
+        _validate_non_empty_data(data, "card data")
+    except ValueError:
        logger.warning(
            "No data provided to export_dataframe_to_csv. Skipping CSV export."
        )
-        raise gr.Error(
-            "No card data available"
-        )  # Notify user via Gradio with Error instead of Info
-        # return None  # This line is now unreachable due to the raise
+        raise gr.Error("No card data available")

    try:
-        #
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        # Generate filename from suggestion
        base_name_from_suggestion = "ankigen_cards"  # Default base part

        # Sanitize and use the suggestion (e.g., subject name) if provided
@@ -867,28 +905,23 @@ def export_dataframe_to_csv(
        safe_suggestion = (
            processed_suggestion.replace(" ", "_")
            .replace("/", "-")
-            .replace("
+            .replace("\\", "-")
        )
-        if
-            safe_suggestion
-        ):  # If suggestion wasn't just '.csv' or empty after processing
+        if safe_suggestion:
            base_name_from_suggestion = f"ankigen_{safe_suggestion[:50]}"
-        # If suggestion was empty or only '.csv', default base_name_from_suggestion remains 'ankigen_cards'

-
+        # Generate timestamped filename using helper
+        final_filename = _generate_timestamped_filename(
+            base_name_from_suggestion, "csv"
+        )

-        # Ensure output directory exists
-
-        if output_dir and not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-            logger.info(f"Created output directory for CSV: {output_dir}")
+        # Ensure output directory exists using helper
+        _ensure_output_directory(final_filename)

-        data.to_csv(final_filename, index=False)
+        data.to_csv(final_filename, index=False)
        logger.info(f"Successfully exported DataFrame to CSV: {final_filename}")
-        gr.Info(
-
-        )  # User-friendly message
-        return final_filename  # MODIFIED: Return final_filename
+        gr.Info(f"CSV ready for download: {os.path.basename(final_filename)}")
+        return final_filename
    except Exception as e:
        logger.error(f"Error exporting DataFrame to CSV: {e}", exc_info=True)
        gr.Error(f"Error exporting DataFrame to CSV: {e}")
@@ -902,9 +935,8 @@ def export_dataframe_to_apkg(
    deck_name: str,
) -> str:
    """Exports a DataFrame of cards to an Anki .apkg file."""
-
-
-        raise ValueError("No cards in DataFrame to export.")
+    # Validation using helper
+    _validate_non_empty_data(df, "cards in DataFrame")

    logger.info(
        f"Starting APKG export for DataFrame with {len(df)} rows to deck '{deck_name}'. Output: {output_path}"
@@ -918,25 +950,17 @@ def export_dataframe_to_apkg(
        )
        topic = _format_field_as_string(row.get("Topic", ""))
        difficulty_raw = _format_field_as_string(row.get("Difficulty", ""))
-        difficulty_plain_for_tag = strip_html_tags(
-            difficulty_raw
-        )  # Strip HTML for the tag
+        difficulty_plain_for_tag = strip_html_tags(difficulty_raw)

-        tags_list_for_note_obj = []
+        tags_list_for_note_obj = []
        if topic:
            tags_list_for_note_obj.append(topic.replace(" ", "_").replace(",", "_"))
-        if difficulty_plain_for_tag:
-            # Further sanitize for Anki tags: replace spaces with underscores, remove other invalid chars if any.
-            # Anki tags also often don't like colons or other special chars except underscore/hyphen.
-            # For now, just replacing space, as that's the error seen.
+        if difficulty_plain_for_tag:
            safe_difficulty_tag = difficulty_plain_for_tag.replace(" ", "_")
            tags_list_for_note_obj.append(safe_difficulty_tag)

-        tags_str_for_field = " ".join(
-            tags_list_for_note_obj
-        )  # For the 'TagsStr' model field
+        tags_str_for_field = " ".join(tags_list_for_note_obj)

-        # Prepare a dictionary that contains all possible fields our models might need.
        card_data_for_note = {
            "note_type": note_type_val,
            "tags_for_note_object": tags_list_for_note_obj,
@@ -949,7 +973,7 @@ def export_dataframe_to_apkg(
            "Learning_Outcomes": _format_field_as_string(
                row.get("Learning_Outcomes", "")
            ),
-            "Difficulty": difficulty_raw,
+            "Difficulty": difficulty_raw,
            "SourceURL": _format_field_as_string(row.get("Source_URL", "")),
        }
        cards_for_apkg.append(card_data_for_note)
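The exporter refactor funnels filename and validation logic through small helpers and escapes card fields with html.escape before they reach genanki. A standalone sketch of those pieces; the function names mirror the helpers above, but this is an illustration, not the module itself:

import html
import os
from datetime import datetime


def timestamped_filename(base_name: str, extension: str, include_timestamp: bool = True) -> str:
    # Append a YYYYMMDD_HHMMSS stamp so repeated exports never overwrite each other
    if include_timestamp:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"{base_name}_{timestamp}.{extension}"
    return f"{base_name}.{extension}"


def ensure_output_directory(filepath: str) -> None:
    # Create the parent directory only when the path actually has one
    output_dir = os.path.dirname(filepath)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)


def sanitize_card_fields(card: dict) -> dict:
    # html.escape turns <script>-style payloads into inert text before Anki renders the field
    return {key: html.escape(str(value)) for key, value in card.items()}


if __name__ == "__main__":
    name = os.path.join("exports", timestamped_filename("ankigen_deck", "apkg"))
    ensure_output_directory(name)
    print(name)
    print(sanitize_card_fields({"Question": "<b>What is 2+2?</b>"}))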
ankigen_core/llm_interface.py
CHANGED

@@ -74,6 +74,52 @@ class OpenAIClientManager:

| 74 |         )
| 75 |         return self._client
| 76 |
| 77 |
| 78 | # Retry decorator for API calls - kept similar to original
| 79 | @retry(

| 74 |         )
| 75 |         return self._client
| 76 |
| 77 | +     def __enter__(self):
| 78 | +         """Context manager entry."""
| 79 | +         return self
| 80 | +
| 81 | +     def __exit__(self, exc_type, exc_val, exc_tb):
| 82 | +         """Context manager exit - cleanup resources."""
| 83 | +         self.close()
| 84 | +         return False
| 85 | +
| 86 | +     async def __aenter__(self):
| 87 | +         """Async context manager entry."""
| 88 | +         return self
| 89 | +
| 90 | +     async def __aexit__(self, exc_type, exc_val, exc_tb):
| 91 | +         """Async context manager exit - cleanup resources."""
| 92 | +         await self.aclose()
| 93 | +         return False
| 94 | +
| 95 | +     def close(self) -> None:
| 96 | +         """Close the OpenAI client synchronously."""
| 97 | +         if self._client:
| 98 | +             try:
| 99 | +                 # OpenAI client has a close method for cleanup
| 100 | +                 if hasattr(self._client, "close"):
| 101 | +                     self._client.close()
| 102 | +                 logger.debug("OpenAI client closed")
| 103 | +             except Exception as e:
| 104 | +                 logger.warning(f"Error closing OpenAI client: {e}")
| 105 | +             finally:
| 106 | +                 self._client = None
| 107 | +
| 108 | +     async def aclose(self) -> None:
| 109 | +         """Close the OpenAI client asynchronously."""
| 110 | +         if self._client:
| 111 | +             try:
| 112 | +                 # OpenAI async client has an aclose method
| 113 | +                 if hasattr(self._client, "aclose"):
| 114 | +                     await self._client.aclose()
| 115 | +                 elif hasattr(self._client, "close"):
| 116 | +                     self._client.close()
| 117 | +                 logger.debug("OpenAI client closed (async)")
| 118 | +             except Exception as e:
| 119 | +                 logger.warning(f"Error closing OpenAI client: {e}")
| 120 | +             finally:
| 121 | +                 self._client = None
| 122 | +
| 123 |
| 124 | # Retry decorator for API calls - kept similar to original
| 125 | @retry(

@@ -114,6 +160,7 @@ async def structured_output_completion(

| 114 | ):
| 115 |     effective_system_prompt = f"{system_prompt}\nProvide your response as a JSON object matching the specified schema."
| 116 |
| 117 |     completion = await openai_client.chat.completions.create(
| 118 |         model=model,
| 119 |         messages=[

| 160 | ):
| 161 |     effective_system_prompt = f"{system_prompt}\nProvide your response as a JSON object matching the specified schema."
| 162 |
| 163 | +   # Security: Add timeout to prevent indefinite hanging
| 164 |     completion = await openai_client.chat.completions.create(
| 165 |         model=model,
| 166 |         messages=[

@@ -122,6 +169,7 @@ async def structured_output_completion(

| 122 |         ],
| 123 |         response_format=response_format,  # Pass the dict directly
| 124 |         temperature=0.7,  # Consider making this configurable
| 125 |     )
| 126 |
| 127 |     if not hasattr(completion, "choices") or not completion.choices:

| 169 |         ],
| 170 |         response_format=response_format,  # Pass the dict directly
| 171 |         temperature=0.7,  # Consider making this configurable
| 172 | +       timeout=120.0,  # 120 second timeout
| 173 |     )
| 174 |
| 175 |     if not hasattr(completion, "choices") or not completion.choices:

@@ -252,8 +300,30 @@ async def process_crawled_page(

| 252 |     custom_system_prompt: Optional[str] = None,
| 253 |     custom_user_prompt_template: Optional[str] = None,
| 254 |     max_prompt_content_tokens: int = 6000,
| 255 | ) -> List[Card]:
| 256 | -   """Process a crawled page and extract structured Card objects using OpenAI.
| 257 |     logger.info(
| 258 |         f"Processing page: {page.url} with model {model}, max_prompt_content_tokens: {max_prompt_content_tokens}"
| 259 |     )

| 300 |     custom_system_prompt: Optional[str] = None,
| 301 |     custom_user_prompt_template: Optional[str] = None,
| 302 |     max_prompt_content_tokens: int = 6000,
| 303 | +   cache: Optional[ResponseCache] = None,
| 304 | ) -> List[Card]:
| 305 | +   """Process a crawled page and extract structured Card objects using OpenAI.
| 306 | +
| 307 | +   Args:
| 308 | +       openai_client: The OpenAI client instance
| 309 | +       page: The crawled page to process
| 310 | +       model: The model to use for generation
| 311 | +       custom_system_prompt: Optional custom system prompt
| 312 | +       custom_user_prompt_template: Optional custom user prompt template
| 313 | +       max_prompt_content_tokens: Maximum tokens for content
| 314 | +       cache: Optional ResponseCache for page-level caching
| 315 | +
| 316 | +   Returns:
| 317 | +       List of generated Card objects
| 318 | +   """
| 319 | +   # Check page-level cache first
| 320 | +   if cache:
| 321 | +       cache_key = f"{page.url}:{model}"
| 322 | +       cached_cards = cache.get(cache_key, "page_cache")
| 323 | +       if cached_cards is not None:
| 324 | +           logger.info(f"Using cached cards for page: {page.url}")
| 325 | +           return cached_cards
| 326 | +
| 327 |     logger.info(
| 328 |         f"Processing page: {page.url} with model {model}, max_prompt_content_tokens: {max_prompt_content_tokens}"
| 329 |     )

@@ -362,6 +432,7 @@ Generate a few high-quality Anki cards from this content.

| 362 |         f"Attempting to generate cards for {page.url} using model {model}."
| 363 |     )
| 364 |     response_format_param = {"type": "json_object"}
| 365 |     response_data = await openai_client.chat.completions.create(
| 366 |         model=model,
| 367 |         messages=[

| 432 |         f"Attempting to generate cards for {page.url} using model {model}."
| 433 |     )
| 434 |     response_format_param = {"type": "json_object"}
| 435 | +   # Security: Add timeout to prevent indefinite hanging
| 436 |     response_data = await openai_client.chat.completions.create(
| 437 |         model=model,
| 438 |         messages=[

@@ -370,6 +441,7 @@ Generate a few high-quality Anki cards from this content.

| 370 |         ],
| 371 |         response_format=response_format_param,
| 372 |         temperature=0.5,
| 373 |     )
| 374 |
| 375 |     if (

| 441 |         ],
| 442 |         response_format=response_format_param,
| 443 |         temperature=0.5,
| 444 | +       timeout=120.0,  # 120 second timeout
| 445 |     )
| 446 |
| 447 |     if (

@@ -466,6 +538,12 @@ Generate a few high-quality Anki cards from this content.

| 466 |         logger.info(
| 467 |             f"Successfully generated {len(validated_cards)} Cards from {page.url}."
| 468 |         )
| 469 |         return validated_cards
| 470 |
| 471 |     except json.JSONDecodeError as e:

| 538 |         logger.info(
| 539 |             f"Successfully generated {len(validated_cards)} Cards from {page.url}."
| 540 |         )
| 541 | +       # Cache successful results for page-level caching
| 542 | +       if cache:
| 543 | +           cache_key = f"{page.url}:{model}"
| 544 | +           cache.set(cache_key, "page_cache", validated_cards)
| 545 | +           logger.debug(f"Cached {len(validated_cards)} cards for {page.url}")
| 546 | +
| 547 |         return validated_cards
| 548 |
| 549 |     except json.JSONDecodeError as e:

@@ -509,6 +587,7 @@ async def process_crawled_pages(

| 509 |     custom_system_prompt: Optional[str] = None,
| 510 |     custom_user_prompt_template: Optional[str] = None,
| 511 |     progress_callback: Optional[Callable[[int, int], None]] = None,
| 512 | ) -> List[Card]:
| 513 |     if not pages:
| 514 |         logger.info("No pages provided to process_crawled_pages.")

| 587 |     custom_system_prompt: Optional[str] = None,
| 588 |     custom_user_prompt_template: Optional[str] = None,
| 589 |     progress_callback: Optional[Callable[[int, int], None]] = None,
| 590 | +   cache: Optional[ResponseCache] = None,
| 591 | ) -> List[Card]:
| 592 |     if not pages:
| 593 |         logger.info("No pages provided to process_crawled_pages.")

@@ -536,6 +615,7 @@ async def process_crawled_pages(

| 536 |                 custom_system_prompt=custom_system_prompt,
| 537 |                 custom_user_prompt_template=custom_user_prompt_template,
| 538 |                 max_prompt_content_tokens=max_prompt_content_tokens,
| 539 |             )
| 540 |             if page_cards is None:
| 541 |                 logger.warning(

| 615 |                 custom_system_prompt=custom_system_prompt,
| 616 |                 custom_user_prompt_template=custom_user_prompt_template,
| 617 |                 max_prompt_content_tokens=max_prompt_content_tokens,
| 618 | +               cache=cache,
| 619 |             )
| 620 |             if page_cards is None:
| 621 |                 logger.warning(

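Taken together, the llm_interface.py changes add context-manager cleanup for the client, request timeouts, and an optional page-level cache. A minimal usage sketch follows; it assumes OpenAIClientManager can be constructed without arguments and exposes the configured client through a get_client()-style accessor (that accessor name is hypothetical), and any keyword name not visible in the hunks above is likewise an assumption rather than the confirmed signature.

# Sketch only: accessor and keyword names not shown in the diff are assumptions.
from ankigen_core.llm_interface import OpenAIClientManager, process_crawled_pages
from ankigen_core.utils import ResponseCache


async def cards_for_crawl(pages):
    cache = ResponseCache(maxsize=256)  # reuse across runs so repeat pages hit the "page_cache" entries
    async with OpenAIClientManager() as manager:  # __aexit__ now awaits aclose()
        client = manager.get_client()  # hypothetical accessor for the underlying client
        return await process_crawled_pages(
            openai_client=client,
            pages=pages,
            model="gpt-4o-mini",  # illustrative model name
            cache=cache,  # new optional ResponseCache, threaded down to process_crawled_page
        )
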
ankigen_core/ui_logic.py
CHANGED

@@ -250,18 +250,16 @@ def use_selected_subjects(subjects_df: pd.DataFrame | None):

| 250 |     )
| 251 |
| 252 |
| 253 | - def create_crawler_main_mode_elements() ->
| 254 | -
| 255 | -
| 256 | -
| 257 | -
| 258 | -
| 259 | -
| 260 | -
| 261 | -
| 262 | -
| 263 | -     ]
| 264 | - ):
| 265 |     """Creates the UI components for the Web Crawler mode integrated into the main tab."""
| 266 |     ui_components: List[gr.components.Component] = []
| 267 |

| 250 |     )
| 251 |
| 252 |
| 253 | + def create_crawler_main_mode_elements() -> Tuple[
| 254 | +     List[gr.components.Component],  # ui_components (url_input, max_depth, etc.)
| 255 | +     gr.Button,  # crawl_button
| 256 | +     gr.Progress,  # progress_bar
| 257 | +     gr.Textbox,  # progress_status_textbox
| 258 | +     gr.Textbox,  # custom_system_prompt
| 259 | +     gr.Textbox,  # custom_user_prompt_template
| 260 | +     gr.Checkbox,  # use_sitemap_checkbox
| 261 | +     gr.Textbox,  # sitemap_url_textbox
| 262 | + ]:
| 263 |     """Creates the UI components for the Web Crawler mode integrated into the main tab."""
| 264 |     ui_components: List[gr.components.Component] = []
| 265 |

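With the return type spelled out, a call site can unpack the tuple positionally. A sketch of such a caller (the variable names are illustrative and not taken from the application code):

from ankigen_core.ui_logic import create_crawler_main_mode_elements

# Typically called while building the Gradio Blocks layout.
(
    crawler_components,        # List[gr.components.Component]
    crawl_button,              # gr.Button
    crawl_progress,            # gr.Progress
    crawl_status_box,          # gr.Textbox
    crawler_system_prompt,     # gr.Textbox
    crawler_user_prompt_tmpl,  # gr.Textbox
    use_sitemap_checkbox,      # gr.Checkbox
    sitemap_url_textbox,       # gr.Textbox
) = create_crawler_main_mode_elements()
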
ankigen_core/utils.py
CHANGED

@@ -6,7 +6,6 @@ import sys

| 6 | import hashlib
| 7 | import requests
| 8 | from bs4 import BeautifulSoup
| 9 | - from functools import lru_cache
| 10 | from typing import Any, Optional
| 11 | import time
| 12 |

| 6 | import hashlib
| 7 | import requests
| 8 | from bs4 import BeautifulSoup
| 9 | from typing import Any, Optional
| 10 | import time
| 11 |

@@ -14,7 +13,7 @@ import time

| 14 | _logger_instance = None
| 15 |
| 16 |
| 17 | - def setup_logging():
| 18 |     """Configure logging to both file and console"""
| 19 |     global _logger_instance
| 20 |     if _logger_instance:

| 13 | _logger_instance = None
| 14 |
| 15 |
| 16 | + def setup_logging() -> logging.Logger:
| 17 |     """Configure logging to both file and console"""
| 18 |     global _logger_instance
| 19 |     if _logger_instance:

@@ -49,7 +48,7 @@ def setup_logging():

| 49 |     return logger
| 50 |
| 51 |
| 52 | - def get_logger():
| 53 |     """Returns the initialized logger instance."""
| 54 |     if _logger_instance is None:
| 55 |         return setup_logging()

| 48 |     return logger
| 49 |
| 50 |
| 51 | + def get_logger() -> logging.Logger:
| 52 |     """Returns the initialized logger instance."""
| 53 |     if _logger_instance is None:
| 54 |         return setup_logging()

@@ -62,39 +61,65 @@ logger = get_logger()

| 62 |
| 63 | # --- Caching ---
| 64 | class ResponseCache:
| 65 | -     """
| 66 |
| 67 | -     def __init__(self, maxsize=128):
| 68 | -
| 69 | -         self.
| 70 | -         self.
| 71 | -         self.
| 72 | -
| 73 | -     def _get_from_dict_actual(self, cache_key: str):
| 74 | -         """Actual dictionary lookup, intended to be wrapped by lru_cache."""
| 75 | -         logger.debug(f"Cache DICT GET: key={cache_key}")
| 76 | -         return self._dict_cache.get(cache_key)
| 77 |
| 78 |     def get(self, prompt: str, model: str) -> Optional[Any]:
| 79 | -         """
| 80 |         cache_key = self._create_key(prompt, model)
| 81 | -
| 82 | -
| 83 |
| 84 |     def set(self, prompt: str, model: str, response: Any):
| 85 | -         """
| 86 |         cache_key = self._create_key(prompt, model)
| 87 | -
| 88 | -
| 89 | -
| 90 | -
| 91 | -         #
| 92 | -
| 93 | -
| 94 | -
| 95 |
| 96 |     def _create_key(self, prompt: str, model: str) -> str:
| 97 | -         """
| 98 |         return hashlib.md5(f"{model}:{prompt}".encode("utf-8")).hexdigest()
| 99 |
| 100 |

| 61 |
| 62 | # --- Caching ---
| 63 | class ResponseCache:
| 64 | +     """Simple and efficient LRU cache for API responses with proper eviction."""
| 65 |
| 66 | +     def __init__(self, maxsize: int = 128):
| 67 | +         self.maxsize = maxsize
| 68 | +         self._cache = {}  # {key: response}
| 69 | +         self._access_order = []  # Track access order for LRU eviction
| 70 | +         self.hits = 0
| 71 | +         self.misses = 0
| 72 |
| 73 |     def get(self, prompt: str, model: str) -> Optional[Any]:
| 74 | +         """Retrieve item from cache, updating LRU order."""
| 75 |         cache_key = self._create_key(prompt, model)
| 76 | +
| 77 | +         if cache_key in self._cache:
| 78 | +             # Move to end (most recently used)
| 79 | +             self._access_order.remove(cache_key)
| 80 | +             self._access_order.append(cache_key)
| 81 | +             self.hits += 1
| 82 | +             logger.debug(
| 83 | +                 f"Cache HIT: {cache_key[:16]}... (hits={self.hits}, misses={self.misses})"
| 84 | +             )
| 85 | +             return self._cache[cache_key]
| 86 | +
| 87 | +         self.misses += 1
| 88 | +         logger.debug(
| 89 | +             f"Cache MISS: {cache_key[:16]}... (hits={self.hits}, misses={self.misses})"
| 90 | +         )
| 91 | +         return None
| 92 |
| 93 |     def set(self, prompt: str, model: str, response: Any):
| 94 | +         """Store item in cache with LRU eviction when full."""
| 95 |         cache_key = self._create_key(prompt, model)
| 96 | +
| 97 | +         # If key exists, update and move to end
| 98 | +         if cache_key in self._cache:
| 99 | +             self._access_order.remove(cache_key)
| 100 | +         # If cache is full, evict least recently used
| 101 | +         elif len(self._cache) >= self.maxsize:
| 102 | +             evicted_key = self._access_order.pop(0)
| 103 | +             del self._cache[evicted_key]
| 104 | +             logger.debug(
| 105 | +                 f"Cache EVICT: {evicted_key[:16]}... (size={len(self._cache)})"
| 106 | +             )
| 107 | +
| 108 | +         self._cache[cache_key] = response
| 109 | +         self._access_order.append(cache_key)
| 110 | +         logger.debug(f"Cache SET: {cache_key[:16]}... (size={len(self._cache)})")
| 111 | +
| 112 | +     def clear(self) -> None:
| 113 | +         """Clear all cache entries and statistics."""
| 114 | +         self._cache.clear()
| 115 | +         self._access_order.clear()
| 116 | +         self.hits = 0
| 117 | +         self.misses = 0
| 118 | +         logger.debug("Cache CLEARED")
| 119 |
| 120 |     def _create_key(self, prompt: str, model: str) -> str:
| 121 | +         """Create cache key from prompt and model (MD5 hash for size efficiency)."""
| 122 | +         # Hash to keep keys manageable size while maintaining uniqueness
| 123 |         return hashlib.md5(f"{model}:{prompt}".encode("utf-8")).hexdigest()
| 124 |
| 125 |

@@ -178,7 +203,7 @@ class RateLimiter:

| 178 |         self.last_request_timestamp: float = 0.0
| 179 |         # Use a lock if this were to be used by multiple threads, but for now assuming single thread access per instance
| 180 |
| 181 | -     def wait(self):
| 182 |         """Blocks until it's safe to make the next request."""
| 183 |         current_time = time.monotonic()  # Use monotonic clock for intervals
| 184 |         time_since_last_request = current_time - self.last_request_timestamp

| 203 |         self.last_request_timestamp: float = 0.0
| 204 |         # Use a lock if this were to be used by multiple threads, but for now assuming single thread access per instance
| 205 |
| 206 | +     def wait(self) -> None:
| 207 |         """Blocks until it's safe to make the next request."""
| 208 |         current_time = time.monotonic()  # Use monotonic clock for intervals
| 209 |         time_since_last_request = current_time - self.last_request_timestamp

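The rewritten ResponseCache is self-contained, so its LRU behaviour can be exercised directly. A small sketch using only the methods shown above (the prompt and model strings are arbitrary key material, not real prompts):

from ankigen_core.utils import ResponseCache

cache = ResponseCache(maxsize=2)
cache.set("prompt-a", "gpt-4o-mini", {"cards": ["A"]})
cache.set("prompt-b", "gpt-4o-mini", {"cards": ["B"]})

cache.get("prompt-a", "gpt-4o-mini")                    # hit: "prompt-a" becomes most recently used
cache.set("prompt-c", "gpt-4o-mini", {"cards": ["C"]})  # cache full: evicts "prompt-b", the least recently used

assert cache.get("prompt-b", "gpt-4o-mini") is None     # miss: entry was evicted
assert cache.get("prompt-a", "gpt-4o-mini") == {"cards": ["A"]}
print(cache.hits, cache.misses)                         # 2 hits and 1 miss so far
cache.clear()                                           # resets entries and the hit/miss counters

This is the same object that process_crawled_page uses for page-level caching, where the two key components are the f"{page.url}:{model}" string and the literal "page_cache".
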
pyproject.toml
CHANGED

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"

| 5 | [project]
| 6 | name = "ankigen"
| 7 | version = "0.2.0"
| 8 | - description = ""
| 9 | authors = [
| 10 |     { name = "Justin", email = "[email protected]" },
| 11 | ]

| 5 | [project]
| 6 | name = "ankigen"
| 7 | version = "0.2.0"
| 8 | + description = "AI-powered Anki flashcard generator using OpenAI GPT models with CLI and web interface"
| 9 | authors = [
| 10 |     { name = "Justin", email = "[email protected]" },
| 11 | ]