brickfrog committed · Commit 93bd7fb · verified · 1 Parent(s): 3acb91e

Upload folder using huggingface_hub

.gitignore CHANGED
@@ -197,4 +197,5 @@ tasks/
 scripts/
 
 .taskmasterconfig
-.cursor
+.cursor
+.serena/
.pre-commit-config.yaml CHANGED
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.7 # Use a recent ruff version
+    rev: v0.13.1 # Updated to match pyproject.toml version
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
ankigen_core/agents/generators.py CHANGED
@@ -108,9 +108,10 @@ class SubjectExpertAgent(BaseAgentWrapper):
                 f"Generating batch {batch_num}: {cards_in_this_batch} cards"
             )
 
-            # Reset agent for each batch to avoid conversation history accumulation
-            self.agent = None
-            await self.initialize()
+            # Initialize agent only once - Runner.run() creates fresh context each time
+            # No conversation history accumulation across batches (significant performance gain)
+            if not self.agent:
+                await self.initialize()
 
             user_input = (
                 f"Generate {cards_in_this_batch} flashcards for the topic: {topic}"
@@ -158,13 +159,13 @@ class SubjectExpertAgent(BaseAgentWrapper):
             batch_num += 1
 
             logger.info(
-                f"Batch {batch_num-1} generated {len(batch_cards)} cards. {cards_remaining} cards remaining."
+                f"Batch {batch_num - 1} generated {len(batch_cards)} cards. {cards_remaining} cards remaining."
             )
 
             # Safety check to prevent infinite loops
             if len(batch_cards) == 0:
                 logger.warning(
-                    f"No cards generated in batch {batch_num-1}, stopping generation"
+                    f"No cards generated in batch {batch_num - 1}, stopping generation"
                 )
                 break
 
@@ -175,7 +176,7 @@ class SubjectExpertAgent(BaseAgentWrapper):
         )
 
         logger.info(
-            f"✅ Generated {len(all_cards)} cards total across {batch_num-1} batches for topic '{topic}'"
+            f"✅ Generated {len(all_cards)} cards total across {batch_num - 1} batches for topic '{topic}'"
         )
         return all_cards
 
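
The batching change above swaps a per-batch agent reset for a lazy-initialization guard, so the expensive agent setup runs once per topic instead of once per batch. A minimal sketch of that guard pattern, using an illustrative AgentWrapper class rather than the project's SubjectExpertAgent:

    import asyncio


    class AgentWrapper:
        """Illustrative stand-in for a wrapper whose agent is costly to build."""

        def __init__(self) -> None:
            self.agent = None

        async def initialize(self) -> None:
            await asyncio.sleep(0.1)  # placeholder for expensive setup
            self.agent = object()

        async def generate_batches(self, num_batches: int) -> None:
            for batch_num in range(1, num_batches + 1):
                # Lazy init: build the agent once and reuse it; each run starts
                # from a fresh context, so no history accumulates across batches.
                if not self.agent:
                    await self.initialize()
                print(f"Generating batch {batch_num} with agent id {id(self.agent)}")


    asyncio.run(AgentWrapper().generate_batches(3))
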
ankigen_core/card_generator.py CHANGED
@@ -312,9 +312,9 @@ def generate_cards_from_crawled_content(
     for i, card_obj in enumerate(all_cards):
         # Extract data, assuming it's already plain text from Card object creation
         topic = (
-            card_obj.metadata.get("topic", f"Crawled Content - Card {i+1}")
+            card_obj.metadata.get("topic", f"Crawled Content - Card {i + 1}")
             if card_obj.metadata
-            else f"Crawled Content - Card {i+1}"
+            else f"Crawled Content - Card {i + 1}"
         )
 
         # Ensure list-based metadata are joined as plain strings for DataFrame
ankigen_core/context7.py CHANGED
@@ -4,19 +4,37 @@ import asyncio
 import subprocess
 import json
 from typing import Optional, Dict, Any
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+)
 from ankigen_core.logging import logger
+from ankigen_core.exceptions import (
+    ValidationError,
+)
+
+MAX_STRING_LENGTH = 200  # Prevent excessively long inputs
+SUBPROCESS_TIMEOUT = 60.0  # 60 second timeout for Context7 calls
 
 
 class Context7Client:
     """Context7 MCP client for fetching library documentation"""
 
     def __init__(self):
-        self.server_process = None
-
+        pass  # No state needed - each call creates fresh subprocess
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=10),
+        retry=retry_if_exception_type((TimeoutError, ConnectionError)),
+        reraise=True,
+    )
     async def call_context7_tool(
         self, tool_name: str, args: Dict[str, Any]
     ) -> Optional[Dict[str, Any]]:
-        """Call a Context7 tool via direct JSONRPC"""
+        """Call a Context7 tool via direct JSONRPC with retry logic"""
        try:
            # Build the JSONRPC request
            request = {
@@ -47,9 +65,35 @@
                },
            }
 
-            # Send both requests
-            input_data = json.dumps(init_request) + "\n" + json.dumps(request) + "\n"
-            stdout, stderr = await process.communicate(input=input_data.encode())
+            # Send both requests with timeout protection
+            # Optimize: Use list join for string concatenation
+            input_data = "\n".join([json.dumps(init_request), json.dumps(request), ""])
+            try:
+                stdout, stderr = await asyncio.wait_for(
+                    process.communicate(input=input_data.encode()),
+                    timeout=SUBPROCESS_TIMEOUT,
+                )
+            except asyncio.TimeoutError:
+                # Proper process cleanup on timeout
+                try:
+                    if process.returncode is None:  # Process still running
+                        process.kill()
+                        # Wait for process to actually terminate
+                        await asyncio.wait_for(process.wait(), timeout=5.0)
+                except Exception as cleanup_error:
+                    logger.error(f"Error during process cleanup: {cleanup_error}")
+                raise TimeoutError(
+                    f"Context7 subprocess timed out after {SUBPROCESS_TIMEOUT}s"
+                )
+            except Exception:
+                # Clean up process on any other error
+                try:
+                    if process.returncode is None:
+                        process.kill()
+                        await asyncio.wait_for(process.wait(), timeout=5.0)
+                except Exception:
+                    pass  # Best effort cleanup
+                raise
 
            # Parse responses
            responses = stdout.decode().strip().split("\n")
@@ -204,6 +248,15 @@
         self, library_id: str, topic: Optional[str] = None, tokens: int = 5000
     ) -> Optional[str]:
         """Get documentation for a library"""
+        # Security: Validate library_id (should start with /)
+        if (
+            not library_id
+            or not library_id.startswith("/")
+            or len(library_id) > MAX_STRING_LENGTH
+        ):
+            logger.error(f"Invalid library ID format (security): '{library_id}'")
+            raise ValidationError("Invalid library ID format")
+
         logger.info(
             f"Fetching docs for: {library_id}" + (f" (topic: {topic})" if topic else "")
         )
@@ -233,7 +286,7 @@
         return await self.get_library_docs(library_id, topic, tokens)
 
 
-async def test_context7():
+async def test_context7() -> None:
     """Test the Context7 integration"""
     client = Context7Client()
 
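
The call_context7_tool changes combine two layers: tenacity retries (up to three attempts with exponential backoff on TimeoutError and ConnectionError) around an asyncio.wait_for timeout that turns a hung subprocess into a TimeoutError the retry predicate can match. A self-contained sketch of that composition; flaky_call and the 5-second timeout are illustrative, and only the decorator arguments mirror the diff:

    import asyncio

    from tenacity import (
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    CALL_TIMEOUT = 5.0  # illustrative; the diff uses SUBPROCESS_TIMEOUT = 60.0


    async def flaky_call(delay: float) -> str:
        await asyncio.sleep(delay)
        return "ok"


    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type((TimeoutError, ConnectionError)),
        reraise=True,
    )
    async def call_with_timeout(delay: float) -> str:
        try:
            # Bound the awaited call; on expiry, re-raise as the built-in
            # TimeoutError so the retry predicate above matches it.
            return await asyncio.wait_for(flaky_call(delay), timeout=CALL_TIMEOUT)
        except asyncio.TimeoutError:
            raise TimeoutError(f"call timed out after {CALL_TIMEOUT}s")


    print(asyncio.run(call_with_timeout(0.1)))
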
ankigen_core/crawler.py CHANGED
@@ -1,13 +1,61 @@
 import requests
+from requests.adapters import HTTPAdapter
 from bs4 import BeautifulSoup, Tag
 from urllib.parse import urljoin, urlparse
 import re
+import ipaddress
+import socket
 from typing import List, Set, Optional, Callable, Tuple
 import xml.etree.ElementTree as ET  # Added for Sitemap parsing
 
 from ankigen_core.models import CrawledPage
 from ankigen_core.utils import RateLimiter, get_logger
 from ankigen_core.logging import logger  # Added
+from ankigen_core.exceptions import (
+    SecurityError,
+)
+
+# Security: Maximum URL length to prevent abuse
+MAX_URL_LENGTH = 2048
+
+
+class SSRFProtectionAdapter(HTTPAdapter):
+    """
+    Custom HTTP adapter that prevents SSRF attacks by validating
+    IP addresses at connection time (prevents DNS rebinding attacks).
+    """
+
+    def send(self, request, **kwargs) -> requests.Response:
+        """Override send to validate IP before making request."""
+        # Parse the URL to get hostname
+        parsed = urlparse(request.url)
+        hostname = parsed.hostname
+
+        if hostname:
+            try:
+                # Resolve hostname to IP at request time (prevents DNS rebinding)
+                ip_str = socket.gethostbyname(hostname)
+                ip = ipaddress.ip_address(ip_str)
+
+                # Block private, loopback, link-local, and reserved addresses
+                if (
+                    ip.is_private
+                    or ip.is_loopback
+                    or ip.is_link_local
+                    or ip.is_reserved
+                ):
+                    msg = f"SSRF protection: Blocked request to private IP {ip_str} for hostname {hostname}"
+                    logger.error(msg)
+                    raise SecurityError(msg)
+            except (socket.gaierror, ValueError) as e:
+                logger.error(
+                    f"SSRF protection: DNS resolution failed for {hostname}: {e}"
+                )
+                raise requests.exceptions.ConnectionError(
+                    f"DNS resolution failed for {hostname}"
+                )
+
+        return super().send(request, **kwargs)
 
 
 class WebCrawler:
@@ -41,20 +89,83 @@ class WebCrawler:
         self.logger = get_logger()
         self.session = requests.Session()
         self.session.headers.update({"User-Agent": self.user_agent})
+
+        # Security: Add SSRF protection adapter to prevent DNS rebinding attacks
+        # Performance: Configure connection pooling (10 connections per host, 20 total)
+        ssrf_adapter = SSRFProtectionAdapter(pool_connections=10, pool_maxsize=20)
+        self.session.mount("http://", ssrf_adapter)
+        self.session.mount("https://", ssrf_adapter)
+
         self.rate_limiter = RateLimiter(self.requests_per_second)
 
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - cleanup resources."""
+        self.close()
+        return False
+
+    def close(self) -> None:
+        """Close the requests session and cleanup resources."""
+        if hasattr(self, "session") and self.session:
+            self.session.close()
+            self.logger.debug("WebCrawler session closed")
+
     def _is_valid_url(self, url: str) -> bool:
         """
-        Checks if the URL is valid for crawling (same domain, scheme, matches patterns).
+        Checks if the URL is valid for crawling with SSRF protection.
+        Validates scheme, domain, patterns, and blocks private IP ranges.
         """
         try:
+            # Security: URL length check
+            if len(url) > MAX_URL_LENGTH:
+                logger.warning(
+                    f"URL exceeds maximum length ({MAX_URL_LENGTH}): {url[:100]}..."
+                )
+                return False
+
             parsed_url = urlparse(url)
+
+            # Security: Protocol whitelist (http/https only)
             if not parsed_url.scheme or parsed_url.scheme.lower() not in [
                 "http",
                 "https",
             ]:
                 logger.debug(f"Invalid scheme for URL: {url}")
                 return False
+
+            # Security: SSRF protection - block private IP ranges
+            hostname = parsed_url.hostname
+            if not hostname:
+                logger.warning(f"URL missing hostname: {url}")
+                return False
+
+            # Resolve hostname to IP and check if it's private
+            try:
+                # Get IP address for hostname
+                ip_str = socket.gethostbyname(hostname)
+                ip = ipaddress.ip_address(ip_str)
+
+                # Block private, loopback, link-local, and reserved addresses
+                if (
+                    ip.is_private
+                    or ip.is_loopback
+                    or ip.is_link_local
+                    or ip.is_reserved
+                ):
+                    logger.error(
+                        f"SSRF protection: Blocked private/internal IP {ip_str} for hostname {hostname}"
+                    )
+                    return False
+
+            except (socket.gaierror, ValueError, OSError) as e:
+                # DNS resolution failed or invalid IP
+                logger.warning(f"Could not resolve hostname {hostname}: {e}")
+                return False
+
+            # Domain check
             if parsed_url.netloc != self.base_domain:
                 logger.debug(f"URL {url} not in base domain {self.base_domain}")
                 return False
@@ -76,6 +187,10 @@
         except ValueError:  # Handle potential errors from urlparse on malformed URLs
             logger.warning(f"ValueError when parsing URL: {url}", exc_info=True)
             return False
+        except Exception as e:
+            logger.error(f"Unexpected error validating URL {url}: {e}", exc_info=True)
+            return False
+
         return True
 
     def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
@@ -194,40 +309,122 @@
 
     # --- End Sitemap Processing Methods ---
 
-    def crawl(
-        self, progress_callback: Optional[Callable[[int, int, str], None]] = None
-    ) -> List[CrawledPage]:
+    def _initialize_crawl_queue(self) -> List[Tuple[str, int, Optional[str]]]:
+        """Initialize the crawl queue from sitemap or start URL.
+
+        Returns:
+            List of tuples (url, depth, parent_url) to visit
+        """
         urls_to_visit: List[Tuple[str, int, Optional[str]]] = []
-        crawled_pages: List[CrawledPage] = []
-        initial_total_for_progress = 0
 
         if self.use_sitemap and self.sitemap_url:
             self.logger.info(f"Attempting to use sitemap: {self.sitemap_url}")
             sitemap_extracted_urls = self._get_urls_from_sitemap()
             if sitemap_extracted_urls:
                 for url in sitemap_extracted_urls:
-                    if self._is_valid_url(
-                        url
-                    ):  # Checks domain, include/exclude patterns
-                        urls_to_visit.append(
-                            (url, 0, None)
-                        )  # Add with depth 0 and None parent
+                    if self._is_valid_url(url):
+                        urls_to_visit.append((url, 0, None))
                 self.logger.info(
                     f"Initialized {len(urls_to_visit)} URLs to visit from sitemap after validation."
                 )
-                initial_total_for_progress = len(urls_to_visit)
             else:
                 self.logger.warning(
-                    "Sitemap processing yielded no URLs, or sitemap_url not set. Falling back to start_url if provided."
+                    "Sitemap processing yielded no URLs. Falling back to start_url."
                 )
-                # Fallback to start_url if sitemap is empty or fails
                 if self._is_valid_url(self.start_url):
-                    urls_to_visit.append((self.start_url, 0, None))  # None parent
-                    initial_total_for_progress = len(urls_to_visit)
+                    urls_to_visit.append((self.start_url, 0, None))
         else:
             if self._is_valid_url(self.start_url):
-                urls_to_visit.append((self.start_url, 0, None))  # None parent
-                initial_total_for_progress = len(urls_to_visit)
+                urls_to_visit.append((self.start_url, 0, None))
+
+        return urls_to_visit
+
+    def _extract_page_metadata(
+        self, soup: BeautifulSoup, url: str
+    ) -> Tuple[Optional[str], Optional[str], List[str]]:
+        """Extract title, meta description, and meta keywords from page.
+
+        Args:
+            soup: BeautifulSoup object of the page
+            url: URL being processed (for logging)
+
+        Returns:
+            Tuple of (title, meta_description, meta_keywords_list)
+        """
+        # Extract title
+        page_title_tag = soup.find("title")
+        page_title: Optional[str] = None
+        if isinstance(page_title_tag, Tag) and page_title_tag.string:
+            page_title = page_title_tag.string.strip()
+        else:
+            self.logger.debug(f"No title tag found for {url}")
+
+        # Extract meta description
+        meta_desc_tag = soup.find("meta", attrs={"name": "description"})
+        meta_description: Optional[str] = None
+        if isinstance(meta_desc_tag, Tag):
+            content = meta_desc_tag.get("content")
+            if isinstance(content, str):
+                meta_description = content.strip()
+            elif isinstance(content, list):
+                meta_description = " ".join(str(item) for item in content).strip()
+                self.logger.debug(
+                    f"Meta description for {url} was a list, joined: {meta_description}"
+                )
+        else:
+            self.logger.debug(f"No meta description found for {url}")
+
+        # Extract meta keywords
+        meta_keywords_tag = soup.find("meta", attrs={"name": "keywords"})
+        meta_keywords: List[str] = []
+        if isinstance(meta_keywords_tag, Tag):
+            content_kw = meta_keywords_tag.get("content")
+            raw_keywords_content: str = ""
+            if isinstance(content_kw, str):
+                raw_keywords_content = content_kw
+            elif isinstance(content_kw, list):
+                raw_keywords_content = " ".join(str(item) for item in content_kw)
+                self.logger.debug(
+                    f"Meta keywords for {url} was a list, joined: {raw_keywords_content}"
+                )
+
+            if raw_keywords_content:
+                meta_keywords = [
+                    k.strip() for k in raw_keywords_content.split(",") if k.strip()
+                ]
+        else:
+            self.logger.debug(f"No meta keywords found for {url}")
+
+        return page_title, meta_description, meta_keywords
+
+    def _should_skip_url(self, url: str, depth: int) -> Tuple[bool, Optional[str]]:
+        """Check if URL should be skipped.
+
+        Args:
+            url: URL to check
+            depth: Current depth of URL
+
+        Returns:
+            Tuple of (should_skip, skip_reason)
+        """
+        if url in self.visited_urls:
+            return True, f"Skipped (visited): {url}"
+
+        if depth > self.max_depth:
+            logger.debug(
+                f"Skipping URL {url} due to depth {depth} > max_depth {self.max_depth}"
+            )
+            return True, f"Skipped (max depth): {url}"
+
+        return False, None
+
+    def crawl(
+        self, progress_callback: Optional[Callable[[int, int, str], None]] = None
+    ) -> List[CrawledPage]:
+        # Initialize URLs using helper method
+        urls_to_visit = self._initialize_crawl_queue()
+        crawled_pages: List[CrawledPage] = []
+        initial_total_for_progress = len(urls_to_visit)
 
         processed_count = 0
         while urls_to_visit:
@@ -246,28 +443,16 @@
                 current_url,
             )
 
-            if current_url in self.visited_urls:
-                self.logger.debug(f"URL already visited: {current_url}. Skipping.")
-                if progress_callback:
-                    # When skipping, processed_count doesn't increment, but one item is removed from effective queue for this iteration.
-                    # current_total_for_progress should reflect this for accuracy if it's dynamic.
-                    # If sitemap, it remains initial_total_for_progress.
+            # Check if URL should be skipped using helper method
+            should_skip, skip_reason = self._should_skip_url(current_url, current_depth)
+            if should_skip:
+                if progress_callback and skip_reason:
                     dynamic_total = (
                         initial_total_for_progress
                         if self.use_sitemap
                         else processed_count + len(urls_to_visit) + 1
                     )
-                    progress_callback(
-                        processed_count,
-                        dynamic_total,
-                        f"Skipped (visited): {current_url}",
-                    )
-                continue
-
-            if current_depth > self.max_depth:
-                logger.debug(
-                    f"Skipping URL {current_url} due to depth {current_depth} > max_depth {self.max_depth}"
-                )
+                    progress_callback(processed_count, dynamic_total, skip_reason)
                 continue
 
             self.logger.info(
@@ -289,52 +474,10 @@
             html_content = response.text
             soup = BeautifulSoup(html_content, "html.parser")
 
-            # Revert to original BeautifulSoup parsing logic for title, meta_description, meta_keywords
-            page_title_tag = soup.find("title")
-            page_title: Optional[str] = None
-            if isinstance(page_title_tag, Tag) and page_title_tag.string:
-                page_title = page_title_tag.string.strip()
-            else:
-                self.logger.debug(f"No title tag found for {current_url}")
-
-            meta_desc_tag = soup.find("meta", attrs={"name": "description"})
-            meta_description: Optional[str] = None
-            if isinstance(meta_desc_tag, Tag):
-                content = meta_desc_tag.get("content")
-                if isinstance(content, str):
-                    meta_description = content.strip()
-                elif isinstance(content, list):
-                    meta_description = " ".join(
-                        str(item) for item in content
-                    ).strip()
-                    self.logger.debug(
-                        f"Meta description for {current_url} was a list, joined: {meta_description}"
-                    )
-            else:
-                self.logger.debug(f"No meta description found for {current_url}")
-
-            meta_keywords_tag = soup.find("meta", attrs={"name": "keywords"})
-            meta_keywords: List[str] = []
-            if isinstance(meta_keywords_tag, Tag):
-                content = meta_keywords_tag.get("content")
-                raw_keywords_content: str = ""
-                if isinstance(content, str):
-                    raw_keywords_content = content
-                elif isinstance(content, list):
-                    raw_keywords_content = " ".join(str(item) for item in content)
-                    self.logger.debug(
-                        f"Meta keywords for {current_url} was a list, joined: {raw_keywords_content}"
-                    )
-
-                if raw_keywords_content:
-                    meta_keywords = [
-                        k.strip()
-                        for k in raw_keywords_content.split(",")
-                        if k.strip()
-                    ]
-                else:
-                    self.logger.debug(f"No meta keywords found for {current_url}")
-            # End reverted section
+            # Extract metadata using helper method
+            page_title, meta_description, meta_keywords = (
+                self._extract_page_metadata(soup, current_url)
+            )
 
             text_content = self._extract_text(soup)
 
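
Both the SSRFProtectionAdapter.send() override and the new checks in _is_valid_url reduce to the same core test: resolve the hostname, then reject private, loopback, link-local, and reserved addresses. A standalone sketch of that check (the helper name is_public_host is illustrative, not part of the codebase); the adapter repeats the check at request time so a changed DNS answer cannot slip past the earlier validation:

    import ipaddress
    import socket


    def is_public_host(hostname: str) -> bool:
        """Return True only if the hostname resolves to a public, routable IP."""
        try:
            ip = ipaddress.ip_address(socket.gethostbyname(hostname))
        except (socket.gaierror, ValueError, OSError):
            return False  # unresolvable or malformed hosts are treated as unsafe
        return not (
            ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved
        )


    print(is_public_host("localhost"))  # False - loopback
    print(is_public_host("10.0.0.1"))   # False - private range
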
ankigen_core/exceptions.py ADDED
@@ -0,0 +1,104 @@
+"""Custom exceptions for AnkiGen application.
+
+This module provides a hierarchy of custom exceptions to standardize
+error handling across the codebase.
+"""
+
+
+class AnkigenError(Exception):
+    """Base exception for all AnkiGen errors."""
+
+    pass
+
+
+class ValidationError(AnkigenError):
+    """Raised when input validation fails."""
+
+    pass
+
+
+class SecurityError(AnkigenError):
+    """Raised when a security check fails (SSRF, command injection, etc.)."""
+
+    pass
+
+
+class APIError(AnkigenError):
+    """Base exception for API-related errors."""
+
+    pass
+
+
+class OpenAIAPIError(APIError):
+    """Raised when OpenAI API calls fail."""
+
+    pass
+
+
+class Context7APIError(APIError):
+    """Raised when Context7 API calls fail."""
+
+    pass
+
+
+class CrawlerError(AnkigenError):
+    """Base exception for web crawler errors."""
+
+    pass
+
+
+class URLValidationError(CrawlerError):
+    """Raised when URL validation fails."""
+
+    pass
+
+
+class ContentExtractionError(CrawlerError):
+    """Raised when content extraction from web page fails."""
+
+    pass
+
+
+class ExportError(AnkigenError):
+    """Base exception for export-related errors."""
+
+    pass
+
+
+class CardGenerationError(AnkigenError):
+    """Raised when card generation fails."""
+
+    pass
+
+
+class ConfigurationError(AnkigenError):
+    """Raised when configuration is invalid or missing."""
+
+    pass
+
+
+def handle_exception(
+    exc: Exception,
+    logger,
+    message: str,
+    reraise: bool = True,
+    reraise_as: type[Exception] | None = None,
+) -> None:
+    """Standardized exception handler.
+
+    Args:
+        exc: The exception to handle
+        logger: Logger instance to use
+        message: Error message to log
+        reraise: Whether to re-raise the exception
+        reraise_as: Optional exception type to wrap and re-raise as
+
+    Raises:
+        The original exception or wrapped exception if reraise is True
+    """
+    logger.error(f"{message}: {exc}", exc_info=True)
+
+    if reraise:
+        if reraise_as:
+            raise reraise_as(f"{message}: {exc}") from exc
+        raise
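
A short usage sketch for the new handle_exception helper; parse_config and the logger setup are illustrative only:

    import logging

    from ankigen_core.exceptions import ConfigurationError, handle_exception

    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger("ankigen.example")


    def parse_config(raw: str) -> dict:
        try:
            key, value = raw.split("=", 1)
            return {key: value}
        except ValueError as exc:
            # Logs once with traceback, then re-raises wrapped in a domain error.
            handle_exception(
                exc,
                logger,
                "Could not parse config entry",
                reraise_as=ConfigurationError,
            )
            raise  # not reached; handle_exception re-raises above


    try:
        parse_config("missing-equals-sign")
    except ConfigurationError as err:
        print(f"Caught: {err}")
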
ankigen_core/exporters.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
 import pandas as pd
 import genanki
 import random
+import html
 from typing import List, Dict, Any, Optional
 import csv
 from datetime import datetime
@@ -23,6 +24,57 @@ def _format_field_as_string(value: Any) -> str:
     return str(value).strip()
 
 
+def _generate_timestamped_filename(
+    base_name: str, extension: str, include_timestamp: bool = True
+) -> str:
+    """Generate a filename with optional timestamp.
+
+    Args:
+        base_name: The base name for the file (without extension)
+        extension: File extension (e.g., 'csv', 'apkg')
+        include_timestamp: Whether to include timestamp in filename
+
+    Returns:
+        Generated filename with extension
+    """
+    if include_timestamp:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        return f"{base_name}_{timestamp}.{extension}"
+    return f"{base_name}.{extension}"
+
+
+def _ensure_output_directory(filepath: str) -> None:
+    """Ensure the output directory exists for the given filepath.
+
+    Args:
+        filepath: Full path to the file
+
+    Creates the directory if it doesn't exist.
+    """
+    output_dir = os.path.dirname(filepath)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        logger.info(f"Created output directory: {output_dir}")
+
+
+def _validate_non_empty_data(data: Any, data_type: str) -> None:
+    """Validate that data is not empty.
+
+    Args:
+        data: The data to validate (list, DataFrame, etc.)
+        data_type: Description of data type for error messages
+
+    Raises:
+        ValueError: If data is empty or None
+    """
+    if data is None:
+        raise ValueError(f"No {data_type} provided to export.")
+    if isinstance(data, list) and not data:
+        raise ValueError(f"No {data_type} provided to export.")
+    if isinstance(data, pd.DataFrame) and data.empty:
+        raise ValueError(f"No {data_type} available to export.")
+
+
 # --- Constants for APKG Generation (Subtask 10) ---
 ANKI_BASIC_MODEL_NAME = "AnkiGen Basic"
 ANKI_CLOZE_MODEL_NAME = "AnkiGen Cloze"
@@ -587,19 +639,18 @@ def export_cards_to_csv(
         KeyError: If a card dictionary is missing essential keys like 'front' or 'back'.
         ValueError: If the cards list is empty or not provided.
     """
-    if not cards:
-        logger.warning("export_cards_to_csv called with an empty list of cards.")
-        raise ValueError("No cards provided to export.")
+    # Validation using helper
+    _validate_non_empty_data(cards, "cards")
 
+    # Filename generation using helper
     if not filename:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        # Ensure filename is just the name, not a path if not intended
-        # For simplicity, this example saves in the current working directory if no path is specified.
-        filename = f"ankigen_cards_{timestamp}.csv"
+        filename = _generate_timestamped_filename("ankigen_cards", "csv")
         logger.info(f"No filename provided, generated: {filename}")
 
+    # Ensure output directory exists using helper
+    _ensure_output_directory(filename)
+
     # Define the fieldnames expected in the CSV.
-    # 'front' and 'back' are mandatory.
     fieldnames = ["front", "back", "tags", "note_type"]
 
     try:
@@ -611,7 +662,7 @@
             writer.writeheader()
             for i, card in enumerate(cards):
                 try:
-                    # Ensure mandatory fields exist, others are optional via card.get in row_to_write
+                    # Ensure mandatory fields exist
                     if "front" not in card or "back" not in card:
                         raise KeyError(
                             f"Card at index {i} is missing 'front' or 'back' key."
@@ -628,16 +679,13 @@
                     logger.error(
                         f"Skipping card due to KeyError: {e_inner}. Card data: {card}"
                     )
-                    # Optionally re-raise if one bad card should stop the whole export,
-                    # or continue to export valid cards.
-                    # For this implementation, we log and continue.
                     continue
             logger.info(f"Successfully exported cards to {filename}")
             return filename
     except IOError as e_io:
         logger.error(f"IOError during CSV export to {filename}: {e_io}", exc_info=True)
-        raise  # Re-raise the IOError
-    except Exception as e_general:  # Catch any other unexpected errors
+        raise
+    except Exception as e_general:
         logger.error(
             f"Unexpected error during CSV export to {filename}: {e_general}",
             exc_info=True,
@@ -664,16 +712,18 @@
         The path to the exported file.
     """
     logger.info(f"Starting APKG export for {len(cards)} cards to deck '{deck_name}'.")
+
+    # Validation using helper - note this now raises ValueError instead of gr.Error
+    _validate_non_empty_data(cards, "cards")
+
+    # Filename generation using helper
     if not filename:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"ankigen_deck_{timestamp}.apkg"
+        filename = _generate_timestamped_filename("ankigen_deck", "apkg")
     elif not filename.lower().endswith(".apkg"):
         filename += ".apkg"
 
-    output_dir = os.path.dirname(filename)
-    if output_dir and not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-        logger.info(f"Created output directory for APKG: {output_dir}")
+    # Ensure output directory exists using helper
+    _ensure_output_directory(filename)
 
     anki_basic_model = BASIC_MODEL
     anki_cloze_model = CLOZE_MODEL
@@ -687,20 +737,17 @@
         tags_for_note_object = card_dict.get("tags_for_note_object", [])
 
         # Extract all potential fields, defaulting to empty strings
-        question = card_dict.get("Question", "")
-        answer = card_dict.get("Answer", "")
-        explanation = card_dict.get("Explanation", "")
-        example = card_dict.get("Example", "")
-        prerequisites = card_dict.get("Prerequisites", "")
-        learning_outcomes = card_dict.get("Learning_Outcomes", "")
-        difficulty = card_dict.get("Difficulty", "")
-        source_url = card_dict.get("SourceURL", "")
-        tags_str_field = card_dict.get(
-            "TagsStr", ""
-        )  # This is the string for the model's TagsStr field
-
-        # The 'Question' field from card_dict is used as the main text for both basic and cloze.
-        # For cloze, this 'Question' field should contain the cloze-formatted text (e.g., "The capital of {{c1::France}} is Paris.")
+        # Security: Sanitize HTML to prevent XSS when viewing cards in Anki
+        question = html.escape(card_dict.get("Question", ""))
+        answer = html.escape(card_dict.get("Answer", ""))
+        explanation = html.escape(card_dict.get("Explanation", ""))
+        example = html.escape(card_dict.get("Example", ""))
+        prerequisites = html.escape(card_dict.get("Prerequisites", ""))
+        learning_outcomes = html.escape(card_dict.get("Learning_Outcomes", ""))
+        difficulty = html.escape(card_dict.get("Difficulty", ""))
+        source_url = html.escape(card_dict.get("SourceURL", ""))
+        tags_str_field = html.escape(card_dict.get("TagsStr", ""))
+
         if not question:
             logger.error(
                 f"SKIPPING CARD DUE TO EMPTY 'Question' (front/text) field. Card data: {card_dict}"
@@ -709,11 +756,10 @@
 
         try:
             if note_type.lower() == "cloze":
-                # CLOZE_MODEL fields: Text, Back Extra, Explanation, Example, Prerequisites,
-                # Learning_Outcomes, Difficulty, SourceURL, TagsStr
+                # CLOZE_MODEL fields
                 note_fields = [
-                    question,  # Text (this is the card_dict['Question'] which should be cloze-formatted)
-                    answer,  # Back Extra (this is card_dict['Answer'])
+                    question,  # Text
+                    answer,  # Back Extra
                     explanation,
                     example,
                     prerequisites,
@@ -728,8 +774,7 @@
                     tags=tags_for_note_object,
                 )
             else:  # Basic
-                # BASIC_MODEL fields: Question, Answer, Explanation, Example, Prerequisites,
-                # Learning_Outcomes, Difficulty, SourceURL, TagsStr
+                # BASIC_MODEL fields
                 note_fields = [
                     question,
                     answer,
@@ -755,24 +800,17 @@
             )
             logger.warning(f"Skipping card due to error: Question='{question[:50]}...'")
 
-    if notes_added_count == 0 and cards:  # Some cards were provided but none were added
-        logger.error(  # Changed to error for more visibility
+    if notes_added_count == 0:
+        logger.error(
            "No valid notes could be created from the provided cards. APKG generation aborted."
         )
-        # This error should be caught by the calling function in app.py to inform the user
         raise gr.Error("Failed to create any valid Anki notes from the input.")
-    elif not cards:  # No cards provided initially
-        logger.info("No cards provided to export to APKG. APKG generation skipped.")
-        # Depending on desired behavior, could raise or return a specific status/filename
-        # For now, let's assume an empty/default filename or None indicates no action if no cards
-        # However, the function is typed to return str, so raising is more consistent if no file is made.
-        raise gr.Error("No cards were provided to generate an APKG file.")
-    else:  # notes_added_count > 0
-        logger.info(
-            f"Added {notes_added_count} notes to deck '{deck_name}'. Proceeding to package."
-        )
 
-    # Only proceed to package and write if notes were successfully added
+    logger.info(
+        f"Added {notes_added_count} notes to deck '{deck_name}'. Proceeding to package."
+    )
+
+    # Package and write
     package = genanki.Package(anki_deck)
     try:
         package.write_to_file(filename)
@@ -846,18 +884,18 @@
     logger.info(
         f"Attempting to export DataFrame to CSV. Suggested filename: {filename_suggestion}"
     )
-    if data is None or data.empty:
+
+    # Validation using helper
+    try:
+        _validate_non_empty_data(data, "card data")
+    except ValueError:
         logger.warning(
             "No data provided to export_dataframe_to_csv. Skipping CSV export."
         )
-        raise gr.Error(
-            "No card data available"
-        )  # Notify user via Gradio with Error instead of Info
-        # return None  # This line is now unreachable due to the raise
+        raise gr.Error("No card data available")
 
     try:
-        # Create a specific filename using both suggestion and timestamp
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        # Generate filename from suggestion
         base_name_from_suggestion = "ankigen_cards"  # Default base part
 
         # Sanitize and use the suggestion (e.g., subject name) if provided
@@ -867,28 +905,23 @@
             safe_suggestion = (
                 processed_suggestion.replace(" ", "_")
                 .replace("/", "-")
-                .replace("\\\\", "-")
+                .replace("\\", "-")
             )
-            if (
-                safe_suggestion
-            ):  # If suggestion wasn't just '.csv' or empty after processing
+            if safe_suggestion:
                 base_name_from_suggestion = f"ankigen_{safe_suggestion[:50]}"
-            # If suggestion was empty or only '.csv', default base_name_from_suggestion remains 'ankigen_cards'
 
-        final_filename = f"{base_name_from_suggestion}_{timestamp}.csv"
+        # Generate timestamped filename using helper
+        final_filename = _generate_timestamped_filename(
+            base_name_from_suggestion, "csv"
+        )
 
-        # Ensure output directory exists if filename contains path
-        output_dir = os.path.dirname(final_filename)
-        if output_dir and not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-            logger.info(f"Created output directory for CSV: {output_dir}")
+        # Ensure output directory exists using helper
+        _ensure_output_directory(final_filename)
 
-        data.to_csv(final_filename, index=False)  # MODIFIED: Write to final_filename
+        data.to_csv(final_filename, index=False)
         logger.info(f"Successfully exported DataFrame to CSV: {final_filename}")
-        gr.Info(
-            f"CSV ready for download: {os.path.basename(final_filename)}"
-        )  # User-friendly message
-        return final_filename  # MODIFIED: Return final_filename
+        gr.Info(f"CSV ready for download: {os.path.basename(final_filename)}")
+        return final_filename
     except Exception as e:
         logger.error(f"Error exporting DataFrame to CSV: {e}", exc_info=True)
         gr.Error(f"Error exporting DataFrame to CSV: {e}")
@@ -902,9 +935,8 @@
     deck_name: str,
 ) -> str:
     """Exports a DataFrame of cards to an Anki .apkg file."""
-    if df.empty:
-        logger.warning("export_dataframe_to_apkg called with an empty DataFrame.")
-        raise ValueError("No cards in DataFrame to export.")
+    # Validation using helper
+    _validate_non_empty_data(df, "cards in DataFrame")
 
     logger.info(
         f"Starting APKG export for DataFrame with {len(df)} rows to deck '{deck_name}'. Output: {output_path}"
@@ -918,25 +950,17 @@
         )
         topic = _format_field_as_string(row.get("Topic", ""))
         difficulty_raw = _format_field_as_string(row.get("Difficulty", ""))
-        difficulty_plain_for_tag = strip_html_tags(
-            difficulty_raw
-        )  # Strip HTML for the tag
+        difficulty_plain_for_tag = strip_html_tags(difficulty_raw)
 
-        tags_list_for_note_obj = []  # For genanki.Note(tags=...)
+        tags_list_for_note_obj = []
         if topic:
             tags_list_for_note_obj.append(topic.replace(" ", "_").replace(",", "_"))
-        if difficulty_plain_for_tag:  # Use the plain text version for the tag
-            # Further sanitize for Anki tags: replace spaces with underscores, remove other invalid chars if any.
-            # Anki tags also often don't like colons or other special chars except underscore/hyphen.
-            # For now, just replacing space, as that's the error seen.
+        if difficulty_plain_for_tag:
            safe_difficulty_tag = difficulty_plain_for_tag.replace(" ", "_")
            tags_list_for_note_obj.append(safe_difficulty_tag)
 
-        tags_str_for_field = " ".join(
-            tags_list_for_note_obj
-        )  # For the 'TagsStr' model field
+        tags_str_for_field = " ".join(tags_list_for_note_obj)
 
-        # Prepare a dictionary that contains all possible fields our models might need.
         card_data_for_note = {
             "note_type": note_type_val,
             "tags_for_note_object": tags_list_for_note_obj,
@@ -949,7 +973,7 @@
             "Learning_Outcomes": _format_field_as_string(
                 row.get("Learning_Outcomes", "")
             ),
-            "Difficulty": difficulty_raw,  # Keep the original HTML for the 'Difficulty' field itself
+            "Difficulty": difficulty_raw,
             "SourceURL": _format_field_as_string(row.get("Source_URL", "")),
         }
         cards_for_apkg.append(card_data_for_note)
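
The three new module-private helpers centralise validation, timestamped filenames, and directory creation that the export functions previously duplicated, and html.escape now covers the card fields written into APKG notes. A hedged sketch of how they fit together (calling the underscore-prefixed helpers directly is for illustration only):

    import html

    from ankigen_core.exporters import (
        _ensure_output_directory,
        _generate_timestamped_filename,
        _validate_non_empty_data,
    )

    cards = [{"front": "What is 2 + 2?", "back": "<b>4</b>"}]

    # Raises ValueError on None or an empty list/DataFrame.
    _validate_non_empty_data(cards, "cards")

    # e.g. "exports/ankigen_cards_20250101_120000.csv"
    filename = "exports/" + _generate_timestamped_filename("ankigen_cards", "csv")
    _ensure_output_directory(filename)  # creates exports/ if it does not exist

    # Mirrors the APKG path: escape HTML so field content renders as literal text.
    print(html.escape(cards[0]["back"]))  # -> &lt;b&gt;4&lt;/b&gt;
    print(filename)
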
ankigen_core/llm_interface.py CHANGED
@@ -74,6 +74,52 @@ class OpenAIClientManager:
         )
         return self._client
 
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - cleanup resources."""
+        self.close()
+        return False
+
+    async def __aenter__(self):
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit - cleanup resources."""
+        await self.aclose()
+        return False
+
+    def close(self) -> None:
+        """Close the OpenAI client synchronously."""
+        if self._client:
+            try:
+                # OpenAI client has a close method for cleanup
+                if hasattr(self._client, "close"):
+                    self._client.close()
+                    logger.debug("OpenAI client closed")
+            except Exception as e:
+                logger.warning(f"Error closing OpenAI client: {e}")
+            finally:
+                self._client = None
+
+    async def aclose(self) -> None:
+        """Close the OpenAI client asynchronously."""
+        if self._client:
+            try:
+                # OpenAI async client has an aclose method
+                if hasattr(self._client, "aclose"):
+                    await self._client.aclose()
+                elif hasattr(self._client, "close"):
+                    self._client.close()
+                logger.debug("OpenAI client closed (async)")
+            except Exception as e:
+                logger.warning(f"Error closing OpenAI client: {e}")
+            finally:
+                self._client = None
+
 
 # Retry decorator for API calls - kept similar to original
 @retry(
@@ -114,6 +160,7 @@ async def structured_output_completion(
 ):
     effective_system_prompt = f"{system_prompt}\nProvide your response as a JSON object matching the specified schema."
 
+    # Security: Add timeout to prevent indefinite hanging
     completion = await openai_client.chat.completions.create(
         model=model,
         messages=[
@@ -122,6 +169,7 @@
         ],
         response_format=response_format,  # Pass the dict directly
         temperature=0.7,  # Consider making this configurable
+        timeout=120.0,  # 120 second timeout
     )
 
     if not hasattr(completion, "choices") or not completion.choices:
@@ -252,8 +300,30 @@ async def process_crawled_page(
     custom_system_prompt: Optional[str] = None,
     custom_user_prompt_template: Optional[str] = None,
     max_prompt_content_tokens: int = 6000,
+    cache: Optional[ResponseCache] = None,
 ) -> List[Card]:
-    """Process a crawled page and extract structured Card objects using OpenAI."""
+    """Process a crawled page and extract structured Card objects using OpenAI.
+
+    Args:
+        openai_client: The OpenAI client instance
+        page: The crawled page to process
+        model: The model to use for generation
+        custom_system_prompt: Optional custom system prompt
+        custom_user_prompt_template: Optional custom user prompt template
+        max_prompt_content_tokens: Maximum tokens for content
+        cache: Optional ResponseCache for page-level caching
+
+    Returns:
+        List of generated Card objects
+    """
+    # Check page-level cache first
+    if cache:
+        cache_key = f"{page.url}:{model}"
+        cached_cards = cache.get(cache_key, "page_cache")
+        if cached_cards is not None:
+            logger.info(f"Using cached cards for page: {page.url}")
+            return cached_cards
+
     logger.info(
         f"Processing page: {page.url} with model {model}, max_prompt_content_tokens: {max_prompt_content_tokens}"
     )
@@ -362,6 +432,7 @@ Generate a few high-quality Anki cards from this content.
         f"Attempting to generate cards for {page.url} using model {model}."
     )
     response_format_param = {"type": "json_object"}
+    # Security: Add timeout to prevent indefinite hanging
     response_data = await openai_client.chat.completions.create(
         model=model,
         messages=[
@@ -370,6 +441,7 @@
         ],
         response_format=response_format_param,
         temperature=0.5,
+        timeout=120.0,  # 120 second timeout
     )
 
     if (
@@ -466,6 +538,12 @@
         logger.info(
             f"Successfully generated {len(validated_cards)} Cards from {page.url}."
         )
+        # Cache successful results for page-level caching
+        if cache:
+            cache_key = f"{page.url}:{model}"
+            cache.set(cache_key, "page_cache", validated_cards)
+            logger.debug(f"Cached {len(validated_cards)} cards for {page.url}")
+
         return validated_cards
 
     except json.JSONDecodeError as e:
@@ -509,6 +587,7 @@ async def process_crawled_pages(
     custom_system_prompt: Optional[str] = None,
     custom_user_prompt_template: Optional[str] = None,
     progress_callback: Optional[Callable[[int, int], None]] = None,
+    cache: Optional[ResponseCache] = None,
 ) -> List[Card]:
     if not pages:
         logger.info("No pages provided to process_crawled_pages.")
@@ -536,6 +615,7 @@
             custom_system_prompt=custom_system_prompt,
             custom_user_prompt_template=custom_user_prompt_template,
             max_prompt_content_tokens=max_prompt_content_tokens,
+            cache=cache,
         )
         if page_cards is None:
             logger.warning(
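
OpenAIClientManager now works as a synchronous or asynchronous context manager, and the page-processing functions take an optional ResponseCache so repeat URLs can skip the model call. A rough usage sketch; it assumes the manager's constructor takes no arguments, which this diff does not show:

    import asyncio

    from ankigen_core.llm_interface import OpenAIClientManager
    from ankigen_core.utils import ResponseCache


    async def main() -> None:
        # A single cache instance can be threaded through
        # process_crawled_pages(..., cache=cache) across batches of pages.
        cache = ResponseCache(maxsize=256)

        # __aexit__ awaits aclose(), so the underlying OpenAI client is
        # released even if card generation raises.
        async with OpenAIClientManager() as manager:
            print("manager active:", manager is not None)
            # ... initialize the client and process crawled pages here ...

        # The synchronous form is also available:
        with OpenAIClientManager() as manager:
            pass


    asyncio.run(main())
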
ankigen_core/ui_logic.py CHANGED
@@ -250,18 +250,16 @@ def use_selected_subjects(subjects_df: pd.DataFrame | None):
     )
 
 
-def create_crawler_main_mode_elements() -> (
-    Tuple[
-        List[gr.components.Component],  # ui_components (url_input, max_depth, etc.)
-        gr.Button,  # crawl_button
-        gr.Progress,  # progress_bar
-        gr.Textbox,  # progress_status_textbox
-        gr.Textbox,  # custom_system_prompt
-        gr.Textbox,  # custom_user_prompt_template
-        gr.Checkbox,  # use_sitemap_checkbox
-        gr.Textbox,  # sitemap_url_textbox
-    ]
-):
+def create_crawler_main_mode_elements() -> Tuple[
+    List[gr.components.Component],  # ui_components (url_input, max_depth, etc.)
+    gr.Button,  # crawl_button
+    gr.Progress,  # progress_bar
+    gr.Textbox,  # progress_status_textbox
+    gr.Textbox,  # custom_system_prompt
+    gr.Textbox,  # custom_user_prompt_template
+    gr.Checkbox,  # use_sitemap_checkbox
+    gr.Textbox,  # sitemap_url_textbox
+]:
     """Creates the UI components for the Web Crawler mode integrated into the main tab."""
     ui_components: List[gr.components.Component] = []
 
ankigen_core/utils.py CHANGED
@@ -6,7 +6,6 @@ import sys
 import hashlib
 import requests
 from bs4 import BeautifulSoup
-from functools import lru_cache
 from typing import Any, Optional
 import time
 
@@ -14,7 +13,7 @@ import time
 _logger_instance = None
 
 
-def setup_logging():
+def setup_logging() -> logging.Logger:
     """Configure logging to both file and console"""
     global _logger_instance
     if _logger_instance:
@@ -49,7 +48,7 @@ def setup_logging():
     return logger
 
 
-def get_logger():
+def get_logger() -> logging.Logger:
     """Returns the initialized logger instance."""
     if _logger_instance is None:
         return setup_logging()
@@ -62,39 +61,65 @@ logger = get_logger()
 
 # --- Caching ---
 class ResponseCache:
-    """A simple cache for API responses using LRU for get operations."""
+    """Simple and efficient LRU cache for API responses with proper eviction."""
 
-    def __init__(self, maxsize=128):
-        # This internal method will be decorated by lru_cache
-        self._internal_get_from_dict = self._get_from_dict_actual
-        self._lru_cached_get = lru_cache(maxsize=maxsize)(self._internal_get_from_dict)
-        self._dict_cache = {}  # Main store for set operations
-
-    def _get_from_dict_actual(self, cache_key: str):
-        """Actual dictionary lookup, intended to be wrapped by lru_cache."""
-        logger.debug(f"Cache DICT GET: key={cache_key}")
-        return self._dict_cache.get(cache_key)
+    def __init__(self, maxsize: int = 128):
+        self.maxsize = maxsize
+        self._cache = {}  # {key: response}
+        self._access_order = []  # Track access order for LRU eviction
+        self.hits = 0
+        self.misses = 0
 
     def get(self, prompt: str, model: str) -> Optional[Any]:
-        """Retrieves an item from the cache. Uses LRU for this get path."""
+        """Retrieve item from cache, updating LRU order."""
         cache_key = self._create_key(prompt, model)
-        # Use the LRU cached getter which looks up in _dict_cache
-        return self._lru_cached_get(cache_key)
+
+        if cache_key in self._cache:
+            # Move to end (most recently used)
+            self._access_order.remove(cache_key)
+            self._access_order.append(cache_key)
+            self.hits += 1
+            logger.debug(
+                f"Cache HIT: {cache_key[:16]}... (hits={self.hits}, misses={self.misses})"
+            )
+            return self._cache[cache_key]
+
+        self.misses += 1
+        logger.debug(
+            f"Cache MISS: {cache_key[:16]}... (hits={self.hits}, misses={self.misses})"
+        )
+        return None
 
     def set(self, prompt: str, model: str, response: Any):
-        """Sets an item in the cache."""
+        """Store item in cache with LRU eviction when full."""
         cache_key = self._create_key(prompt, model)
-        logger.debug(f"Cache SET: key={cache_key}, type={type(response)}")
-        self._dict_cache[cache_key] = response
-        # To make the LRU cache aware of this new item for subsequent gets:
-        # We can call the LRU getter so it caches it, or clear specific lru entry if updating.
-        # For simplicity, if a new item is set, a subsequent get will fetch and cache it via LRU.
-        # Or, we can "prime" the lru_cache, but that's more complex.
-        # Current approach: set updates _dict_cache. Next get for this key will use _lru_cached_get,
-        # which will fetch from _dict_cache and then be LRU-managed.
+
+        # If key exists, update and move to end
+        if cache_key in self._cache:
+            self._access_order.remove(cache_key)
+        # If cache is full, evict least recently used
+        elif len(self._cache) >= self.maxsize:
+            evicted_key = self._access_order.pop(0)
+            del self._cache[evicted_key]
+            logger.debug(
+                f"Cache EVICT: {evicted_key[:16]}... (size={len(self._cache)})"
+            )
+
+        self._cache[cache_key] = response
+        self._access_order.append(cache_key)
+        logger.debug(f"Cache SET: {cache_key[:16]}... (size={len(self._cache)})")
+
+    def clear(self) -> None:
+        """Clear all cache entries and statistics."""
+        self._cache.clear()
+        self._access_order.clear()
+        self.hits = 0
+        self.misses = 0
+        logger.debug("Cache CLEARED")
 
     def _create_key(self, prompt: str, model: str) -> str:
-        """Creates a unique MD5 hash key for caching."""
+        """Create cache key from prompt and model (MD5 hash for size efficiency)."""
+        # Hash to keep keys manageable size while maintaining uniqueness
        return hashlib.md5(f"{model}:{prompt}".encode("utf-8")).hexdigest()
 
 
@@ -178,7 +203,7 @@ class RateLimiter:
         self.last_request_timestamp: float = 0.0
         # Use a lock if this were to be used by multiple threads, but for now assuming single thread access per instance
 
-    def wait(self):
+    def wait(self) -> None:
         """Blocks until it's safe to make the next request."""
         current_time = time.monotonic()  # Use monotonic clock for intervals
         time_since_last_request = current_time - self.last_request_timestamp
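
The rewritten ResponseCache tracks access order explicitly so set() can evict the least recently used entry once maxsize is reached, and it now exposes hit/miss counters plus clear(). A small usage sketch with a deliberately tiny cache (the prompt and model strings are arbitrary keys):

    from ankigen_core.utils import ResponseCache

    cache = ResponseCache(maxsize=2)

    cache.set("prompt-a", "model-x", {"cards": 3})
    cache.set("prompt-b", "model-x", {"cards": 5})

    cache.get("prompt-a", "model-x")  # hit: promotes prompt-a to most recent
    cache.set("prompt-c", "model-x", {"cards": 1})  # evicts prompt-b (the LRU entry)

    print(cache.get("prompt-b", "model-x"))  # None - evicted
    print(cache.get("prompt-a", "model-x"))  # {'cards': 3} - still cached
    print(f"hits={cache.hits} misses={cache.misses}")  # hits=2 misses=1

Each hit pays a list.remove on the access-order list, which is O(n) in the number of cached entries; at the default maxsize of 128 that cost is negligible.
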
pyproject.toml CHANGED
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "ankigen"
 version = "0.2.0"
-description = ""
+description = "AI-powered Anki flashcard generator using OpenAI GPT models with CLI and web interface"
 authors = [
     { name = "Justin", email = "[email protected]" },
 ]