# Module for utility functions (logging, caching, web fetching)
import logging
from logging.handlers import RotatingFileHandler
import sys
import hashlib
import requests
from bs4 import BeautifulSoup, FeatureNotFound
from functools import lru_cache
from typing import Any, Optional
# --- Logging Setup ---
_logger_instance = None
def setup_logging():
    """Configure logging to both file and console."""
    global _logger_instance
    if _logger_instance:
        return _logger_instance

    logger = logging.getLogger("ankigen")
    logger.setLevel(logging.DEBUG)  # Logger stays at DEBUG; handlers filter per destination

    # Prevent duplicate handlers if called multiple times (though get_logger should prevent this)
    if logger.hasHandlers():
        logger.handlers.clear()

    detailed_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(module)s:%(lineno)d - %(message)s"
    )
    simple_formatter = logging.Formatter("%(levelname)s: %(message)s")

    file_handler = RotatingFileHandler(
        "ankigen.log", maxBytes=1024 * 1024, backupCount=5
    )
    file_handler.setLevel(logging.DEBUG)  # File handler logs everything from DEBUG up
    file_handler.setFormatter(detailed_formatter)

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)  # Console handler logs INFO and above
    console_handler.setFormatter(simple_formatter)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    _logger_instance = logger
    return logger


def get_logger():
    """Returns the initialized logger instance, creating it on first use."""
    if _logger_instance is None:
        return setup_logging()
    return _logger_instance
# Initialize logger when module is loaded
logger = get_logger()
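# Example usage (an illustrative sketch, not part of the original module; the import
# path `ankigen.utils` is an assumption about how this file is packaged):
#
#     from ankigen.utils import get_logger
#     log = get_logger()
#     log.info("Starting card generation")   # printed to console and written to ankigen.log
#     log.debug("Full prompt payload: ...")  # written to ankigen.log only (console is INFO+)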
# --- Caching ---
class ResponseCache:
    """A simple cache for API responses using LRU for get operations."""

    def __init__(self, maxsize=128):
        # This internal method will be decorated by lru_cache
        self._internal_get_from_dict = self._get_from_dict_actual
        self._lru_cached_get = lru_cache(maxsize=maxsize)(self._internal_get_from_dict)
        self._dict_cache = {}  # Main store for set operations

    def _get_from_dict_actual(self, cache_key: str):
        """Actual dictionary lookup, intended to be wrapped by lru_cache."""
        logger.debug(f"Cache DICT GET: key={cache_key}")
        return self._dict_cache.get(cache_key)

    def get(self, prompt: str, model: str) -> Optional[Any]:
        """Retrieves an item from the cache. Uses LRU for this get path."""
        cache_key = self._create_key(prompt, model)
        # Use the LRU cached getter which looks up in _dict_cache
        return self._lru_cached_get(cache_key)
    def set(self, prompt: str, model: str, response: Any):
        """Sets an item in the cache."""
        cache_key = self._create_key(prompt, model)
        logger.debug(f"Cache SET: key={cache_key}, type={type(response)}")
        self._dict_cache[cache_key] = response
        # The LRU wrapper may already hold a stale entry for this key (e.g. a cached
        # miss from an earlier get). lru_cache offers no per-key invalidation, so clear
        # it wholesale; the next get() re-reads the fresh value from _dict_cache and is
        # then LRU-managed again.
        self._lru_cached_get.cache_clear()
    def _create_key(self, prompt: str, model: str) -> str:
        """Creates a unique MD5 hash key for caching."""
        return hashlib.md5(f"{model}:{prompt}".encode("utf-8")).hexdigest()
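# Example usage of ResponseCache (an illustrative sketch; `call_model` is a hypothetical
# function standing in for whatever API client the caller uses):
#
#     cache = ResponseCache(maxsize=256)
#     response = cache.get(prompt, model)
#     if response is None:
#         response = call_model(prompt, model)  # hypothetical, not defined in this module
#         cache.set(prompt, model, response)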
# --- Web Content Fetching ---
def fetch_webpage_text(url: str) -> str:
    """Fetches and extracts main text content from a URL."""
    logger_util = get_logger()  # Use the logger from this module
    try:
        logger_util.info(f"Fetching content from URL: {url}")
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        logger_util.debug(f"Parsing HTML content for {url}")
        try:
            soup = BeautifulSoup(response.text, "lxml")
        except FeatureNotFound:  # lxml parser not installed; keep existing html.parser fallback
            logger_util.warning("lxml not found, using html.parser instead.")
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:  # Catch other BeautifulSoup init errors
            logger_util.error(
                f"BeautifulSoup initialization failed for {url}: {e}", exc_info=True
            )
            raise RuntimeError(f"Failed to parse HTML content for {url}.") from e
        for script_or_style in soup(["script", "style"]):
            script_or_style.extract()

        main_content = soup.find("main")
        if not main_content:
            main_content = soup.find("article")

        if main_content:
            text = main_content.get_text()
            logger_util.debug(f"Extracted text from <{main_content.name}> tag.")
        else:
            body = soup.find("body")
            if body:
                text = body.get_text()
                logger_util.debug("Extracted text from <body> tag (fallback).")
            else:
                text = ""
                logger_util.warning(f"Could not find <body> tag in {url}")

        # Simpler text cleaning: join stripped lines
        lines = (line.strip() for line in text.splitlines())
        cleaned_text = "\n".join(line for line in lines if line)

        if not cleaned_text:
            logger_util.warning(f"Could not extract meaningful text from {url}")
            return ""

        logger_util.info(
            f"Successfully extracted text from {url} (Length: {len(cleaned_text)} chars)"
        )
        return cleaned_text
    except requests.exceptions.RequestException as e:
        logger_util.error(f"Network error fetching URL {url}: {e}", exc_info=True)
        raise ConnectionError(f"Could not fetch URL: {e}") from e
    except Exception as e:
        logger_util.error(f"Error processing URL {url}: {e}", exc_info=True)
        if isinstance(e, (ValueError, ConnectionError, RuntimeError)):
            raise  # Re-raise known error types unchanged; bare raise preserves the traceback
        raise RuntimeError(
            f"An unexpected error occurred while processing the URL: {e}"
        ) from e
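

# Minimal manual smoke test (a sketch under the assumption this file can be run directly,
# e.g. `python utils.py <url>`; the default URL below is only a placeholder).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Fetch and print cleaned text from a URL")
    parser.add_argument("url", nargs="?", default="https://example.com")
    args = parser.parse_args()

    # Exercise the cache round-trip with dummy values
    cache = ResponseCache(maxsize=8)
    cache.set("demo-prompt", "demo-model", {"answer": 42})
    logger.info(f"Cache round-trip: {cache.get('demo-prompt', 'demo-model')}")

    # Exercise the fetcher; failures are logged rather than crashing the demo
    try:
        text = fetch_webpage_text(args.url)
        logger.info(f"First 200 chars: {text[:200]!r}")
    except (ConnectionError, RuntimeError) as exc:
        logger.error(f"Fetch failed: {exc}")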