import logging import asyncio import nest_asyncio import os import json import httpx import subprocess import sys from config import settings from pydantic import BaseModel, Field logger = logging.getLogger(__name__) def ensure_playwright_installed(): """Ensure Playwright browsers are installed, especially for Hugging Face deployment.""" try: # Check if we're in a Hugging Face environment if os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID') or os.getenv('SPACES_BUILDKIT_VERSION'): logger.info("Detected Hugging Face environment, checking Playwright installation...") # Try to install Playwright browsers result = subprocess.run([ sys.executable, "-m", "playwright", "install", "chromium" ], capture_output=True, text=True, timeout=300) if result.returncode == 0: logger.info("Playwright browsers installed successfully") return True else: logger.warning(f"Playwright installation failed: {result.stderr}") return False else: logger.info("Not in Hugging Face environment, assuming Playwright is available") return True except subprocess.TimeoutExpired: logger.error("Playwright installation timed out") return False except Exception as e: logger.error(f"Error installing Playwright browsers: {e}") return False class FormattingRules(BaseModel): """Schema for formatting rules extraction""" margins: str = Field(description="Margin requirements for the manuscript") font: str = Field(description="Font requirements including size, type, etc.") line_spacing: str = Field(description="Line spacing requirements") citations: str = Field(description="Citation style and formatting requirements") sections: str = Field(description="Required sections and their structure") other_rules: str = Field(description="Any other formatting requirements") summary: str = Field(description="A brief summary of the key formatting requirements") def format_rules_for_display(rules_data): """ Format the extracted rules data into a readable markdown string. """ if not rules_data: return "Could not extract formatting rules from the provided URL." formatted_rules = f""" # Manuscript Formatting Guidelines ## Margins {rules_data.get('margins', 'Not specified')} ## Font {rules_data.get('font', 'Not specified')} ## Line Spacing {rules_data.get('line_spacing', 'Not specified')} ## Citations {rules_data.get('citations', 'Not specified')} ## Section Structure {rules_data.get('sections', 'Not specified')} ## Other Requirements {rules_data.get('other_rules', 'Not specified')} ## Summary {rules_data.get('summary', 'Not specified')} """ return formatted_rules def get_rules_from_url(url: str) -> str: """ Extracts formatting rules from a given URL using crawl4ai. """ logger.info(f"Extracting rules from URL: {url}") # Ensure Playwright is installed (especially for Hugging Face) playwright_available = ensure_playwright_installed() # Apply nest_asyncio here, when the function is called nest_asyncio.apply() # Import crawl4ai modules here to avoid event loop issues at module level try: from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig from crawl4ai.extraction_strategy import LLMExtractionStrategy except ImportError as e: logger.error(f"Failed to import crawl4ai: {e}") return f"Failed to import required modules for web crawling. Error: {str(e)}" if not playwright_available: logger.warning("Playwright installation failed, falling back to simple HTTP request and raw HTML processing") try: with httpx.Client() as client: response = client.get(url, follow_redirects=True) response.raise_for_status() raw_html = response.text # Use crawl4ai to process the raw HTML url = f"raw:{raw_html}" except Exception as e: return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}" async def _extract_rules_async(url: str) -> str: """ Asynchronously extracts formatting rules from a given URL using crawl4ai. """ # Configure the LLM extraction extraction_strategy = LLMExtractionStrategy( llm_config=LLMConfig( provider=f"{settings.llm_provider}/{settings.llm_model_name}", api_token=settings.openrouter_api_key ), schema=FormattingRules.schema(), extraction_type="schema", instruction=""" From the crawled content, extract all formatting rules for manuscript submissions. Focus on requirements for margins, font, line spacing, citations, section structure, and any other formatting guidelines. Provide a comprehensive extraction of all formatting-related information. If a specific requirement is not mentioned in the content, include "Not specified" in the corresponding field. """ ) # Configure the crawler run_config = CrawlerRunConfig( word_count_threshold=10, exclude_external_links=True, process_iframes=True, remove_overlay_elements=True, exclude_social_media_links=True, check_robots_txt=True, semaphore_count=3, extraction_strategy=extraction_strategy ) # Initialize the crawler and run try: async with AsyncWebCrawler() as crawler: try: result = await crawler.arun( url=url, config=run_config ) logger.info(f"Crawler result for {url}: {result}") # Handle robots.txt blocking if not result.success and "robots.txt" in str(result.error_message): logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.") try: with httpx.Client() as client: response = client.get(url, follow_redirects=True) response.raise_for_status() raw_html = response.text logger.info(f"Successfully downloaded HTML content for {url}.") # Re-run crawl4ai with raw HTML raw_html_url = f"raw:{raw_html}" result = await crawler.arun(url=raw_html_url, config=run_config) logger.info(f"Crawler result for raw HTML: {result}") except httpx.HTTPStatusError as e: logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True) return "Failed to download the page content after being blocked by robots.txt." except Exception as e: logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True) return "An error occurred during the fallback extraction process." except Exception as e: logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True) return "An error occurred while trying to extract formatting rules." except Exception as e: logger.error(f"Failed to initialize AsyncWebCrawler: {e}", exc_info=True) # Fallback to simple HTTP request if crawler initialization fails try: with httpx.Client() as client: response = client.get(url, follow_redirects=True) response.raise_for_status() return f"# Formatting Rules (Fallback Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Browser-based extraction failed, showing raw content. Please review manually.*" except Exception as fallback_e: logger.error(f"Fallback HTTP request also failed: {fallback_e}") return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}" if result.success and result.extracted_content: # The extracted content is often a list containing a JSON string. raw_data = result.extracted_content if isinstance(raw_data, list) and len(raw_data) > 0: raw_data = raw_data[0] # Ensure we have a dictionary to work with if isinstance(raw_data, str): try: rules_data = json.loads(raw_data) # If the parsed data is a list, take the first element if isinstance(rules_data, list) and len(rules_data) > 0: rules_data = rules_data[0] except json.JSONDecodeError: logger.error(f"Failed to parse JSON from extracted content: {raw_data}") return "Failed to parse the extracted formatting rules." elif isinstance(raw_data, dict): rules_data = raw_data else: logger.warning(f"Unexpected type for extracted content: {type(raw_data)}") return "Could not process the extracted formatting rules." # Store the raw data for debugging logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}") # Format the rules for display formatted_rules = format_rules_for_display(rules_data) if not formatted_rules: return "Failed to format the extracted rules." logger.info(f"Formatted rules: {formatted_rules[:100]}...") return formatted_rules elif result.success and result.markdown: # Fallback to markdown if structured extraction fails logger.info(f"Extraction failed, falling back to markdown for {url}") return result.markdown else: logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}") return "Could not extract formatting rules from the provided URL. The crawler did not return any content." # Run the async function using the patched event loop return asyncio.run(_extract_rules_async(url))