import asyncio
import json
import logging
import os
import subprocess
import sys

import httpx
import nest_asyncio
from pydantic import BaseModel, Field

from config import settings

logger = logging.getLogger(__name__)

def ensure_playwright_installed():
    """Ensure Playwright browsers are installed, especially for Hugging Face deployment."""
    try:
        # Check if we're in a Hugging Face environment
        if os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID') or os.getenv('SPACES_BUILDKIT_VERSION'):
            logger.info("Detected Hugging Face environment, checking Playwright installation...")
            
            # Try to install Playwright browsers
            result = subprocess.run([
                sys.executable, "-m", "playwright", "install", "chromium"
            ], capture_output=True, text=True, timeout=300)
            
            if result.returncode == 0:
                logger.info("Playwright browsers installed successfully")
                return True
            else:
                logger.warning(f"Playwright installation failed: {result.stderr}")
                return False
                
        else:
            logger.info("Not in Hugging Face environment, assuming Playwright is available")
            return True
            
    except subprocess.TimeoutExpired:
        logger.error("Playwright installation timed out")
        return False
    except Exception as e:
        logger.error(f"Error installing Playwright browsers: {e}")
        return False

class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
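
# Illustrative example of the shape the LLM is asked to fill in (the values
# below are made up, not taken from any real journal's guidelines):
#
#   FormattingRules(
#       margins="1 inch on all sides",
#       font="12 pt Times New Roman",
#       line_spacing="Double-spaced throughout",
#       citations="APA 7th edition",
#       sections="Title page, abstract, main text, references",
#       other_rules="Number all pages in the top-right corner",
#       summary="Standard APA-style manuscript formatting",
#   )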

def format_rules_for_display(rules_data):
    """
    Format the extracted rules data into a readable markdown string.
    """
    if not rules_data:
        return "Could not extract formatting rules from the provided URL."
    
    formatted_rules = f"""
# Manuscript Formatting Guidelines

## Margins
{rules_data.get('margins', 'Not specified')}

## Font
{rules_data.get('font', 'Not specified')}

## Line Spacing
{rules_data.get('line_spacing', 'Not specified')}

## Citations
{rules_data.get('citations', 'Not specified')}

## Section Structure
{rules_data.get('sections', 'Not specified')}

## Other Requirements
{rules_data.get('other_rules', 'Not specified')}

## Summary
{rules_data.get('summary', 'Not specified')}
"""
    return formatted_rules
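
# Example (a sketch with made-up values): given
#
#   {"margins": "1 inch on all sides", "font": "12 pt Times New Roman"}
#
# this returns markdown beginning with:
#
#   # Manuscript Formatting Guidelines
#
#   ## Margins
#   1 inch on all sides
#
# and every key missing from the dict falls back to "Not specified".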

def get_rules_from_url(url: str) -> str:
    """
    Extracts formatting rules from a given URL using crawl4ai.
    """
    logger.info(f"Extracting rules from URL: {url}")
    
    # Ensure Playwright is installed (especially for Hugging Face)
    playwright_available = ensure_playwright_installed()
    
    # Apply nest_asyncio here, when the function is called
    nest_asyncio.apply()
    
    # Import crawl4ai modules here to avoid event loop issues at module level
    try:
        from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
        from crawl4ai.extraction_strategy import LLMExtractionStrategy
    except ImportError as e:
        logger.error(f"Failed to import crawl4ai: {e}")
        return f"Failed to import required modules for web crawling. Error: {str(e)}"

    if not playwright_available:
        logger.warning("Playwright installation failed, falling back to simple HTTP request and raw HTML processing")
        try:
            with httpx.Client() as client:
                response = client.get(url, follow_redirects=True)
                response.raise_for_status()
                raw_html = response.text
                # Hand the raw HTML to crawl4ai via its raw: URL scheme so
                # extraction can run without a browser-based fetch
                url = f"raw:{raw_html}"
        except Exception as e:
            return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}"

    async def _extract_rules_async(url: str) -> str:
        """
        Asynchronously extracts formatting rules from a given URL using crawl4ai.
        """
        
        # Configure the LLM extraction
        extraction_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=f"{settings.llm_provider}/{settings.llm_model_name}",
                api_token=settings.openrouter_api_key
            ),
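            # NOTE: `.schema()` below is the Pydantic v1 API; it still works on
            # Pydantic v2 but emits a deprecation warning. On v2, prefer
            # FormattingRules.model_json_schema().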
            schema=FormattingRules.schema(),
            extraction_type="schema",
            instruction="""
            From the crawled content, extract all formatting rules for manuscript submissions.
            Focus on requirements for margins, font, line spacing, citations, section structure,
            and any other formatting guidelines. Provide a comprehensive extraction of all
            formatting-related information.
            
            If a specific requirement is not mentioned in the content, include "Not specified" in the corresponding field.
            """
        )
        
        # Configure the crawler
        run_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True,
            exclude_social_media_links=True,
            check_robots_txt=True,
            semaphore_count=3,
            extraction_strategy=extraction_strategy
        )

        # Initialize the crawler and run
        try:
            async with AsyncWebCrawler() as crawler:
                try:
                    result = await crawler.arun(
                        url=url,
                        config=run_config
                    )
                    logger.info(f"Crawler result for {url}: {result}")

                    # Handle robots.txt blocking
                    if not result.success and "robots.txt" in str(result.error_message):
                        logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
                        try:
                            with httpx.Client() as client:
                                response = client.get(url, follow_redirects=True)
                                response.raise_for_status()
                            
                            raw_html = response.text
                            logger.info(f"Successfully downloaded HTML content for {url}.")
                            
                            # Re-run crawl4ai with raw HTML
                            raw_html_url = f"raw:{raw_html}"
                            result = await crawler.arun(url=raw_html_url, config=run_config)
                            logger.info(f"Crawler result for raw HTML: {result}")

                        except httpx.HTTPStatusError as e:
                            logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
                            return "Failed to download the page content after being blocked by robots.txt."
                        except Exception as e:
                            logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
                            return "An error occurred during the fallback extraction process."

                except Exception as e:
                    logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
                    return "An error occurred while trying to extract formatting rules."
        
        except Exception as e:
            logger.error(f"Failed to initialize AsyncWebCrawler: {e}", exc_info=True)
            # Fallback to simple HTTP request if crawler initialization fails
            try:
                with httpx.Client() as client:
                    response = client.get(url, follow_redirects=True)
                    response.raise_for_status()
                    return f"# Formatting Rules (Fallback Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Browser-based extraction failed, showing raw content. Please review manually.*"
            except Exception as fallback_e:
                logger.error(f"Fallback HTTP request also failed: {fallback_e}")
                return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Errors: {e}; {fallback_e}"
        
        if result.success and result.extracted_content:
            # The extracted content is often a list containing a JSON string.
            raw_data = result.extracted_content
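            # Illustrative shapes this value may take (payloads are made up);
            # the branches below normalize all of them to a single dict:
            #   ['{"margins": "1 inch", ...}']  - list wrapping a JSON string
            #   '{"margins": "1 inch", ...}'    - bare JSON string
            #   {"margins": "1 inch", ...}      - already a dict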
            if isinstance(raw_data, list) and len(raw_data) > 0:
                raw_data = raw_data[0]

            # Ensure we have a dictionary to work with
            if isinstance(raw_data, str):
                try:
                    rules_data = json.loads(raw_data)
                    # If the parsed data is a list, take the first element
                    if isinstance(rules_data, list) and len(rules_data) > 0:
                        rules_data = rules_data[0]
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
                    return "Failed to parse the extracted formatting rules."
            elif isinstance(raw_data, dict):
                rules_data = raw_data
            else:
                logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
                return "Could not process the extracted formatting rules."

            # Log the parsed data for debugging
            logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
            
            # Format the rules for display
            formatted_rules = format_rules_for_display(rules_data)
            if not formatted_rules:
                return "Failed to format the extracted rules."
            
            logger.info(f"Formatted rules: {formatted_rules[:100]}...")
            return formatted_rules
        elif result.success and result.markdown:
            # Fallback to markdown if structured extraction fails
            logger.info(f"Structured extraction returned no content; falling back to raw markdown for {url}")
            return result.markdown
        else:
            logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
            return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
    
    # Run the async function using the patched event loop
    return asyncio.run(_extract_rules_async(url))
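

if __name__ == "__main__":
    # Minimal manual smoke test (a sketch, not part of the module's interface);
    # the URL below is a placeholder, not a page this module is known to support.
    logging.basicConfig(level=logging.INFO)
    print(get_rules_from_url("https://example.com/submission-guidelines"))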