import logging
import asyncio
import nest_asyncio
import os
import json
import httpx
import subprocess
import sys

from config import settings
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)


def ensure_playwright_installed():
    """Ensure Playwright browsers are installed, especially for Hugging Face deployment."""
    try:
        # Check if we're in a Hugging Face environment
        if os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID') or os.getenv('SPACES_BUILDKIT_VERSION'):
            logger.info("Detected Hugging Face environment, checking Playwright installation...")
            # Try to install Playwright browsers
            result = subprocess.run([
                sys.executable, "-m", "playwright", "install", "chromium"
            ], capture_output=True, text=True, timeout=300)
            if result.returncode == 0:
                logger.info("Playwright browsers installed successfully")
                return True
            else:
                logger.warning(f"Playwright installation failed: {result.stderr}")
                return False
        else:
            logger.info("Not in Hugging Face environment, assuming Playwright is available")
            return True
    except subprocess.TimeoutExpired:
        logger.error("Playwright installation timed out")
        return False
    except Exception as e:
        logger.error(f"Error installing Playwright browsers: {e}")
        return False


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
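
# Illustrative only: an assumed example of the payload the LLM extraction
# returns for the FormattingRules schema. The values below are invented for
# illustration; real values depend on the crawled page.
#
#   {
#       "margins": "1 inch (2.54 cm) on all sides",
#       "font": "12 pt Times New Roman",
#       "line_spacing": "Double-spaced throughout",
#       "citations": "APA 7th edition",
#       "sections": "Title page, abstract, main text, references",
#       "other_rules": "Not specified",
#       "summary": "Standard double-spaced manuscript with APA citations"
#   }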


def format_rules_for_display(rules_data):
    """
    Format the extracted rules data into a readable markdown string.
    """
    if not rules_data:
        return "Could not extract formatting rules from the provided URL."
    formatted_rules = f"""
# Manuscript Formatting Guidelines

## Margins
{rules_data.get('margins', 'Not specified')}

## Font
{rules_data.get('font', 'Not specified')}

## Line Spacing
{rules_data.get('line_spacing', 'Not specified')}

## Citations
{rules_data.get('citations', 'Not specified')}

## Section Structure
{rules_data.get('sections', 'Not specified')}

## Other Requirements
{rules_data.get('other_rules', 'Not specified')}

## Summary
{rules_data.get('summary', 'Not specified')}
"""
    return formatted_rules
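
# Hypothetical usage sketch (not from the original source): any key missing
# from the input dict renders as "Not specified" in the output markdown, e.g.
#   format_rules_for_display({"margins": "1 inch on all sides"})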


def get_rules_from_url(url: str) -> str:
    """
    Extracts formatting rules from a given URL using crawl4ai.
    """
    logger.info(f"Extracting rules from URL: {url}")

    # Ensure Playwright is installed (especially for Hugging Face)
    playwright_available = ensure_playwright_installed()

    # Apply nest_asyncio here, when the function is called, so asyncio.run()
    # below works even if an event loop is already running
    nest_asyncio.apply()

    # Import crawl4ai modules here to avoid event loop issues at module level
    try:
        from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
        from crawl4ai.extraction_strategy import LLMExtractionStrategy
    except ImportError as e:
        logger.error(f"Failed to import crawl4ai: {e}")
        return f"Failed to import required modules for web crawling. Error: {str(e)}"

    if not playwright_available:
        logger.warning("Playwright installation failed, falling back to a simple HTTP request and raw HTML processing")
        try:
            with httpx.Client() as client:
                response = client.get(url, follow_redirects=True)
                response.raise_for_status()
                raw_html = response.text
            # Hand the downloaded HTML to crawl4ai via its raw: URL scheme
            url = f"raw:{raw_html}"
        except Exception as e:
            return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}"

    async def _extract_rules_async(url: str) -> str:
        """
        Asynchronously extracts formatting rules from a given URL using crawl4ai.
        """
        # Configure the LLM extraction
        extraction_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=f"{settings.llm_provider}/{settings.llm_model_name}",
                api_token=settings.openrouter_api_key
            ),
            schema=FormattingRules.schema(),
            extraction_type="schema",
            instruction="""
            From the crawled content, extract all formatting rules for manuscript submissions.
            Focus on requirements for margins, font, line spacing, citations, section structure,
            and any other formatting guidelines. Provide a comprehensive extraction of all
            formatting-related information.
            If a specific requirement is not mentioned in the content, include "Not specified" in the corresponding field.
            """
        )

        # Configure the crawler
        run_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True,
            exclude_social_media_links=True,
            check_robots_txt=True,
            semaphore_count=3,
            extraction_strategy=extraction_strategy
        )

        # Initialize the crawler and run
        try:
            async with AsyncWebCrawler() as crawler:
                try:
                    result = await crawler.arun(
                        url=url,
                        config=run_config
                    )
                    logger.info(f"Crawler result for {url}: {result}")
                    # Handle robots.txt blocking
                    if not result.success and "robots.txt" in str(result.error_message):
                        logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
                        try:
                            with httpx.Client() as client:
                                response = client.get(url, follow_redirects=True)
                                response.raise_for_status()
                                raw_html = response.text
                            logger.info(f"Successfully downloaded HTML content for {url}.")
                            # Re-run crawl4ai with the raw HTML
                            raw_html_url = f"raw:{raw_html}"
                            result = await crawler.arun(url=raw_html_url, config=run_config)
                            logger.info(f"Crawler result for raw HTML: {result}")
                        except httpx.HTTPStatusError as e:
                            logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
                            return "Failed to download the page content after being blocked by robots.txt."
                        except Exception as e:
                            logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
                            return "An error occurred during the fallback extraction process."
                except Exception as e:
                    logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
                    return "An error occurred while trying to extract formatting rules."
        except Exception as e:
            logger.error(f"Failed to initialize AsyncWebCrawler: {e}", exc_info=True)
            # Fall back to a simple HTTP request if crawler initialization fails
            try:
                with httpx.Client() as client:
                    response = client.get(url, follow_redirects=True)
                    response.raise_for_status()
                    return (
                        f"# Formatting Rules (Fallback Extraction)\n\n"
                        f"Extracted from: {url}\n\n"
                        f"{response.text[:2000]}...\n\n"
                        f"*Note: Browser-based extraction failed, showing raw content. Please review manually.*"
                    )
            except Exception as fallback_e:
                logger.error(f"Fallback HTTP request also failed: {fallback_e}")
                return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"

        if result.success and result.extracted_content:
            # The extracted content is often a list containing a JSON string.
            raw_data = result.extracted_content
            if isinstance(raw_data, list) and len(raw_data) > 0:
                raw_data = raw_data[0]
            # Ensure we have a dictionary to work with
            if isinstance(raw_data, str):
                try:
                    rules_data = json.loads(raw_data)
                    # If the parsed data is a list, take the first element
                    if isinstance(rules_data, list) and len(rules_data) > 0:
                        rules_data = rules_data[0]
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
                    return "Failed to parse the extracted formatting rules."
            elif isinstance(raw_data, dict):
                rules_data = raw_data
            else:
                logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
                return "Could not process the extracted formatting rules."

            # Log the parsed data for debugging
            logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")

            # Format the rules for display
            formatted_rules = format_rules_for_display(rules_data)
            if not formatted_rules:
                return "Failed to format the extracted rules."
            logger.info(f"Formatted rules: {formatted_rules[:100]}...")
            return formatted_rules
        elif result.success and result.markdown:
            # Fall back to the page markdown if structured extraction produced nothing
            logger.info(f"Structured extraction produced no content, falling back to markdown for {url}")
            return result.markdown
        else:
            logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
            return "Could not extract formatting rules from the provided URL. The crawler did not return any content."

    # Run the async function using the patched event loop
    return asyncio.run(_extract_rules_async(url))
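

if __name__ == "__main__":
    # Minimal manual-test sketch, not part of the original module. The URL is
    # a placeholder, and a populated config.settings (LLM provider, model name,
    # OpenRouter API key) is assumed.
    logging.basicConfig(level=logging.INFO)
    print(get_rules_from_url("https://example.com/submission-guidelines"))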