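"""
Extract manuscript formatting rules from a publisher's guidelines URL.

Uses crawl4ai's AsyncWebCrawler with an LLM extraction strategy to pull structured
formatting requirements (margins, font, line spacing, citations, section structure),
falling back to a plain httpx download when Playwright is unavailable or the crawl
is blocked by robots.txt.
"""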

import logging
import asyncio
import nest_asyncio
import os
import json
import httpx
import subprocess
import sys

from config import settings
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)


def ensure_playwright_installed():
    """Ensure Playwright browsers are installed, especially for Hugging Face deployment."""
    try:
        # Check if we're in a Hugging Face environment
        if os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID') or os.getenv('SPACES_BUILDKIT_VERSION'):
            logger.info("Detected Hugging Face environment, checking Playwright installation...")
            # Try to install Playwright browsers
            result = subprocess.run([
                sys.executable, "-m", "playwright", "install", "chromium"
            ], capture_output=True, text=True, timeout=300)
            if result.returncode == 0:
                logger.info("Playwright browsers installed successfully")
                return True
            else:
                logger.warning(f"Playwright installation failed: {result.stderr}")
                return False
        else:
            logger.info("Not in Hugging Face environment, assuming Playwright is available")
            return True
    except subprocess.TimeoutExpired:
        logger.error("Playwright installation timed out")
        return False
    except Exception as e:
        logger.error(f"Error installing Playwright browsers: {e}")
        return False


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
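
# Illustrative only: the LLM extraction is expected to return JSON matching the
# FormattingRules schema above, e.g. (hypothetical values):
# {
#     "margins": "1-inch (2.54 cm) on all sides",
#     "font": "12 pt Times New Roman",
#     "line_spacing": "Double-spaced throughout",
#     "citations": "APA 7th edition",
#     "sections": "Title page, abstract, main text, references",
#     "other_rules": "Not specified",
#     "summary": "Standard double-spaced manuscript with APA citations"
# }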


def format_rules_for_display(rules_data):
    """
    Format the extracted rules data into a readable markdown string.
    """
    if not rules_data:
        return "Could not extract formatting rules from the provided URL."

    formatted_rules = f"""
# Manuscript Formatting Guidelines

## Margins
{rules_data.get('margins', 'Not specified')}

## Font
{rules_data.get('font', 'Not specified')}

## Line Spacing
{rules_data.get('line_spacing', 'Not specified')}

## Citations
{rules_data.get('citations', 'Not specified')}

## Section Structure
{rules_data.get('sections', 'Not specified')}

## Other Requirements
{rules_data.get('other_rules', 'Not specified')}

## Summary
{rules_data.get('summary', 'Not specified')}
"""
    return formatted_rules


def get_rules_from_url(url: str) -> str:
    """
    Extracts formatting rules from a given URL using crawl4ai.
    """
    logger.info(f"Extracting rules from URL: {url}")

    # Ensure Playwright is installed (especially for Hugging Face)
    playwright_available = ensure_playwright_installed()

    # Apply nest_asyncio here, when the function is called
    nest_asyncio.apply()

    # Import crawl4ai modules here to avoid event loop issues at module level
    try:
        from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
        from crawl4ai.extraction_strategy import LLMExtractionStrategy
    except ImportError as e:
        logger.error(f"Failed to import crawl4ai: {e}")
        return f"Failed to import required modules for web crawling. Error: {str(e)}"

    if not playwright_available:
        logger.warning("Playwright installation failed, falling back to simple HTTP request and raw HTML processing")
        try:
            with httpx.Client() as client:
                response = client.get(url, follow_redirects=True)
                response.raise_for_status()
                raw_html = response.text
                # Use crawl4ai to process the raw HTML
                url = f"raw:{raw_html}"
        except Exception as e:
            return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}"

    async def _extract_rules_async(url: str) -> str:
        """
        Asynchronously extracts formatting rules from a given URL using crawl4ai.
        """
        # Configure the LLM extraction
        extraction_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=f"{settings.llm_provider}/{settings.llm_model_name}",
                api_token=settings.openrouter_api_key
            ),
            schema=FormattingRules.schema(),
            extraction_type="schema",
            instruction="""
            From the crawled content, extract all formatting rules for manuscript submissions.
            Focus on requirements for margins, font, line spacing, citations, section structure,
            and any other formatting guidelines. Provide a comprehensive extraction of all
            formatting-related information.
            If a specific requirement is not mentioned in the content, include "Not specified" in the corresponding field.
            """
        )

        # Configure the crawler
        run_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True,
            exclude_social_media_links=True,
            check_robots_txt=True,
            semaphore_count=3,
            extraction_strategy=extraction_strategy
        )

        # Initialize the crawler and run
        try:
            async with AsyncWebCrawler() as crawler:
                try:
                    result = await crawler.arun(
                        url=url,
                        config=run_config
                    )
                    logger.info(f"Crawler result for {url}: {result}")

                    # Handle robots.txt blocking
                    if not result.success and "robots.txt" in str(result.error_message):
                        logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
                        try:
                            with httpx.Client() as client:
                                response = client.get(url, follow_redirects=True)
                                response.raise_for_status()
                                raw_html = response.text
                                logger.info(f"Successfully downloaded HTML content for {url}.")

                                # Re-run crawl4ai with raw HTML
                                raw_html_url = f"raw:{raw_html}"
                                result = await crawler.arun(url=raw_html_url, config=run_config)
                                logger.info(f"Crawler result for raw HTML: {result}")
                        except httpx.HTTPStatusError as e:
                            logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
                            return "Failed to download the page content after being blocked by robots.txt."
                        except Exception as e:
                            logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
                            return "An error occurred during the fallback extraction process."
                except Exception as e:
                    logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
                    return "An error occurred while trying to extract formatting rules."
        except Exception as e:
            logger.error(f"Failed to initialize AsyncWebCrawler: {e}", exc_info=True)
            # Fallback to simple HTTP request if crawler initialization fails
            try:
                with httpx.Client() as client:
                    response = client.get(url, follow_redirects=True)
                    response.raise_for_status()
                    return f"# Formatting Rules (Fallback Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Browser-based extraction failed, showing raw content. Please review manually.*"
            except Exception as fallback_e:
                logger.error(f"Fallback HTTP request also failed: {fallback_e}")
                return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"

        if result.success and result.extracted_content:
            # The extracted content is often a list containing a JSON string.
            raw_data = result.extracted_content
            if isinstance(raw_data, list) and len(raw_data) > 0:
                raw_data = raw_data[0]

            # Ensure we have a dictionary to work with
            if isinstance(raw_data, str):
                try:
                    rules_data = json.loads(raw_data)
                    # If the parsed data is a list, take the first element
                    if isinstance(rules_data, list) and len(rules_data) > 0:
                        rules_data = rules_data[0]
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
                    return "Failed to parse the extracted formatting rules."
            elif isinstance(raw_data, dict):
                rules_data = raw_data
            else:
                logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
                return "Could not process the extracted formatting rules."

            # Store the raw data for debugging
            logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")

            # Format the rules for display
            formatted_rules = format_rules_for_display(rules_data)
            if not formatted_rules:
                return "Failed to format the extracted rules."

            logger.info(f"Formatted rules: {formatted_rules[:100]}...")
            return formatted_rules
        elif result.success and result.markdown:
            # Fallback to markdown if structured extraction fails
            logger.info(f"Extraction failed, falling back to markdown for {url}")
            return result.markdown
        else:
            logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
            return "Could not extract formatting rules from the provided URL. The crawler did not return any content."

    # Run the async function using the patched event loop
    return asyncio.run(_extract_rules_async(url))
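

if __name__ == "__main__":
    # Illustrative usage sketch (not part of the original module): the guidelines URL
    # below is a placeholder, and this assumes `config.settings` supplies valid
    # llm_provider / llm_model_name / openrouter_api_key values.
    logging.basicConfig(level=logging.INFO)
    example_url = "https://example.com/submission-guidelines"  # hypothetical URL
    print(get_rules_from_url(example_url))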