# FormatReview / rule_extractor.py
import logging
import asyncio
import nest_asyncio
import os
import json
import httpx
import subprocess
import sys

from config import settings
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)


def ensure_playwright_installed():
    """Ensure Playwright browsers are installed, especially for Hugging Face deployment."""
    try:
        # Check if we're in a Hugging Face environment
        if os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID') or os.getenv('SPACES_BUILDKIT_VERSION'):
            logger.info("Detected Hugging Face environment, checking Playwright installation...")
            # Try to install Playwright browsers
            result = subprocess.run([
                sys.executable, "-m", "playwright", "install", "chromium"
            ], capture_output=True, text=True, timeout=300)
            if result.returncode == 0:
                logger.info("Playwright browsers installed successfully")
                return True
            else:
                logger.warning(f"Playwright installation failed: {result.stderr}")
                return False
        else:
            logger.info("Not in Hugging Face environment, assuming Playwright is available")
            return True
    except subprocess.TimeoutExpired:
        logger.error("Playwright installation timed out")
        return False
    except Exception as e:
        logger.error(f"Error installing Playwright browsers: {e}")
        return False


class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
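
# Illustrative only (not part of the original module): the LLM extraction is
# expected to return JSON matching the schema above, so a hypothetical payload
# such as the following would validate cleanly.
#
#     FormattingRules(
#         margins="1 inch (2.54 cm) on all sides",
#         font="12 pt Times New Roman",
#         line_spacing="Double-spaced throughout",
#         citations="APA 7th edition, author-date",
#         sections="Title page, abstract, body, references",
#         other_rules="Page numbers in the top-right corner",
#         summary="Standard APA-style manuscript formatting",
#     )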


def format_rules_for_display(rules_data):
    """
    Format the extracted rules data into a readable markdown string.
    """
    if not rules_data:
        return "Could not extract formatting rules from the provided URL."
    formatted_rules = f"""
# Manuscript Formatting Guidelines

## Margins
{rules_data.get('margins', 'Not specified')}

## Font
{rules_data.get('font', 'Not specified')}

## Line Spacing
{rules_data.get('line_spacing', 'Not specified')}

## Citations
{rules_data.get('citations', 'Not specified')}

## Section Structure
{rules_data.get('sections', 'Not specified')}

## Other Requirements
{rules_data.get('other_rules', 'Not specified')}

## Summary
{rules_data.get('summary', 'Not specified')}
"""
    return formatted_rules
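
# For example, a hypothetical partial payload such as
# format_rules_for_display({"margins": "1 inch"}) yields a markdown document
# whose "## Margins" section reads "1 inch" and whose remaining sections fall
# back to "Not specified".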


def get_rules_from_url(url: str) -> str:
    """
    Extracts formatting rules from a given URL using crawl4ai.
    """
    logger.info(f"Extracting rules from URL: {url}")

    # Ensure Playwright is installed (especially for Hugging Face)
    playwright_available = ensure_playwright_installed()

    # Apply nest_asyncio here, when the function is called
    nest_asyncio.apply()

    # Import crawl4ai modules here to avoid event loop issues at module level
    try:
        from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
        from crawl4ai.extraction_strategy import LLMExtractionStrategy
    except ImportError as e:
        logger.error(f"Failed to import crawl4ai: {e}")
        return f"Failed to import required modules for web crawling. Error: {str(e)}"

    if not playwright_available:
        logger.warning("Playwright installation failed, falling back to simple HTTP request and raw HTML processing")
        try:
            with httpx.Client() as client:
                response = client.get(url, follow_redirects=True)
                response.raise_for_status()
                raw_html = response.text
            # Use crawl4ai to process the raw HTML
            url = f"raw:{raw_html}"
        except Exception as e:
            return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}"

    async def _extract_rules_async(url: str) -> str:
        """
        Asynchronously extracts formatting rules from a given URL using crawl4ai.
        """
        # Configure the LLM extraction
        extraction_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=f"{settings.llm_provider}/{settings.llm_model_name}",
                api_token=settings.openrouter_api_key
            ),
            schema=FormattingRules.schema(),
            extraction_type="schema",
            instruction="""
            From the crawled content, extract all formatting rules for manuscript submissions.
            Focus on requirements for margins, font, line spacing, citations, section structure,
            and any other formatting guidelines. Provide a comprehensive extraction of all
            formatting-related information.
            If a specific requirement is not mentioned in the content, include "Not specified" in the corresponding field.
            """
        )

        # Configure the crawler
        run_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True,
            exclude_social_media_links=True,
            check_robots_txt=True,
            semaphore_count=3,
            extraction_strategy=extraction_strategy
        )

        # Initialize the crawler and run
        try:
            async with AsyncWebCrawler() as crawler:
                try:
                    result = await crawler.arun(
                        url=url,
                        config=run_config
                    )
                    logger.info(f"Crawler result for {url}: {result}")

                    # Handle robots.txt blocking
                    if not result.success and "robots.txt" in str(result.error_message):
                        logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
                        try:
                            with httpx.Client() as client:
                                response = client.get(url, follow_redirects=True)
                                response.raise_for_status()
                                raw_html = response.text
                            logger.info(f"Successfully downloaded HTML content for {url}.")

                            # Re-run crawl4ai with raw HTML
                            raw_html_url = f"raw:{raw_html}"
                            result = await crawler.arun(url=raw_html_url, config=run_config)
                            logger.info(f"Crawler result for raw HTML: {result}")
                        except httpx.HTTPStatusError as e:
                            logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
                            return "Failed to download the page content after being blocked by robots.txt."
                        except Exception as e:
                            logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
                            return "An error occurred during the fallback extraction process."
                except Exception as e:
                    logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
                    return "An error occurred while trying to extract formatting rules."
        except Exception as e:
            logger.error(f"Failed to initialize AsyncWebCrawler: {e}", exc_info=True)
            # Fall back to a simple HTTP request if crawler initialization fails
            try:
                with httpx.Client() as client:
                    response = client.get(url, follow_redirects=True)
                    response.raise_for_status()
                    return f"# Formatting Rules (Fallback Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Browser-based extraction failed, showing raw content. Please review manually.*"
            except Exception as fallback_e:
                logger.error(f"Fallback HTTP request also failed: {fallback_e}")
                return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"

        if result.success and result.extracted_content:
            # The extracted content is often a list containing a JSON string.
            raw_data = result.extracted_content
            if isinstance(raw_data, list) and len(raw_data) > 0:
                raw_data = raw_data[0]

            # Ensure we have a dictionary to work with
            if isinstance(raw_data, str):
                try:
                    rules_data = json.loads(raw_data)
                    # If the parsed data is a list, take the first element
                    if isinstance(rules_data, list) and len(rules_data) > 0:
                        rules_data = rules_data[0]
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
                    return "Failed to parse the extracted formatting rules."
            elif isinstance(raw_data, dict):
                rules_data = raw_data
            else:
                logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
                return "Could not process the extracted formatting rules."

            # Log the parsed data for debugging
            logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")

            # Format the rules for display
            formatted_rules = format_rules_for_display(rules_data)
            if not formatted_rules:
                return "Failed to format the extracted rules."
            logger.info(f"Formatted rules: {formatted_rules[:100]}...")
            return formatted_rules
        elif result.success and result.markdown:
            # Fall back to raw markdown if structured extraction returned nothing
            logger.info(f"Structured extraction failed, falling back to markdown for {url}")
            return result.markdown
        else:
            logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
            return "Could not extract formatting rules from the provided URL. The crawler did not return any content."

    # Run the async function using the patched event loop
    return asyncio.run(_extract_rules_async(url))
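

# A minimal smoke test, assuming config.settings carries valid LLM credentials;
# the URL below is a hypothetical placeholder, not one this module references.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    print(get_rules_from_url("https://example.com/submission-guidelines"))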