import asyncio
import json
import logging
import os
import subprocess
import sys

import httpx
import nest_asyncio
from pydantic import BaseModel, Field

from config import settings

logger = logging.getLogger(__name__)

def ensure_playwright_installed():
    """Ensure Playwright browsers are installed, especially for Hugging Face deployment."""
    try:
        # Check if we're in a Hugging Face environment
        if os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID') or os.getenv('SPACES_BUILDKIT_VERSION'):
            logger.info("Detected Hugging Face environment, checking Playwright installation...")
            
            # Try to install Playwright browsers
            result = subprocess.run([
                sys.executable, "-m", "playwright", "install", "chromium"
            ], capture_output=True, text=True, timeout=300)
            
            if result.returncode == 0:
                logger.info("Playwright browsers installed successfully")
                return True
            else:
                logger.warning(f"Playwright installation failed: {result.stderr}")
                return False
                
        else:
            logger.info("Not in Hugging Face environment, assuming Playwright is available")
            return True
            
    except subprocess.TimeoutExpired:
        logger.error("Playwright installation timed out")
        return False
    except Exception as e:
        logger.error(f"Error installing Playwright browsers: {e}")
        return False

class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""
    margins: str = Field(description="Margin requirements for the manuscript")
    font: str = Field(description="Font requirements including size, type, etc.")
    line_spacing: str = Field(description="Line spacing requirements")
    citations: str = Field(description="Citation style and formatting requirements")
    sections: str = Field(description="Required sections and their structure")
    other_rules: str = Field(description="Any other formatting requirements")
    summary: str = Field(description="A brief summary of the key formatting requirements")
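
# Illustrative example of the shape the LLM is asked to fill in (the values
# below are made up, not taken from any real journal's guidelines):
#
#   FormattingRules(
#       margins="1 inch on all sides",
#       font="12 pt Times New Roman",
#       line_spacing="Double-spaced throughout",
#       citations="APA 7th edition",
#       sections="Title page, abstract, main text, references",
#       other_rules="Number all pages in the top-right corner",
#       summary="Standard APA-style manuscript formatting",
#   )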

def format_rules_for_display(rules_data):
    """
    Format the extracted rules data into a readable markdown string.
    """
    if not rules_data:
        return "Could not extract formatting rules from the provided URL."
    
    formatted_rules = f"""
# Manuscript Formatting Guidelines

## Margins
{rules_data.get('margins', 'Not specified')}

## Font
{rules_data.get('font', 'Not specified')}

## Line Spacing
{rules_data.get('line_spacing', 'Not specified')}

## Citations
{rules_data.get('citations', 'Not specified')}

## Section Structure
{rules_data.get('sections', 'Not specified')}

## Other Requirements
{rules_data.get('other_rules', 'Not specified')}

## Summary
{rules_data.get('summary', 'Not specified')}
"""
    return formatted_rules
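
# Example (a sketch with made-up values): given
#
#   {"margins": "1 inch on all sides", "font": "12 pt Times New Roman"}
#
# this returns markdown beginning with:
#
#   # Manuscript Formatting Guidelines
#
#   ## Margins
#   1 inch on all sides
#
# and every key missing from the dict falls back to "Not specified".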

def get_rules_from_url(url: str) -> str:
    """
    Extracts formatting rules from a given URL using crawl4ai.
    """
    logger.info(f"Extracting rules from URL: {url}")
    
    # Ensure Playwright is installed (especially for Hugging Face)
    playwright_available = ensure_playwright_installed()
    
    # Apply nest_asyncio here, when the function is called
    nest_asyncio.apply()
    
    # Import crawl4ai modules here to avoid event loop issues at module level
    try:
        from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
        from crawl4ai.extraction_strategy import LLMExtractionStrategy
    except ImportError as e:
        logger.error(f"Failed to import crawl4ai: {e}")
        return f"Failed to import required modules for web crawling. Error: {str(e)}"

    if not playwright_available:
        logger.warning("Playwright installation failed, falling back to simple HTTP request and raw HTML processing")
        try:
            with httpx.Client() as client:
                response = client.get(url, follow_redirects=True)
                response.raise_for_status()
                raw_html = response.text
                # Hand the raw HTML to crawl4ai via its raw: URL scheme so
                # extraction can run without a browser-based fetch
                url = f"raw:{raw_html}"
        except Exception as e:
            return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}"

    async def _extract_rules_async(url: str) -> str:
        """
        Asynchronously extracts formatting rules from a given URL using crawl4ai.
        """
        
        # Configure the LLM extraction
        extraction_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=f"{settings.llm_provider}/{settings.llm_model_name}",
                api_token=settings.openrouter_api_key
            ),
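            # NOTE: `.schema()` below is the Pydantic v1 API; it still works on
            # Pydantic v2 but emits a deprecation warning. On v2, prefer
            # FormattingRules.model_json_schema().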
            schema=FormattingRules.schema(),
            extraction_type="schema",
            instruction="""
            From the crawled content, extract all formatting rules for manuscript submissions.
            Focus on requirements for margins, font, line spacing, citations, section structure,
            and any other formatting guidelines. Provide a comprehensive extraction of all
            formatting-related information.
            
            If a specific requirement is not mentioned in the content, include "Not specified" in the corresponding field.
            """
        )
        
        # Configure the crawler
        run_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True,
            exclude_social_media_links=True,
            check_robots_txt=True,
            semaphore_count=3,
            extraction_strategy=extraction_strategy
        )

        # Initialize the crawler and run
        try:
            async with AsyncWebCrawler() as crawler:
                try:
                    result = await crawler.arun(
                        url=url,
                        config=run_config
                    )
                    logger.info(f"Crawler result for {url}: {result}")

                    # Handle robots.txt blocking
                    if not result.success and "robots.txt" in str(result.error_message):
                        logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
                        try:
                            with httpx.Client() as client:
                                response = client.get(url, follow_redirects=True)
                                response.raise_for_status()
                            
                            raw_html = response.text
                            logger.info(f"Successfully downloaded HTML content for {url}.")
                            
                            # Re-run crawl4ai with raw HTML
                            raw_html_url = f"raw:{raw_html}"
                            result = await crawler.arun(url=raw_html_url, config=run_config)
                            logger.info(f"Crawler result for raw HTML: {result}")

                        except httpx.HTTPStatusError as e:
                            logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
                            return "Failed to download the page content after being blocked by robots.txt."
                        except Exception as e:
                            logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
                            return "An error occurred during the fallback extraction process."

                except Exception as e:
                    logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
                    return "An error occurred while trying to extract formatting rules."
        
        except Exception as e:
            logger.error(f"Failed to initialize AsyncWebCrawler: {e}", exc_info=True)
            # Fallback to simple HTTP request if crawler initialization fails
            try:
                with httpx.Client() as client:
                    response = client.get(url, follow_redirects=True)
                    response.raise_for_status()
                    return f"# Formatting Rules (Fallback Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Browser-based extraction failed, showing raw content. Please review manually.*"
            except Exception as fallback_e:
                logger.error(f"Fallback HTTP request also failed: {fallback_e}")
                return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Errors: {e}; {fallback_e}"
        
        if result.success and result.extracted_content:
            # The extracted content is often a list containing a JSON string.
            raw_data = result.extracted_content
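            # Illustrative shapes this value may take (payloads are made up);
            # the branches below normalize all of them to a single dict:
            #   ['{"margins": "1 inch", ...}']  - list wrapping a JSON string
            #   '{"margins": "1 inch", ...}'    - bare JSON string
            #   {"margins": "1 inch", ...}      - already a dict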
            if isinstance(raw_data, list) and len(raw_data) > 0:
                raw_data = raw_data[0]

            # Ensure we have a dictionary to work with
            if isinstance(raw_data, str):
                try:
                    rules_data = json.loads(raw_data)
                    # If the parsed data is a list, take the first element
                    if isinstance(rules_data, list) and len(rules_data) > 0:
                        rules_data = rules_data[0]
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
                    return "Failed to parse the extracted formatting rules."
            elif isinstance(raw_data, dict):
                rules_data = raw_data
            else:
                logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
                return "Could not process the extracted formatting rules."

            # Log the parsed data for debugging
            logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
            
            # Format the rules for display
            formatted_rules = format_rules_for_display(rules_data)
            if not formatted_rules:
                return "Failed to format the extracted rules."
            
            logger.info(f"Formatted rules: {formatted_rules[:100]}...")
            return formatted_rules
        elif result.success and result.markdown:
            # Fallback to markdown if structured extraction fails
            logger.info(f"Structured extraction returned no content; falling back to raw markdown for {url}")
            return result.markdown
        else:
            logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
            return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
    
    # Run the async function using the patched event loop
    return asyncio.run(_extract_rules_async(url))
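

if __name__ == "__main__":
    # Minimal manual smoke test (a sketch, not part of the module's interface);
    # the URL below is a placeholder, not a page this module is known to support.
    logging.basicConfig(level=logging.INFO)
    print(get_rules_from_url("https://example.com/submission-guidelines"))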