Stephen Zweibel committed · 95bad73 · Parent(s): 48f85f2

Update app for Hugging Face

Files changed:
- packages.txt +5 -0
- requirements.txt +1 -0
- rule_extractor.py +96 -35
- startup_formatreview.sh +6 -7
packages.txt CHANGED
@@ -14,3 +14,8 @@ libgbm1
 libpango-1.0-0
 libcairo2
 libasound2
+libglib2.0-0
+libgtk-3-0
+libgdk-pixbuf2.0-0
+libxss1
+libgconf-2-4
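The five added entries are Debian packages providing shared libraries that headless Chromium typically needs at launch (GLib, GTK 3, GDK-Pixbuf, XScreenSaver, GConf). A minimal sanity check, sketched here as a hypothetical snippet that is not part of the commit (the soname list is illustrative), resolves each library from Python:

    # Hypothetical check: confirm Chromium's shared-library deps are resolvable.
    from ctypes.util import find_library

    for name in ["glib-2.0", "gtk-3", "gdk_pixbuf-2.0", "Xss", "asound"]:
        print(name, "->", find_library(name) or "MISSING")

If a name prints MISSING, the matching apt package is absent from the image.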
requirements.txt CHANGED
@@ -7,6 +7,7 @@ nest-asyncio>=1.5.8
 
 # Web crawling and extraction
 crawl4ai>=0.6.0
+playwright>=1.40.0
 litellm>=1.0.0
 
 # Document processing
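Pinning playwright directly matters because crawl4ai drives a Playwright-managed browser, and the pip wheel alone does not ship browser binaries; those are fetched by `playwright install`, which the new ensure_playwright_installed function below takes care of. A hypothetical smoke test, not part of the commit, to confirm the browser actually launches:

    # Hypothetical smoke test: fail fast if Chromium is missing or unlaunchable.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto("https://example.com")
        print(page.title())
        browser.close()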
rule_extractor.py CHANGED
@@ -4,11 +4,43 @@ import nest_asyncio
 import os
 import json
 import httpx
+import subprocess
+import sys
 from config import settings
 from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
 
+def ensure_playwright_installed():
+    """Ensure Playwright browsers are installed, especially for Hugging Face deployment."""
+    try:
+        # Check if we're in a Hugging Face environment
+        if os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID') or os.getenv('SPACES_BUILDKIT_VERSION'):
+            logger.info("Detected Hugging Face environment, checking Playwright installation...")
+
+            # Try to install Playwright browsers
+            result = subprocess.run([
+                sys.executable, "-m", "playwright", "install", "chromium", "--with-deps"
+            ], capture_output=True, text=True, timeout=300)
+
+            if result.returncode == 0:
+                logger.info("Playwright browsers installed successfully")
+                return True
+            else:
+                logger.warning(f"Playwright installation failed: {result.stderr}")
+                return False
+
+        else:
+            logger.info("Not in Hugging Face environment, assuming Playwright is available")
+            return True
+
+    except subprocess.TimeoutExpired:
+        logger.error("Playwright installation timed out")
+        return False
+    except Exception as e:
+        logger.error(f"Error installing Playwright browsers: {e}")
+        return False
+
 class FormattingRules(BaseModel):
     """Schema for formatting rules extraction"""
     margins: str = Field(description="Margin requirements for the manuscript")
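One caveat with ensure_playwright_installed as written: every call on Hugging Face can spawn a subprocess with a 300-second timeout. A hypothetical refinement, not part of the commit, would memoize the check so repeated extractions pay that cost once per process:

    # Hypothetical: cache the install check across calls within the process.
    from functools import lru_cache

    @lru_cache(maxsize=1)
    def ensure_playwright_installed_once() -> bool:
        return ensure_playwright_installed()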
@@ -58,12 +90,28 @@ def get_rules_from_url(url: str) -> str:
     """
     logger.info(f"Extracting rules from URL: {url}")
 
+    # Ensure Playwright is installed (especially for Hugging Face)
+    playwright_available = ensure_playwright_installed()
+    if not playwright_available:
+        logger.warning("Playwright installation failed, falling back to simple HTTP request")
+        try:
+            with httpx.Client() as client:
+                response = client.get(url, follow_redirects=True)
+                response.raise_for_status()
+                return f"# Formatting Rules (Simple Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Advanced extraction failed, showing raw content. Please review manually.*"
+        except Exception as e:
+            return f"Failed to extract rules from {url}. Error: {str(e)}"
+
     # Apply nest_asyncio here, when the function is called
     nest_asyncio.apply()
 
     # Import crawl4ai modules here to avoid event loop issues at module level
-    from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
-    from crawl4ai.extraction_strategy import LLMExtractionStrategy
+    try:
+        from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
+        from crawl4ai.extraction_strategy import LLMExtractionStrategy
+    except ImportError as e:
+        logger.error(f"Failed to import crawl4ai: {e}")
+        return f"Failed to import required modules for web crawling. Error: {str(e)}"
 
     async def _extract_rules_async(url: str) -> str:
         """
@@ -103,40 +151,53 @@ def get_rules_from_url(url: str) -> str:
         )
 
         # Initialize the crawler and run
-        async with AsyncWebCrawler() as crawler:
-            try:
-                result = await crawler.arun(
-                    url=url,
-                    config=run_config
-                )
-                logger.info(f"Crawler result for {url}: {result}")
-
-                # Handle robots.txt blocking
-                if not result.success and "robots.txt" in str(result.error_message):
-                    logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
-                    try:
-                        with httpx.Client() as client:
-                            response = client.get(url, follow_redirects=True)
-                            response.raise_for_status()
-
-                        raw_html = response.text
-                        logger.info(f"Successfully downloaded HTML content for {url}.")
-
-                        # Re-run crawl4ai with raw HTML
-                        raw_html_url = f"raw:{raw_html}"
-                        result = await crawler.arun(url=raw_html_url, config=run_config)
-                        logger.info(f"Crawler result for raw HTML: {result}")
-
-                    except httpx.HTTPStatusError as e:
-                        logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
-                        return "Failed to download the page content after being blocked by robots.txt."
-                    except Exception as e:
-                        logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
-                        return "An error occurred during the fallback extraction process."
-
-            except Exception as e:
-                logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
-                return "An error occurred while trying to extract formatting rules."
+        try:
+            async with AsyncWebCrawler() as crawler:
+                try:
+                    result = await crawler.arun(
+                        url=url,
+                        config=run_config
+                    )
+                    logger.info(f"Crawler result for {url}: {result}")
+
+                    # Handle robots.txt blocking
+                    if not result.success and "robots.txt" in str(result.error_message):
+                        logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
+                        try:
+                            with httpx.Client() as client:
+                                response = client.get(url, follow_redirects=True)
+                                response.raise_for_status()
+
+                            raw_html = response.text
+                            logger.info(f"Successfully downloaded HTML content for {url}.")
+
+                            # Re-run crawl4ai with raw HTML
+                            raw_html_url = f"raw:{raw_html}"
+                            result = await crawler.arun(url=raw_html_url, config=run_config)
+                            logger.info(f"Crawler result for raw HTML: {result}")
+
+                        except httpx.HTTPStatusError as e:
+                            logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
+                            return "Failed to download the page content after being blocked by robots.txt."
+                        except Exception as e:
+                            logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
+                            return "An error occurred during the fallback extraction process."
+
+                except Exception as e:
+                    logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
+                    return "An error occurred while trying to extract formatting rules."
+
+        except Exception as e:
+            logger.error(f"Failed to initialize AsyncWebCrawler: {e}", exc_info=True)
+            # Fallback to simple HTTP request if crawler initialization fails
+            try:
+                with httpx.Client() as client:
+                    response = client.get(url, follow_redirects=True)
+                    response.raise_for_status()
+                    return f"# Formatting Rules (Fallback Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Browser-based extraction failed, showing raw content. Please review manually.*"
+            except Exception as fallback_e:
+                logger.error(f"Fallback HTTP request also failed: {fallback_e}")
+                return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
 
         if result.success and result.extracted_content:
            # Format the extracted data into a readable string
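Taken together, the retry chain is: Playwright install check, then browser-based crawl, then a raw-HTML re-crawl when robots.txt blocks the crawler, then a plain HTTP fallback if the crawler cannot start at all. A hypothetical spot check, not part of the commit (the URL is illustrative):

    # Hypothetical spot check of the full fallback chain.
    from rule_extractor import get_rules_from_url

    rules = get_rules_from_url("https://example.com/submission-guidelines")
    print(rules[:500])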
startup_formatreview.sh CHANGED
@@ -19,11 +19,10 @@ if ! command -v uv &> /dev/null; then
     exit 1
 fi
 
-# Create virtual environment if it doesn't exist
-if [ ! -d ".venv" ]; then
-    echo "Creating virtual environment..."
-    uv venv
-fi
+# Recreate virtual environment
+echo "Recreating virtual environment..."
+rm -rf .venv
+uv venv
 
 # Activate virtual environment
 echo "Activating virtual environment..."
@@ -31,7 +30,7 @@ source .venv/bin/activate
 
 # Install dependencies
 echo "Installing dependencies..."
-uv pip install -r requirements.txt
+uv pip install -r requirements.txt --no-cache
 
 # Kill any existing instances of the Streamlit app
 echo "Stopping any existing instances of the Streamlit app..."
@@ -51,7 +50,7 @@ sleep 1
 
 # Start the Streamlit app
 echo "Starting Streamlit app on port $STREAMLIT_PORT..."
-nohup streamlit run $STREAMLIT_APP_FILE --server.port $STREAMLIT_PORT --server.headless true > $STREAMLIT_LOG_FILE 2>&1 &
+nohup .venv/bin/python3 -m streamlit run $STREAMLIT_APP_FILE --server.port $STREAMLIT_PORT --server.headless true > $STREAMLIT_LOG_FILE 2>&1 &
 echo $! > $STREAMLIT_PID_FILE
 
 # Check if the Streamlit service started successfully
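Two deliberate-looking choices in this script: `--no-cache` makes uv bypass its package cache, presumably so the rebuilt environment cannot pick up a stale playwright build, and launching Streamlit as `.venv/bin/python3 -m streamlit` pins the app to the interpreter of the freshly recreated venv rather than whichever `streamlit` happens to be first on PATH.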
|