Stephen Zweibel committed on
Commit
95bad73
·
1 Parent(s): 48f85f2

Update app for Hugging Face

Browse files
Files changed (4) hide show
  1. packages.txt +5 -0
  2. requirements.txt +1 -0
  3. rule_extractor.py +96 -35
  4. startup_formatreview.sh +6 -7
packages.txt CHANGED
@@ -14,3 +14,8 @@ libgbm1
14
  libpango-1.0-0
15
  libcairo2
16
  libasound2
 
 
 
 
 
 
14
  libpango-1.0-0
15
  libcairo2
16
  libasound2
17
+ libglib2.0-0
18
+ libgtk-3-0
19
+ libgdk-pixbuf2.0-0
20
+ libxss1
21
+ libgconf-2-4
requirements.txt CHANGED
@@ -7,6 +7,7 @@ nest-asyncio>=1.5.8
7
 
8
  # Web crawling and extraction
9
  crawl4ai>=0.6.0
 
10
  litellm>=1.0.0
11
 
12
  # Document processing
 
7
 
8
  # Web crawling and extraction
9
  crawl4ai>=0.6.0
10
+ playwright>=1.40.0
11
  litellm>=1.0.0
12
 
13
  # Document processing
rule_extractor.py CHANGED
@@ -4,11 +4,43 @@ import nest_asyncio
4
  import os
5
  import json
6
  import httpx
 
 
7
  from config import settings
8
  from pydantic import BaseModel, Field
9
 
10
  logger = logging.getLogger(__name__)
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  class FormattingRules(BaseModel):
13
  """Schema for formatting rules extraction"""
14
  margins: str = Field(description="Margin requirements for the manuscript")
@@ -58,12 +90,28 @@ def get_rules_from_url(url: str) -> str:
58
  """
59
  logger.info(f"Extracting rules from URL: {url}")
60
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  # Apply nest_asyncio here, when the function is called
62
  nest_asyncio.apply()
63
 
64
  # Import crawl4ai modules here to avoid event loop issues at module level
65
- from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
66
- from crawl4ai.extraction_strategy import LLMExtractionStrategy
 
 
 
 
67
 
68
  async def _extract_rules_async(url: str) -> str:
69
  """
@@ -103,40 +151,53 @@ def get_rules_from_url(url: str) -> str:
103
  )
104
 
105
  # Initialize the crawler and run
106
- async with AsyncWebCrawler() as crawler:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  try:
108
- result = await crawler.arun(
109
- url=url,
110
- config=run_config
111
- )
112
- logger.info(f"Crawler result for {url}: {result}")
113
-
114
- # Handle robots.txt blocking
115
- if not result.success and "robots.txt" in str(result.error_message):
116
- logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
117
- try:
118
- with httpx.Client() as client:
119
- response = client.get(url, follow_redirects=True)
120
- response.raise_for_status()
121
-
122
- raw_html = response.text
123
- logger.info(f"Successfully downloaded HTML content for {url}.")
124
-
125
- # Re-run crawl4ai with raw HTML
126
- raw_html_url = f"raw:{raw_html}"
127
- result = await crawler.arun(url=raw_html_url, config=run_config)
128
- logger.info(f"Crawler result for raw HTML: {result}")
129
-
130
- except httpx.HTTPStatusError as e:
131
- logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
132
- return "Failed to download the page content after being blocked by robots.txt."
133
- except Exception as e:
134
- logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
135
- return "An error occurred during the fallback extraction process."
136
-
137
- except Exception as e:
138
- logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
139
- return "An error occurred while trying to extract formatting rules."
140
 
141
  if result.success and result.extracted_content:
142
  # Format the extracted data into a readable string
 
4
  import os
5
  import json
6
  import httpx
7
+ import subprocess
8
+ import sys
9
  from config import settings
10
  from pydantic import BaseModel, Field
11
 
12
  logger = logging.getLogger(__name__)
13
 
14
# Environment variables whose presence indicates a Hugging Face Spaces runtime.
_HF_ENV_VARS = ("SPACE_ID", "HF_SPACE_ID", "SPACES_BUILDKIT_VERSION")


def ensure_playwright_installed():
    """Ensure Playwright's Chromium browser is installed.

    On Hugging Face Spaces (detected via the SPACE_ID / HF_SPACE_ID /
    SPACES_BUILDKIT_VERSION environment variables) this runs
    ``python -m playwright install chromium --with-deps`` as a subprocess;
    outside of Spaces, Playwright is assumed to already be available.

    The outcome is cached on the function object: this helper is called once
    per extracted URL, and without the cache every call would re-run the
    installer (up to the 300-second timeout). The result cannot change within
    a single process, so caching is safe.

    Returns:
        bool: True if Playwright is believed to be usable, False if the
        installation failed or timed out.
    """
    cached = getattr(ensure_playwright_installed, "_result", None)
    if cached is not None:
        return cached

    log = logging.getLogger(__name__)
    result = True
    try:
        # Truthy check matches the original `or` chain: unset and empty-string
        # environment variables both count as "not in a Space".
        if any(os.getenv(var) for var in _HF_ENV_VARS):
            log.info("Detected Hugging Face environment, checking Playwright installation...")
            proc = subprocess.run(
                [sys.executable, "-m", "playwright", "install", "chromium", "--with-deps"],
                capture_output=True,
                text=True,
                timeout=300,  # browser download can be slow; bound it
            )
            if proc.returncode == 0:
                log.info("Playwright browsers installed successfully")
            else:
                log.warning(f"Playwright installation failed: {proc.stderr}")
                result = False
        else:
            log.info("Not in Hugging Face environment, assuming Playwright is available")
    except subprocess.TimeoutExpired:
        log.error("Playwright installation timed out")
        result = False
    except Exception as e:
        # Defensive: setup problems should degrade to the HTTP fallback in the
        # caller, never crash the request.
        log.error(f"Error installing Playwright browsers: {e}")
        result = False

    ensure_playwright_installed._result = result
    return result
43
+
44
  class FormattingRules(BaseModel):
45
  """Schema for formatting rules extraction"""
46
  margins: str = Field(description="Margin requirements for the manuscript")
 
90
  """
91
  logger.info(f"Extracting rules from URL: {url}")
92
 
93
+ # Ensure Playwright is installed (especially for Hugging Face)
94
+ playwright_available = ensure_playwright_installed()
95
+ if not playwright_available:
96
+ logger.warning("Playwright installation failed, falling back to simple HTTP request")
97
+ try:
98
+ with httpx.Client() as client:
99
+ response = client.get(url, follow_redirects=True)
100
+ response.raise_for_status()
101
+ return f"# Formatting Rules (Simple Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Advanced extraction failed, showing raw content. Please review manually.*"
102
+ except Exception as e:
103
+ return f"Failed to extract rules from {url}. Error: {str(e)}"
104
+
105
  # Apply nest_asyncio here, when the function is called
106
  nest_asyncio.apply()
107
 
108
  # Import crawl4ai modules here to avoid event loop issues at module level
109
+ try:
110
+ from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, LLMConfig
111
+ from crawl4ai.extraction_strategy import LLMExtractionStrategy
112
+ except ImportError as e:
113
+ logger.error(f"Failed to import crawl4ai: {e}")
114
+ return f"Failed to import required modules for web crawling. Error: {str(e)}"
115
 
116
  async def _extract_rules_async(url: str) -> str:
117
  """
 
151
  )
152
 
153
  # Initialize the crawler and run
154
+ try:
155
+ async with AsyncWebCrawler() as crawler:
156
+ try:
157
+ result = await crawler.arun(
158
+ url=url,
159
+ config=run_config
160
+ )
161
+ logger.info(f"Crawler result for {url}: {result}")
162
+
163
+ # Handle robots.txt blocking
164
+ if not result.success and "robots.txt" in str(result.error_message):
165
+ logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
166
+ try:
167
+ with httpx.Client() as client:
168
+ response = client.get(url, follow_redirects=True)
169
+ response.raise_for_status()
170
+
171
+ raw_html = response.text
172
+ logger.info(f"Successfully downloaded HTML content for {url}.")
173
+
174
+ # Re-run crawl4ai with raw HTML
175
+ raw_html_url = f"raw:{raw_html}"
176
+ result = await crawler.arun(url=raw_html_url, config=run_config)
177
+ logger.info(f"Crawler result for raw HTML: {result}")
178
+
179
+ except httpx.HTTPStatusError as e:
180
+ logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
181
+ return "Failed to download the page content after being blocked by robots.txt."
182
+ except Exception as e:
183
+ logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
184
+ return "An error occurred during the fallback extraction process."
185
+
186
+ except Exception as e:
187
+ logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
188
+ return "An error occurred while trying to extract formatting rules."
189
+
190
+ except Exception as e:
191
+ logger.error(f"Failed to initialize AsyncWebCrawler: {e}", exc_info=True)
192
+ # Fallback to simple HTTP request if crawler initialization fails
193
  try:
194
+ with httpx.Client() as client:
195
+ response = client.get(url, follow_redirects=True)
196
+ response.raise_for_status()
197
+ return f"# Formatting Rules (Fallback Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Browser-based extraction failed, showing raw content. Please review manually.*"
198
+ except Exception as fallback_e:
199
+ logger.error(f"Fallback HTTP request also failed: {fallback_e}")
200
+ return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  if result.success and result.extracted_content:
203
  # Format the extracted data into a readable string
startup_formatreview.sh CHANGED
@@ -19,11 +19,10 @@ if ! command -v uv &> /dev/null; then
19
  exit 1
20
  fi
21
 
22
- # Create virtual environment if it doesn't exist
23
- if [ ! -d ".venv" ]; then
24
- echo "Creating virtual environment..."
25
- uv venv
26
- fi
27
 
28
  # Activate virtual environment
29
  echo "Activating virtual environment..."
@@ -31,7 +30,7 @@ source .venv/bin/activate
31
 
32
  # Install dependencies
33
  echo "Installing dependencies..."
34
- uv pip install -r requirements.txt
35
 
36
  # Kill any existing instances of the Streamlit app
37
  echo "Stopping any existing instances of the Streamlit app..."
@@ -51,7 +50,7 @@ sleep 1
51
 
52
  # Start the Streamlit app
53
  echo "Starting Streamlit app on port $STREAMLIT_PORT..."
54
- nohup streamlit run $STREAMLIT_APP_FILE --server.port $STREAMLIT_PORT --server.headless true > $STREAMLIT_LOG_FILE 2>&1 &
55
  echo $! > $STREAMLIT_PID_FILE
56
 
57
  # Check if the Streamlit service started successfully
 
19
  exit 1
20
  fi
21
 
22
+ # Recreate virtual environment
23
+ echo "Recreating virtual environment..."
24
+ rm -rf .venv
25
+ uv venv
 
26
 
27
  # Activate virtual environment
28
  echo "Activating virtual environment..."
 
30
 
31
  # Install dependencies
32
  echo "Installing dependencies..."
33
+ uv pip install -r requirements.txt --no-cache
34
 
35
  # Kill any existing instances of the Streamlit app
36
  echo "Stopping any existing instances of the Streamlit app..."
 
50
 
51
  # Start the Streamlit app
52
  echo "Starting Streamlit app on port $STREAMLIT_PORT..."
53
+ nohup .venv/bin/python3 -m streamlit run $STREAMLIT_APP_FILE --server.port $STREAMLIT_PORT --server.headless true > $STREAMLIT_LOG_FILE 2>&1 &
54
  echo $! > $STREAMLIT_PID_FILE
55
 
56
  # Check if the Streamlit service started successfully