apexherbert200 committed
Commit 02554df · Parent(s): 05c92cc

Fix Playwright browser installation for appuser


- Install browsers as appuser instead of root
- Add proper browser launch args for containerized environment
- Add comprehensive logging for debugging
- Should resolve the 'Executable doesn't exist' error (see the diagnostic sketch below)
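For background: Playwright raises "Executable doesn't exist" when the browser binaries are missing from the cache of the user running the code (on Linux the default location is ~/.cache/ms-playwright, so a root-time install lands under /root/.cache, invisible to appuser). A minimal diagnostic sketch, not part of this commit, that prints the Chromium path Playwright resolves for the current user:

# diagnose_browsers.py - hypothetical helper, not part of this commit.
# Prints where Playwright expects the Chromium binary for the current user;
# if that path does not exist, launch() fails with "Executable doesn't exist".
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    print(p.chromium.executable_path)

Running this as appuser inside the container quickly confirms whether the install step put the browser where the runtime will look.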

Files changed (2)
  1. Dockerfile +5 -4
  2. scrape.py +66 -37
Dockerfile CHANGED
@@ -35,15 +35,16 @@ RUN apt-get update && apt-get install -y \
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Install Playwright browsers
-RUN python -m playwright install chromium
+# Create a non-root user for security
+RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
 
 # Copy your code
 COPY . .
+RUN chown -R appuser:appuser /app
 
-# Create a non-root user for security
-RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
+# Switch to appuser and install Playwright browsers
 USER appuser
+RUN python -m playwright install chromium
 
 EXPOSE 7860
 
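The reordering works because python -m playwright install downloads browsers into the invoking user's cache; running it after USER appuser places Chromium under /home/appuser/.cache/ms-playwright, exactly where appuser's launch() will look. An alternative sketch, not what this commit does: install as root into a fixed shared directory via Playwright's documented PLAYWRIGHT_BROWSERS_PATH variable (the /ms-playwright path below is an arbitrary choice):

# Alternative approach (sketch only): root-time install into a shared path.
# PLAYWRIGHT_BROWSERS_PATH must be set for both the build and the runtime.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN python -m playwright install --with-deps chromium \
    && chmod -R a+rX /ms-playwright
USER appuser

The --with-deps flag also installs the browser's system libraries, which can replace part of the apt-get list at the top of this Dockerfile.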
scrape.py CHANGED
@@ -3,8 +3,13 @@ from pydantic import BaseModel
 from playwright.async_api import async_playwright
 import asyncio
 import base64
+import logging
 from typing import List, Optional
 
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")
 
 class LinkInfo(BaseModel):
@@ -34,43 +39,67 @@ async def scrape_page(
     get_links: bool = Query(True, description="Extract links"),
     get_content: bool = Query(False, description="Get page content (can be large)")
 ):
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-        page = await browser.new_page()
-
-        try:
-            await page.goto(url, wait_until="networkidle")
-            response = ScrapeResponse()
-
-            # Get page content
-            if get_content:
-                response.content = await page.content()
-
-            # Get screenshot
-            if screenshot:
-                screenshot_bytes = await page.screenshot()
-                response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
-
-            # Get links
-            if get_links:
-                links = await page.evaluate("""
-                    () => {
-                        return Array.from(document.querySelectorAll('a')).map(a => {
-                            return {
-                                text: a.innerText.trim(),
-                                href: a.href
-                            }
-                        });
-                    }
-                """)
-                response.links = [LinkInfo(**link) for link in links]
-
-            await browser.close()
-            return response
-
-        except Exception as e:
-            await browser.close()
-            raise HTTPException(status_code=500, detail=str(e))
+    logger.info(f"Starting scrape for URL: {url}")
+    try:
+        async with async_playwright() as p:
+            logger.info("Launching browser...")
+            browser = await p.chromium.launch(
+                headless=True,
+                args=[
+                    '--no-sandbox',
+                    '--disable-setuid-sandbox',
+                    '--disable-dev-shm-usage',
+                    '--disable-accelerated-2d-canvas',
+                    '--no-first-run',
+                    '--no-zygote',
+                    '--disable-gpu'
+                ]
+            )
+            page = await browser.new_page()
+
+            try:
+                logger.info(f"Navigating to {url}...")
+                await page.goto(url, wait_until="networkidle")
+                response = ScrapeResponse()
+
+                # Get page content
+                if get_content:
+                    logger.info("Getting page content...")
+                    response.content = await page.content()
+
+                # Get screenshot
+                if screenshot:
+                    logger.info("Taking screenshot...")
+                    screenshot_bytes = await page.screenshot()
+                    response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
+
+                # Get links
+                if get_links:
+                    logger.info("Extracting links...")
+                    links = await page.evaluate("""
+                        () => {
+                            return Array.from(document.querySelectorAll('a')).map(a => {
+                                return {
+                                    text: a.innerText.trim(),
+                                    href: a.href
+                                }
+                            });
+                        }
+                    """)
+                    response.links = [LinkInfo(**link) for link in links]
+
+                await browser.close()
+                logger.info("Scraping completed successfully")
+                return response
+
+            except Exception as e:
+                logger.error(f"Error during scraping: {str(e)}")
+                await browser.close()
+                raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")
+
+    except Exception as e:
+        logger.error(f"Error launching browser: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Browser launch error: {str(e)}")
 
 
 
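With the image rebuilt, a quick smoke test of the endpoint (a sketch under assumptions: the route decorator sits above this diff hunk, so the GET /scrape path is inferred rather than shown, and port 7860 comes from the Dockerfile's EXPOSE):

# smoke_test.py - hypothetical client sketch; assumes GET /scrape on port 7860.
import requests

resp = requests.get(
    "http://localhost:7860/scrape",
    params={"url": "https://example.com", "screenshot": "true", "get_links": "true"},
    timeout=120,  # first navigation after a cold start can be slow
)
resp.raise_for_status()
data = resp.json()
print(f"links: {len(data.get('links') or [])}, screenshot: {bool(data.get('screenshot'))}")

One caveat in the new error handling: HTTPException subclasses Exception, so the HTTPException raised in the inner handler is re-caught by the outer except and re-reported with the "Browser launch error" prefix; callers will see that detail even for navigation failures.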