Spaces:

apexherbert200
/

playwright-scraper-clean

Paused

apexherbert200 commited on May 25

Commit

d382ddb

1 Parent(s): 8b5a84c

Fix Python syntax errors and add requirements.txt

- Fixed all syntax errors in scrape.py (imports, indentation, class definitions)
- Created requirements.txt with necessary dependencies
- Cleaned up Dockerfile to remove redundant installations
- Ready for Hugging Face deployment

Files changed (3) hide show

Dockerfile +18 -1
requirements.txt +4 -0
scrape.py +87 -0

Dockerfile CHANGED Viewed

@@ -1,4 +1,21 @@
-FROM herbert400/web-scraper
 EXPOSE 7860
 CMD ["python", "-m", "uvicorn", "scrape:app", "--host", "0.0.0.0", "--port", "7860"]

+FROM zenika/alpine-chrome:with-playwright
+WORKDIR /app
+# Install Python and pip
+RUN apk add --no-cache python3 py3-pip
+# Copy requirements first for better caching
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy your code
+COPY . .
+# FastAPI and Uvicorn are already installed via requirements.txt
+# Playwright browsers are already available in the base image
 EXPOSE 7860
+# Run the FastAPI application
 CMD ["python", "-m", "uvicorn", "scrape:app", "--host", "0.0.0.0", "--port", "7860"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+playwright==1.40.0
+pydantic==2.5.0

scrape.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from playwright.async_api import async_playwright
+import asyncio
+import base64
+from typing import List, Optional
+app = FastAPI()
+class ScrapeRequest(BaseModel):
+    url: str
+    screenshot: bool = True
+    get_links: bool = True
+    get_content: bool = True
+class LinkInfo(BaseModel):
+    text: str
+    href: str
+class ScrapeResponse(BaseModel):
+    content: Optional[str] = None
+    screenshot: Optional[str] = None
+    links: Optional[List[LinkInfo]] = None
+@app.post("/scrape")
+async def scrape_page(request: ScrapeRequest):
+    async with async_playwright() as p:
+        browser = await p.chromium.launch()
+        page = await browser.new_page()
+        try:
+            await page.goto(request.url, wait_until="networkidle")
+            response = ScrapeResponse()
+            # Get page content
+            if request.get_content:
+                response.content = await page.content()
+            # Get screenshot
+            if request.screenshot:
+                screenshot_bytes = await page.screenshot()
+                response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
+            # Get links
+            if request.get_links:
+                links = await page.evaluate("""
+                    () => {
+                        return Array.from(document.querySelectorAll('a')).map(a => {
+                            return {
+                                text: a.innerText.trim(),
+                                href: a.href
+                            }
+                        });
+                    }
+                """)
+                response.links = [LinkInfo(**link) for link in links]
+            await browser.close()
+            return response
+        except Exception as e:
+            await browser.close()
+            raise HTTPException(status_code=500, detail=str(e))