Spaces:

apexherbert200
/

playwright-scraper-clean

Running

apexherbert200 committed on May 25

Commit

05c92cc

1 Parent(s): 9b0dc4d

Convert FastAPI to use GET requests for easier testing

- Changed /scrape endpoint from POST to GET
- Added query parameters: url, screenshot, get_links, get_content
- Added root endpoint (/) with API information and examples
- Made get_content default to false to avoid large responses
- Now testable directly in browser with URL parameters
- Example: /scrape?url=https://example.com&screenshot=true

Files changed (1) hide show

scrape.py +24 -20

scrape.py CHANGED Viewed

@@ -1,17 +1,11 @@
-from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from playwright.async_api import async_playwright
 import asyncio
 import base64
 from typing import List, Optional
-app = FastAPI()
-class ScrapeRequest(BaseModel):
-    url: str
-    screenshot: bool = True
-    get_links: bool = True
-    get_content: bool = True
 class LinkInfo(BaseModel):
     text: str
@@ -22,27 +16,43 @@ class ScrapeResponse(BaseModel):
     screenshot: Optional[str] = None
     links: Optional[List[LinkInfo]] = None
-@app.post("/scrape")
-async def scrape_page(request: ScrapeRequest):
     async with async_playwright() as p:
         browser = await p.chromium.launch()
         page = await browser.new_page()
         try:
-            await page.goto(request.url, wait_until="networkidle")
             response = ScrapeResponse()
             # Get page content
-            if request.get_content:
                 response.content = await page.content()
             # Get screenshot
-            if request.screenshot:
                 screenshot_bytes = await page.screenshot()
                 response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
             # Get links
-            if request.get_links:
                 links = await page.evaluate("""
                     () => {
                         return Array.from(document.querySelectorAll('a')).map(a => {
@@ -75,12 +85,6 @@ async def scrape_page(request: ScrapeRequest):

+from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel
 from playwright.async_api import async_playwright
 import asyncio
 import base64
 from typing import List, Optional
+app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")
 class LinkInfo(BaseModel):
     text: str
     screenshot: Optional[str] = None
     links: Optional[List[LinkInfo]] = None
+@app.get("/")
+async def root():
+    return {
+        "message": "Playwright Web Scraper API",
+        "endpoints": {
+            "/scrape": "Scrape a webpage (GET request)",
+            "/docs": "API documentation"
+        },
+        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_content=false"
+    }
+@app.get("/scrape")
+async def scrape_page(
+    url: str = Query(..., description="URL to scrape"),
+    screenshot: bool = Query(True, description="Take a screenshot"),
+    get_links: bool = Query(True, description="Extract links"),
+    get_content: bool = Query(False, description="Get page content (can be large)")
+):
     async with async_playwright() as p:
         browser = await p.chromium.launch()
         page = await browser.new_page()
         try:
+            await page.goto(url, wait_until="networkidle")
             response = ScrapeResponse()
             # Get page content
+            if get_content:
                 response.content = await page.content()
             # Get screenshot
+            if screenshot:
                 screenshot_bytes = await page.screenshot()
                 response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
             # Get links
+            if get_links:
                 links = await page.evaluate("""
                     () => {
                         return Array.from(document.querySelectorAll('a')).map(a => {