apexherbert200 committed on
Commit
05c92cc
·
1 Parent(s): 9b0dc4d

Convert FastAPI to use GET requests for easier testing

Browse files

- Changed /scrape endpoint from POST to GET
- Added query parameters: url, screenshot, get_links, get_content
- Added root endpoint (/) with API information and examples
- Made get_content default to false to avoid large responses
- Now testable directly in browser with URL parameters
- Example: /scrape?url=https://example.com&screenshot=true

Files changed (1) hide show
  1. scrape.py +24 -20
scrape.py CHANGED
@@ -1,17 +1,11 @@
1
- from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
  from playwright.async_api import async_playwright
4
  import asyncio
5
  import base64
6
  from typing import List, Optional
7
 
8
- app = FastAPI()
9
-
10
- class ScrapeRequest(BaseModel):
11
- url: str
12
- screenshot: bool = True
13
- get_links: bool = True
14
- get_content: bool = True
15
 
16
  class LinkInfo(BaseModel):
17
  text: str
@@ -22,27 +16,43 @@ class ScrapeResponse(BaseModel):
22
  screenshot: Optional[str] = None
23
  links: Optional[List[LinkInfo]] = None
24
 
25
- @app.post("/scrape")
26
- async def scrape_page(request: ScrapeRequest):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  async with async_playwright() as p:
28
  browser = await p.chromium.launch()
29
  page = await browser.new_page()
30
 
31
  try:
32
- await page.goto(request.url, wait_until="networkidle")
33
  response = ScrapeResponse()
34
 
35
  # Get page content
36
- if request.get_content:
37
  response.content = await page.content()
38
 
39
  # Get screenshot
40
- if request.screenshot:
41
  screenshot_bytes = await page.screenshot()
42
  response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
43
 
44
  # Get links
45
- if request.get_links:
46
  links = await page.evaluate("""
47
  () => {
48
  return Array.from(document.querySelectorAll('a')).map(a => {
@@ -75,12 +85,6 @@ async def scrape_page(request: ScrapeRequest):
75
 
76
 
77
 
78
-
79
-
80
-
81
-
82
-
83
-
84
 
85
 
86
 
 
1
+ from fastapi import FastAPI, HTTPException, Query
2
  from pydantic import BaseModel
3
  from playwright.async_api import async_playwright
4
  import asyncio
5
  import base64
6
  from typing import List, Optional
7
 
8
+ app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")
 
 
 
 
 
 
9
 
10
  class LinkInfo(BaseModel):
11
  text: str
 
16
  screenshot: Optional[str] = None
17
  links: Optional[List[LinkInfo]] = None
18
 
19
+ @app.get("/")
20
+ async def root():
21
+ return {
22
+ "message": "Playwright Web Scraper API",
23
+ "endpoints": {
24
+ "/scrape": "Scrape a webpage (GET request)",
25
+ "/docs": "API documentation"
26
+ },
27
+ "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_content=false"
28
+ }
29
+
30
+ @app.get("/scrape")
31
+ async def scrape_page(
32
+ url: str = Query(..., description="URL to scrape"),
33
+ screenshot: bool = Query(True, description="Take a screenshot"),
34
+ get_links: bool = Query(True, description="Extract links"),
35
+ get_content: bool = Query(False, description="Get page content (can be large)")
36
+ ):
37
  async with async_playwright() as p:
38
  browser = await p.chromium.launch()
39
  page = await browser.new_page()
40
 
41
  try:
42
+ await page.goto(url, wait_until="networkidle")
43
  response = ScrapeResponse()
44
 
45
  # Get page content
46
+ if get_content:
47
  response.content = await page.content()
48
 
49
  # Get screenshot
50
+ if screenshot:
51
  screenshot_bytes = await page.screenshot()
52
  response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
53
 
54
  # Get links
55
+ if get_links:
56
  links = await page.evaluate("""
57
  () => {
58
  return Array.from(document.querySelectorAll('a')).map(a => {
 
85
 
86
 
87
 
 
 
 
 
 
 
88
 
89
 
90