|
from fastapi import FastAPI, HTTPException |
|
from pydantic import BaseModel |
|
from playwright.async_api import async_playwright |
|
import asyncio |
|
import base64 |
|
from typing import List, Optional |
|
|
|
# ASGI application object; route handlers in this module register on it.
app = FastAPI()
|
|
|
class ScrapeRequest(BaseModel):
    """Request body for the ``/scrape`` endpoint.

    Every flag defaults to ``True``, so a body containing only ``url``
    asks for the page content, a screenshot and the link list.
    """

    # Address the browser page is navigated to.
    url: str

    # Capture a screenshot of the page and return it base64-encoded.
    screenshot: bool = True

    # Collect the text and href of each anchor element on the page.
    get_links: bool = True

    # Return the page's HTML content.
    get_content: bool = True
|
|
|
class LinkInfo(BaseModel):
    """One anchor found on the scraped page."""

    # Anchor text with surrounding whitespace trimmed.
    text: str

    # Target of the anchor as reported by the browser.
    href: str
|
|
|
class ScrapeResponse(BaseModel):
    """Result of a scrape.

    Each field stays ``None`` unless the matching flag was set on the
    request.
    """

    # HTML content of the page.
    content: Optional[str] = None

    # Screenshot bytes, base64-encoded and decoded to a UTF-8 string.
    screenshot: Optional[str] = None

    # Anchors found on the page.
    links: Optional[List[LinkInfo]] = None
|
|
|
# JavaScript evaluated inside the page to collect every anchor as
# ``{text, href}``. The ``a`` selector also matches SVG ``<a>`` elements,
# which have no ``innerText`` and expose ``href`` as an SVGAnimatedString,
# so both values are normalised to plain strings before being returned.
_EXTRACT_LINKS_JS = """
() => Array.from(document.querySelectorAll('a')).map(a => {
    const text = typeof a.innerText === 'string' ? a.innerText : (a.textContent || '');
    const href = typeof a.href === 'string' ? a.href : ((a.href && a.href.baseVal) || '');
    return { text: text.trim(), href: href };
})
"""


@app.post("/scrape")
async def scrape_page(request: ScrapeRequest) -> ScrapeResponse:
    """Load ``request.url`` in headless Chromium and return the requested data.

    The page is opened with ``wait_until="networkidle"``. Depending on the
    request flags the response carries the page HTML, a base64-encoded
    screenshot and/or the list of anchors found on the page.

    Args:
        request: Target URL plus flags selecting which artefacts to collect.

    Returns:
        A ``ScrapeResponse`` whose fields are populated only for the
        artefacts that were requested.

    Raises:
        HTTPException: Status 500, with the underlying error text as
            ``detail``, for any failure after the browser has launched.
    """
    # NOTE(review): request.url is caller-supplied and handed to the browser
    # as-is. If this endpoint is reachable by untrusted clients, consider
    # restricting schemes/hosts (e.g. file:// or internal addresses).
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(request.url, wait_until="networkidle")
            response = ScrapeResponse()

            if request.get_content:
                response.content = await page.content()

            if request.screenshot:
                screenshot_bytes = await page.screenshot()
                response.screenshot = base64.b64encode(screenshot_bytes).decode("utf-8")

            if request.get_links:
                links = await page.evaluate(_EXTRACT_LINKS_JS)
                response.links = [LinkInfo(**link) for link in links]

            return response
        except Exception as e:
            # Endpoint boundary: report any scraping failure as a 500 and
            # keep the original exception chained for server-side tracebacks.
            raise HTTPException(status_code=500, detail=str(e)) from e
        finally:
            # Single cleanup point for success and failure alike, including
            # a failure while opening the page.
            await browser.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|