File size: 1,937 Bytes
d382ddb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from playwright.async_api import async_playwright
import asyncio
import base64
from typing import List, Optional

# Application instance; endpoint handlers in this module are registered on it
# through its route decorators (e.g. ``@app.post``).
app = FastAPI()

class ScrapeRequest(BaseModel):
    """Request body accepted by the ``/scrape`` endpoint.

    ``url`` is required. The three boolean flags all default to ``True`` and
    select which parts of the response the handler fills in.
    """

    # Address handed unchanged to the browser's navigation call.
    url: str
    # When true, the response carries a base64-encoded screenshot of the page.
    screenshot: bool = True
    # When true, the response carries the text/href of every <a> element.
    get_links: bool = True
    # When true, the response carries the value returned by ``page.content()``.
    get_content: bool = True

class LinkInfo(BaseModel):
    """One anchor (``<a>``) element collected from the scraped page."""

    # The anchor's ``innerText`` with surrounding whitespace trimmed.
    text: str
    # The anchor's ``href`` property as read in the page's JavaScript context.
    href: str

class ScrapeResponse(BaseModel):
    """Result returned by the ``/scrape`` endpoint.

    Every field defaults to ``None`` and is only populated when the matching
    flag on the request is true.
    """

    # Value returned by ``page.content()`` for the loaded page.
    content: Optional[str] = None
    # Screenshot bytes, base64-encoded and decoded to a UTF-8 string.
    screenshot: Optional[str] = None
    # One entry per <a> element found in the document.
    links: Optional[List[LinkInfo]] = None

@app.post("/scrape")
async def scrape_page(request: ScrapeRequest) -> ScrapeResponse:
    """Load ``request.url`` in Chromium and return the requested artifacts.

    A fresh Chromium instance is launched through Playwright for every call.
    After navigating (waiting for the ``networkidle`` state) the handler
    collects, depending on the request flags:

    * ``get_content`` -- the value of ``page.content()``;
    * ``screenshot`` -- a screenshot, base64-encoded as a UTF-8 string;
    * ``get_links`` -- the trimmed ``innerText`` and ``href`` of every ``<a>``.

    Args:
        request: Target URL plus the flags selecting what to collect.

    Returns:
        A ``ScrapeResponse``; parts that were not requested stay ``None``.

    Raises:
        HTTPException: Status 500, with the underlying error message as
            ``detail``, when opening the page, navigating, or extracting fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            # NOTE(review): the URL comes straight from the client and is not
            # restricted to http(s) or to public hosts, so this endpoint can be
            # pointed at internal services or local files. Confirm whether
            # callers rely on that before adding validation.
            page = await browser.new_page()
            await page.goto(request.url, wait_until="networkidle")
            response = ScrapeResponse()

            if request.get_content:
                response.content = await page.content()

            if request.screenshot:
                screenshot_bytes = await page.screenshot()
                response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')

            if request.get_links:
                # Runs inside the page; returns plain {text, href} objects.
                links = await page.evaluate("""
                    () => {
                        return Array.from(document.querySelectorAll('a')).map(a => {
                            return {
                                text: a.innerText.trim(),
                                href: a.href
                            }
                        });
                    }
                """)
                response.links = [LinkInfo(**link) for link in links]

            return response

        except Exception as e:
            # Keep the original error chained so server logs show the real cause.
            raise HTTPException(status_code=500, detail=str(e)) from e
        finally:
            # Single cleanup point: runs on success and on every failure path.
            await browser.close()