from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from playwright.async_api import async_playwright
import base64
import logging
from typing import List, Optional

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")

class LinkInfo(BaseModel):
    text: str
    href: str

class ScrapeResponse(BaseModel):
    content: Optional[str] = None
    screenshot: Optional[str] = None
    links: Optional[List[LinkInfo]] = None

@app.get("/")
async def root():
    return {
        "message": "Playwright Web Scraper API",
        "endpoints": {
            "/scrape": "Scrape a webpage (GET request)",
            "/docs": "API documentation"
        },
        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_content=false"
    }

@app.get("/scrape")
async def scrape_page(
    url: str = Query(..., description="URL to scrape"),
    screenshot: bool = Query(True, description="Take a screenshot"),
    get_links: bool = Query(True, description="Extract links"),
    get_content: bool = Query(False, description="Get page content (can be large)")
):
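    """Scrape a page with headless Chromium and return any combination of its
    rendered HTML, a base64-encoded screenshot, and the links it contains."""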
    logger.info(f"Starting scrape for URL: {url}")
    try:
        async with async_playwright() as p:
            logger.info("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--no-first-run',
                    '--no-zygote',
                    '--disable-gpu'
                ]
            )
            page = await browser.new_page()

            try:
                logger.info(f"Navigating to {url}...")
                await page.goto(url, wait_until="networkidle")
                response = ScrapeResponse()

                # Get page content
                if get_content:
                    logger.info("Getting page content...")
                    response.content = await page.content()

                # Get screenshot
                if screenshot:
                    logger.info("Taking screenshot...")
                    screenshot_bytes = await page.screenshot()
                    response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')

                # Get links
                if get_links:
                    logger.info("Extracting links...")
                    links = await page.evaluate("""
                        () => {
                            return Array.from(document.querySelectorAll('a')).map(a => {
                                return {
                                    text: a.innerText.trim(),
                                    href: a.href
                                }
                            });
                        }
                    """)
                    response.links = [LinkInfo(**link) for link in links]

                await browser.close()
                logger.info("Scraping completed successfully")
                return response

            except Exception as e:
                logger.error(f"Error during scraping: {str(e)}")
                await browser.close()
                raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")

    except HTTPException:
        # Re-raise errors already converted above so they are not mislabelled
        # as browser launch failures by the handler below
        raise
    except Exception as e:
        logger.error(f"Error launching browser: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Browser launch error: {str(e)}")