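"""FastAPI service that scrapes web pages using Playwright (headless Chromium).

Exposes a /scrape endpoint that returns the page's cleaned body text, its links,
title and meta description, and an optional base64-encoded full-page screenshot.
"""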
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from playwright.async_api import async_playwright
import base64
import logging
from typing import List, Optional

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")

class LinkInfo(BaseModel):
    text: str
    href: str

class ScrapeResponse(BaseModel):
    body_content: Optional[str] = None
    screenshot: Optional[str] = None
    links: Optional[List[LinkInfo]] = None
    page_title: Optional[str] = None
    meta_description: Optional[str] = None

@app.get("/")
async def root():
    return {
        "message": "Playwright Web Scraper API - Body, Links & Images",
        "endpoints": {
            "/scrape": "Scrape webpage body content, links, and take screenshot",
            "/docs": "API documentation"
        },
        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_body=true",
        "features": [
            "Extract body tag content (clean text)",
            "Get all links with text and URLs",
            "Take full page screenshot",
            "Extract page title and meta description"
        ]
    }

@app.get("/scrape")
async def scrape_page(
    url: str = Query(..., description="URL to scrape"),
    screenshot: bool = Query(True, description="Take a full page screenshot"),
    get_links: bool = Query(True, description="Extract all links from the page"),
    get_body: bool = Query(True, description="Extract body tag content")
):
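    """Scrape a single page with headless Chromium.

    Optionally returns the cleaned body text, all absolute http(s) links,
    and a base64-encoded full-page screenshot, along with the page title
    and meta description.
    """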
    logger.info(f"Starting scrape for URL: {url}")
    try:
        async with async_playwright() as p:
            logger.info("Launching browser...")
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--no-first-run',
                    '--no-zygote',
                    '--disable-gpu'
                ]
            )
            page = await browser.new_page()

            try:
                logger.info(f"Navigating to {url}...")
                await page.goto(url, wait_until="networkidle")
                response = ScrapeResponse()

                # Always get page title and meta description
                logger.info("Getting page metadata...")
                response.page_title = await page.title()

                meta_desc = await page.evaluate("""
                    () => {
                        const meta = document.querySelector('meta[name="description"]');
                        return meta ? meta.getAttribute('content') : null;
                    }
                """)
                response.meta_description = meta_desc

                # Take the full page screenshot before extracting body text:
                # the body extraction below removes style elements from the
                # live DOM and would otherwise break the page's rendering
                # in the capture
                if screenshot:
                    logger.info("Taking full page screenshot...")
                    screenshot_bytes = await page.screenshot(full_page=True)
                    response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')

                # Get body content (clean text)
                if get_body:
                    logger.info("Extracting body content...")
                    body_content = await page.evaluate("""
                        () => {
                            const body = document.querySelector('body');
                            if (!body) return null;

                            // Remove script and style elements
                            const scripts = body.querySelectorAll('script, style, noscript');
                            scripts.forEach(el => el.remove());

                            // Get clean text content
                            return body.innerText.trim();
                        }
                    """)
                    response.body_content = body_content

                # Get links with better filtering
                if get_links:
                    logger.info("Extracting links...")
                    links = await page.evaluate("""
                        () => {
                            return Array.from(document.querySelectorAll('a[href]')).map(a => {
                                const text = a.innerText.trim();
                                const href = a.href;

                                // Only include links with meaningful text and valid URLs
                                if (text && href && href.startsWith('http')) {
                                    return {
                                        text: text.substring(0, 200), // Limit text length
                                        href: href
                                    }
                                }
                                return null;
                            }).filter(link => link !== null);
                        }
                    """)
                    response.links = [LinkInfo(**link) for link in links]

                await browser.close()
                logger.info("Scraping completed successfully")
                return response

            except Exception as e:
                logger.error(f"Error during scraping: {str(e)}")
                await browser.close()
                raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")

    except HTTPException:
        # Propagate scraping errors raised above without re-wrapping them
        # as browser launch errors
        raise
    except Exception as e:
        logger.error(f"Error launching browser: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Browser launch error: {str(e)}")