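"""Playwright Web Scraper: a small FastAPI service that loads a page in headless
Chromium and returns its body text, links, title/meta description, and an optional
full-page screenshot (base64-encoded)."""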
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from playwright.async_api import async_playwright
import asyncio
import base64
import logging
from typing import List, Optional
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")
class LinkInfo(BaseModel):
    text: str
    href: str
class ScrapeResponse(BaseModel):
    body_content: Optional[str] = None
    screenshot: Optional[str] = None
    links: Optional[List[LinkInfo]] = None
    page_title: Optional[str] = None
    meta_description: Optional[str] = None
@app.get("/")
async def root():
    return {
        "message": "Playwright Web Scraper API - Body, Links & Screenshots",
        "endpoints": {
            "/scrape": "Scrape webpage body content, links, and take screenshot",
            "/docs": "API documentation"
        },
        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_body=true",
        "features": [
            "Extract body tag content (clean text)",
            "Get all links with text and URLs",
            "Take full page screenshot",
            "Extract page title and meta description"
        ]
    }
@app.get("/scrape")
async def scrape_page(
    url: str = Query(..., description="URL to scrape"),
    screenshot: bool = Query(True, description="Take a full page screenshot"),
    get_links: bool = Query(True, description="Extract all links from the page"),
    get_body: bool = Query(True, description="Extract body tag content")
):
logger.info(f"Starting scrape for URL: {url}")
try:
async with async_playwright() as p:
logger.info("Launching browser...")
browser = await p.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--no-first-run',
'--no-zygote',
'--disable-gpu'
]
)
page = await browser.new_page()
try:
logger.info(f"Navigating to {url}...")
await page.goto(url, wait_until="networkidle")
response = ScrapeResponse()
# Always get page title and meta description
logger.info("Getting page metadata...")
response.page_title = await page.title()
meta_desc = await page.evaluate("""
() => {
const meta = document.querySelector('meta[name="description"]');
return meta ? meta.getAttribute('content') : null;
}
""")
response.meta_description = meta_desc
# Get body content (clean text)
if get_body:
logger.info("Extracting body content...")
body_content = await page.evaluate("""
() => {
const body = document.querySelector('body');
if (!body) return null;
// Remove script and style elements
const scripts = body.querySelectorAll('script, style, noscript');
scripts.forEach(el => el.remove());
// Get clean text content
return body.innerText.trim();
}
""")
response.body_content = body_content
# Get screenshot (full page)
if screenshot:
logger.info("Taking full page screenshot...")
screenshot_bytes = await page.screenshot(full_page=True)
response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
# Get links with better filtering
if get_links:
logger.info("Extracting links...")
links = await page.evaluate("""
() => {
return Array.from(document.querySelectorAll('a[href]')).map(a => {
const text = a.innerText.trim();
const href = a.href;
// Only include links with meaningful text and valid URLs
if (text && href && href.startsWith('http')) {
return {
text: text.substring(0, 200), // Limit text length
href: href
}
}
return null;
}).filter(link => link !== null);
}
""")
response.links = [LinkInfo(**link) for link in links]
await browser.close()
logger.info("Scraping completed successfully")
return response
except Exception as e:
logger.error(f"Error during scraping: {str(e)}")
await browser.close()
raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")
    except HTTPException:
        # Re-raise HTTP errors unchanged so scraping failures are not relabelled as launch errors
        raise
    except Exception as e:
        logger.error(f"Error launching browser: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Browser launch error: {str(e)}")