from fastapi import FastAPI
from pydantic import BaseModel
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import logging
import re
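
# Runtime dependencies (an assumption about the environment, not pinned by this file):
#   pip install fastapi uvicorn pydantic playwright
#   playwright install chromium   # downloads the browser binary launched below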

app = FastAPI()
logging.basicConfig(level=logging.INFO)

class RedirectRequest(BaseModel):
    url: str

@app.post("/resolve")
async def resolve_redirect(data: RedirectRequest):
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            # Step 1: Start navigation to the RSS link
            await page.goto(data.url, wait_until="domcontentloaded", timeout=15000)

            # Step 2: Wait for navigation to a non-Google domain
            try:
                await page.wait_for_url(re.compile(r"^(?!.*news\.google\.com).*"), timeout=10000)
            except PlaywrightTimeoutError:
                pass  # no hard redirect happened; fall back to the current URL

            final_url = page.url
            await browser.close()

            return {"final_url": final_url}

    except Exception as e:
        logging.error("Redirect resolution failed", exc_info=True)
        return {"error": str(e)}
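
# Example request against /resolve (a sketch, assuming the app is served locally on
# port 8000, e.g. `uvicorn main:app` if this file is saved as main.py):
#
#   curl -X POST http://localhost:8000/resolve \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://news.google.com/rss/articles/..."}'
#
# On success the response is {"final_url": "..."}; on failure {"error": "..."}.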


class ScrapeRequest(BaseModel):
    url: str

@app.post("/scrape")
async def scrape_page(data: ScrapeRequest):
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            await page.goto(data.url, wait_until="domcontentloaded", timeout=40000)

            # Extract visible text using JS walker for generalized coverage
            text = await page.evaluate("""
            () => {
                const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
                    acceptNode: node => {
                        // Text nodes without an element parent cannot be styled; skip them
                        // (getComputedStyle requires an Element and throws otherwise).
                        const parent = node.parentElement;
                        if (!parent) return NodeFilter.FILTER_REJECT;
                        const style = window.getComputedStyle(parent);
                        return style.display !== 'none' && style.visibility !== 'hidden' ? NodeFilter.FILTER_ACCEPT : NodeFilter.FILTER_REJECT;
                    }
                });
                let text = '';
                while (walker.nextNode()) {
                    text += walker.currentNode.textContent + '\\n';
                }
                return text.trim();
            }
            """)

            # Get links
            links = await page.eval_on_selector_all(
                "a[href]",
                """els => els.map(el => ({
                    text: el.innerText.trim(),
                    href: el.href
                }))"""
            )

            final_url = page.url  # capture before the browser is closed
            await browser.close()

            return {
                "final_url": final_url,
                "text": text if text else "No visible content found.",
                "links": links
            }

    except Exception as e:
        logging.error("Scraping failed", exc_info=True)
        return {"error": str(e)}
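

# Example request against /scrape (same assumptions as the /resolve example above):
#
#   curl -X POST http://localhost:8000/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com/some-article"}'
#
# Optional local entrypoint. This is a convenience sketch, not part of the original
# module; it assumes uvicorn is installed (pip install uvicorn).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)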