from fastapi import FastAPI
from pydantic import BaseModel
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import logging
import re
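
# Runtime dependencies (an assumption about the environment, not pinned by this file):
#   pip install fastapi uvicorn pydantic playwright
#   playwright install chromium   # downloads the browser binary launched below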

app = FastAPI()
logging.basicConfig(level=logging.INFO)

class RedirectRequest(BaseModel):
    url: str

@app.post("/resolve")
async def resolve_redirect(data: RedirectRequest):
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            # Step 1: Start navigation to the RSS link
            await page.goto(data.url, wait_until="domcontentloaded", timeout=15000)

            # Step 2: Wait for navigation to a non-Google domain
            try:
                await page.wait_for_url(re.compile(r"^(?!.*news\.google\.com).*"), timeout=10000)
            except PlaywrightTimeoutError:
                pass  # no hard redirect happened; fall back to the current URL

            final_url = page.url
            await browser.close()

            return {"final_url": final_url}

    except Exception as e:
        logging.error("Redirect resolution failed", exc_info=True)
        return {"error": str(e)}
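
# Example request against /resolve (a sketch, assuming the app is served locally on
# port 8000, e.g. `uvicorn main:app` if this file is saved as main.py):
#
#   curl -X POST http://localhost:8000/resolve \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://news.google.com/rss/articles/..."}'
#
# On success the response is {"final_url": "..."}; on failure {"error": "..."}.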


class ScrapeRequest(BaseModel):
    url: str

@app.post("/scrape")
async def scrape_page(data: ScrapeRequest):
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            await page.goto(data.url, wait_until="domcontentloaded", timeout=40000)

            # Extract visible text using JS walker for generalized coverage
            text = await page.evaluate("""
            () => {
                const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
                    acceptNode: node => {
                        // Text nodes without an element parent cannot be styled; skip them
                        // (getComputedStyle requires an Element and throws otherwise).
                        const parent = node.parentElement;
                        if (!parent) return NodeFilter.FILTER_REJECT;
                        const style = window.getComputedStyle(parent);
                        return style.display !== 'none' && style.visibility !== 'hidden' ? NodeFilter.FILTER_ACCEPT : NodeFilter.FILTER_REJECT;
                    }
                });
                let text = '';
                while (walker.nextNode()) {
                    text += walker.currentNode.textContent + '\\n';
                }
                return text.trim();
            }
            """)

            # Get links
            links = await page.eval_on_selector_all(
                "a[href]",
                """els => els.map(el => ({
                    text: el.innerText.trim(),
                    href: el.href
                }))"""
            )

            final_url = page.url  # capture before the browser is closed
            await browser.close()

            return {
                "final_url": final_url,
                "text": text if text else "No visible content found.",
                "links": links
            }

    except Exception as e:
        logging.error("Scraping failed", exc_info=True)
        return {"error": str(e)}
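

# Example request against /scrape (same assumptions as the /resolve example above):
#
#   curl -X POST http://localhost:8000/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com/some-article"}'
#
# Optional local entrypoint. This is a convenience sketch, not part of the original
# module; it assumes uvicorn is installed (pip install uvicorn).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)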