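"""FastAPI service that resolves redirects and scrapes visible page content
with headless Chromium via Playwright.

Exposes two POST endpoints (summary added here for orientation; it describes
the handlers defined below):
  /resolve -- follow a Google News RSS link until it leaves news.google.com
  /scrape  -- load a URL and return its visible text plus all anchor links
"""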
from fastapi import FastAPI
from pydantic import BaseModel
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup  # imported but not used by the handlers below
import logging
import re

app = FastAPI()
logging.basicConfig(level=logging.INFO)
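
# Note: Playwright needs its browser binaries available at runtime. In the
# deployment image this typically means running `playwright install chromium`
# (and `playwright install-deps` on slim Linux images) as a build step; the
# exact setup depends on your environment and is not shown in this file.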

class RedirectRequest(BaseModel):
    url: str

@app.post("/resolve")
async def resolve_redirect(data: RedirectRequest):
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            # Step 1: Start navigation to the RSS link
            await page.goto(data.url, wait_until="domcontentloaded", timeout=15000)

            # Step 2: Wait for navigation to land on a non-Google domain
            try:
                await page.wait_for_url(re.compile(r"^(?!.*news\.google\.com).*"), timeout=10000)
            except Exception:
                pass  # fall back to the current URL if no hard redirect happened

            final_url = page.url
            await browser.close()
            return {"final_url": final_url}
    except Exception as e:
        logging.error("Redirect resolution failed", exc_info=True)
        return {"error": str(e)}

class ScrapeRequest(BaseModel):
    url: str

@app.post("/scrape")
async def scrape_page(data: ScrapeRequest):
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto(data.url, wait_until="domcontentloaded", timeout=40000)

            # Extract visible text using a JS TreeWalker for generalized coverage
            text = await page.evaluate("""
                () => {
                    const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
                        acceptNode: node => {
                            const parent = node.parentElement;
                            if (!parent) return NodeFilter.FILTER_REJECT;
                            const style = window.getComputedStyle(parent);
                            return style.display !== 'none' && style.visibility !== 'hidden'
                                ? NodeFilter.FILTER_ACCEPT
                                : NodeFilter.FILTER_REJECT;
                        }
                    });
                    let text = '';
                    while (walker.nextNode()) {
                        text += walker.currentNode.textContent + '\\n';
                    }
                    return text.trim();
                }
            """)

            # Collect all anchor links with their visible text
            links = await page.eval_on_selector_all(
                "a[href]",
                """els => els.map(el => ({
                    text: el.innerText.trim(),
                    href: el.href
                }))"""
            )

            # Capture the final URL before closing the browser
            final_url = page.url
            await browser.close()

            return {
                "final_url": final_url,
                "text": text if text else "No visible content found.",
                "links": links
            }
    except Exception as e:
        logging.error("Scraping failed", exc_info=True)
        return {"error": str(e)}