service-internal committed on
Commit
a8febb3
·
verified ·
1 Parent(s): 1465c07

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +19 -0
  2. app.py +91 -0
  3. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Work inside /app from the start so every later step uses the same directory.
WORKDIR /app

# Install Python dependencies first so this layer stays cached when only the
# application code changes. requirements.txt already lists playwright, so no
# separate `pip install playwright` is needed.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install Chromium together with the OS libraries it needs.
# `--with-deps` lets Playwright choose the package set for the base image's
# distro release, instead of a hand-maintained apt list (the previous list
# named libgconf-2-4, which is not packaged on newer Debian releases).
# The apt package index is removed afterwards to keep the layer small.
RUN playwright install --with-deps chromium \
    && rm -rf /var/lib/apt/lists/*

# Copy your app code
COPY . .

# Run app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
19
+
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from playwright.async_api import async_playwright
4
+ from bs4 import BeautifulSoup
5
+ import logging
6
+ import re
7
+
8
+ app = FastAPI()
9
+ logging.basicConfig(level=logging.INFO)
10
+
11
class RedirectRequest(BaseModel):
    """Request body for ``POST /resolve``."""

    # Address the headless browser is asked to open.
    url: str
13
+
14
@app.post("/resolve")
async def resolve_redirect(data: RedirectRequest):
    """Open ``data.url`` in headless Chromium and report the URL it ends up on.

    The page is loaded (15 s limit), then the handler waits up to 10 s for the
    page URL to stop containing ``news.google.com``. Whatever URL the page is
    on afterwards is returned, whether or not that wait succeeded.

    Args:
        data: Request body carrying the URL to open.

    Returns:
        ``{"final_url": <str>}`` on success, or ``{"error": <str>}`` when
        launching the browser or the initial navigation fails. Failures are
        logged with a traceback and are not raised to the caller.
    """
    # NOTE(review): data.url is passed to the browser unvalidated; if this
    # endpoint is reachable by untrusted clients, consider restricting schemes
    # and hosts.
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context()
                page = await context.new_page()

                # Step 1: Start navigation to the RSS link
                await page.goto(data.url, wait_until="domcontentloaded", timeout=15000)

                # Step 2: Wait for navigation to a non-Google domain
                try:
                    await page.wait_for_url(
                        re.compile(r"^(?!.*news\.google\.com).*"),
                        timeout=10000,
                    )
                except Exception:
                    # Best-effort: if no hard redirect happened in time, fall
                    # back to the current URL. `Exception` (rather than a bare
                    # `except`) lets task cancellation and interpreter
                    # shutdown propagate.
                    logging.info(
                        "No redirect away from news.google.com observed; "
                        "using current page URL",
                        exc_info=True,
                    )

                final_url = page.url
            finally:
                # Close the browser even when navigation raises.
                await browser.close()

        return {"final_url": final_url}

    except Exception as e:
        logging.error("Redirect resolution failed", exc_info=True)
        return {"error": str(e)}
39
+
40
+
41
class ScrapeRequest(BaseModel):
    """Request body for ``POST /scrape``."""

    # Address of the page to load and extract text and links from.
    url: str
43
+
44
@app.post("/scrape")
async def scrape_page(data: ScrapeRequest):
    """Load ``data.url`` in headless Chromium and return its text and links.

    The page is loaded (40 s limit, waiting for ``domcontentloaded``). Text is
    collected in the browser by walking the text nodes under ``<body>`` and
    skipping those that are not rendered. Links are collected from every
    ``a[href]`` element.

    Args:
        data: Request body carrying the URL to scrape.

    Returns:
        On success, a dict with ``final_url`` (the page URL after loading),
        ``text`` (newline-joined visible text, or the placeholder
        ``"No visible content found."`` when empty) and ``links`` (a list of
        ``{"text", "href"}`` dicts). On failure, ``{"error": <str>}``; the
        failure is logged with a traceback and is not raised to the caller.
    """
    # NOTE(review): data.url is passed to the browser unvalidated; if this
    # endpoint is reachable by untrusted clients, consider restricting schemes
    # and hosts.
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context()
                page = await context.new_page()

                await page.goto(data.url, wait_until="domcontentloaded", timeout=40000)

                # Extract visible text using JS walker for generalized coverage.
                # Elements are visited too, so a `display: none` element prunes
                # its whole subtree (FILTER_REJECT); checking only a text
                # node's direct parent would miss hidden ancestors. Visible
                # elements are descended into but not emitted (FILTER_SKIP).
                text = await page.evaluate("""
                    () => {
                        const body = document.body;
                        if (!body || window.getComputedStyle(body).display === 'none') {
                            return '';
                        }
                        const walker = document.createTreeWalker(
                            body,
                            NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
                            {
                                acceptNode: node => {
                                    if (node.nodeType === Node.ELEMENT_NODE) {
                                        return window.getComputedStyle(node).display === 'none'
                                            ? NodeFilter.FILTER_REJECT
                                            : NodeFilter.FILTER_SKIP;
                                    }
                                    const parent = node.parentElement;
                                    if (!parent) {
                                        return NodeFilter.FILTER_REJECT;
                                    }
                                    return window.getComputedStyle(parent).visibility !== 'hidden'
                                        ? NodeFilter.FILTER_ACCEPT
                                        : NodeFilter.FILTER_REJECT;
                                }
                            }
                        );
                        const parts = [];
                        while (walker.nextNode()) {
                            parts.push(walker.currentNode.textContent);
                        }
                        return parts.join('\\n').trim();
                    }
                """)

                # Get links
                links = await page.eval_on_selector_all(
                    "a[href]",
                    """els => els.map(el => ({
                        text: el.innerText.trim(),
                        href: el.href
                    }))"""
                )

                # Read the URL while the page is still open, as /resolve does.
                final_url = page.url
            finally:
                # Close the browser even when navigation or extraction raises.
                await browser.close()

        return {
            "final_url": final_url,
            "text": text if text else "No visible content found.",
            "links": links,
        }

    except Exception as e:
        logging.error("Scraping failed", exc_info=True)
        return {"error": str(e)}
91
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ playwright
4
+ beautifulsoup4