Spaces:

service-internal
/

web-scraper-new

Sleeping

App Files Files Community

service-internal committed 28 days ago

Commit

a8febb3

verified ·

1 Parent(s): 1465c07

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +19 -0
app.py +91 -0
requirements.txt +4 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,19 @@

+FROM python:3.10-slim
+# System libraries required by headless Chromium. The apt package index is
+# deleted in the same layer so it does not stay in the final image.
+RUN apt-get update && apt-get install -y wget curl gnupg unzip libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1 libxss1 libasound2 libxtst6 libx11-xcb1 libxcomposite1 libxcursor1 libxdamage1 libxrandr2 libxext6 libxfixes3 libx11-6 libxcb1 libxinerama1 libpango-1.0-0 libcairo2 libatk-bridge2.0-0 libgtk-3-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies before copying the app code so this layer is
+# reused when only the code changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Make sure the Playwright package is present, then download its Chromium build
+RUN pip install --no-cache-dir playwright && playwright install chromium
+# Copy the application code
+COPY . /app
+WORKDIR /app
+# Serve the FastAPI object `app` from app.py on port 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+from playwright.async_api import async_playwright
+from bs4 import BeautifulSoup
+import logging
+import re
+# ASGI application object; the Dockerfile's uvicorn command refers to it as "app:app".
+app = FastAPI()
+# Configure the root logger to emit INFO-level messages and above.
+logging.basicConfig(level=logging.INFO)
+# Request body for POST /resolve.
+class RedirectRequest(BaseModel):
+    url: str  # address the browser navigates to before the final URL is read
+@app.post("/resolve")
+async def resolve_redirect(data: RedirectRequest):
+    try:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context()
+            page = await context.new_page()
+            # Step 1: Start navigation to the RSS link
+            await page.goto(data.url, wait_until="domcontentloaded", timeout=15000)
+            # Step 2: Wait for navigation to a non-Google domain
+            try:
+                await page.wait_for_url(re.compile(r"^(?!.*news\.google\.com).*"), timeout=10000)
+            except:
+                pass  # fallback if no hard redirect happened
+            final_url = page.url
+            await browser.close()
+            return {"final_url": final_url}
+    except Exception as e:
+        logging.error("Redirect resolution failed", exc_info=True)
+        return {"error": str(e)}
+# Request body for POST /scrape.
+class ScrapeRequest(BaseModel):
+    url: str  # address of the page whose text and links are extracted
+@app.post("/scrape")
+async def scrape_page(data: ScrapeRequest):
+    try:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context()
+            page = await context.new_page()
+            await page.goto(data.url, wait_until="domcontentloaded", timeout=40000)
+            # Extract visible text using JS walker for generalized coverage
+            text = await page.evaluate("""
+            () => {
+                const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
+                    acceptNode: node => {
+                        const style = window.getComputedStyle(node.parentElement || {});
+                        return style && style.display !== 'none' && style.visibility !== 'hidden' ? NodeFilter.FILTER_ACCEPT : NodeFilter.FILTER_REJECT;
+                    }
+                });
+                let text = '';
+                while (walker.nextNode()) {
+                    text += walker.currentNode.textContent + '\\n';
+                }
+                return text.trim();
+            }
+            """)
+            # Get links
+            links = await page.eval_on_selector_all(
+                "a[href]",
+                """els => els.map(el => ({
+                    text: el.innerText.trim(),
+                    href: el.href
+                }))"""
+            )
+            await browser.close()
+            return {
+                "final_url": page.url,
+                "text": text if text else "No visible content found.",
+                "links": links
+            }
+    except Exception as e:
+        logging.error("Scraping failed", exc_info=True)
+        return {"error": str(e)}

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+# NOTE(review): versions are unpinned; consider pinning for reproducible builds.
+# Web framework used by app.py
+fastapi
+# ASGI server started by the Dockerfile CMD
+uvicorn
+# Headless-browser automation (its Chromium build is downloaded in the Dockerfile)
+playwright
+# Provides the bs4 module imported by app.py
+beautifulsoup4