Spaces:

apexherbert200
/

playwright-scraper-clean

Running

App Files Files Community

apexherbert200 committed on Jun 15

Commit

be7cc52

1 Parent(s): dd2c937

Using google search

Browse files

Files changed (2) hide show

Dockerfile +1 -1
test1.py +35 -250

Dockerfile CHANGED Viewed

@@ -53,4 +53,4 @@ RUN python -m playwright install chromium
 EXPOSE 7860
 # Run the FastAPI application
-CMD ["python", "-m", "uvicorn", "webrify:app", "--host", "0.0.0.0", "--port", "7860"]

 EXPOSE 7860
 # Run the FastAPI application
+CMD ["python", "-m", "uvicorn", "test1:app", "--host", "0.0.0.0", "--port", "7860"]

test1.py CHANGED Viewed

@@ -1,257 +1,42 @@
-from fastapi import FastAPI, HTTPException, Query
-from pydantic import BaseModel
 from playwright.async_api import async_playwright
-import asyncio
-import base64
-import logging
-from typing import List, Optional
-from urllib.parse import urlparse
-app = FastAPI(
-    title="Website Quality & Compliance Analyzer",
-    description="API that analyzes websites for SEO, accessibility, compliance and technical quality",
-    version="1.0.0"
-)
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-class SEOResult(BaseModel):
-    title: Optional[str] = None
-    meta_description: Optional[str] = None
-    h1_tags: List[str] = []
-    canonical_url: Optional[str] = None
-    robots_txt_present: bool = False
-    sitemap_present: bool = False
-class AccessibilityResult(BaseModel):
-    missing_alt_tags: int = 0
-    images_without_alt: List[str] = []
-    aria_roles: List[str] = []
-    contrast_issues: List[str] = []
-class ComplianceResult(BaseModel):
-    has_cookie_banner: bool = False
-    gdpr_compliant: Optional[bool] = None
-    has_privacy_policy: bool = False
-    has_terms_of_service: bool = False
-class TechnicalResult(BaseModel):
-    tech_stack: List[str] = []
-    viewport_meta: Optional[str] = None
-    doctype: Optional[str] = None
-    is_https: bool = False
-    has_analytics: bool = False
-class BrokenLink(BaseModel):
-    url: str
-    status: Optional[int] = None
-    text: Optional[str] = None
-class AnalysisRequest(BaseModel):
-    url: str
-    screenshot: bool = False
-    mobile_test: bool = False
-    check_broken_links: bool = False
-    depth: int = 1  # How many levels deep to check broken links
-class AnalysisResponse(BaseModel):
-    url: str
-    seo: SEOResult
-    accessibility: AccessibilityResult
-    compliance: ComplianceResult
-    technical: TechnicalResult
-    broken_links: List[BrokenLink] = []
-    mobile_friendly: Optional[bool] = None
-    screenshot_base64: Optional[str] = None
-    load_time: Optional[float] = None
-    success: bool
-    error: Optional[str] = None
-async def analyze_page(page, url: str, options: AnalysisRequest):
-    result = {
-        "url": url,
-        "seo": {},
-        "accessibility": {},
-        "compliance": {},
-        "technical": {},
-        "broken_links": [],
-        "success": True
-    }
-    # Basic SEO checks
-    title = await page.title()
-    meta_description = await page.evaluate('''() => {
-        const meta = document.querySelector('meta[name="description"]');
-        return meta ? meta.content : null;
-    }''')
-    h1_tags = await page.evaluate('''() => {
-        return Array.from(document.querySelectorAll('h1')).map(h => h.textContent.trim());
-    }''')
-    result["seo"] = {
-        "title": title,
-        "meta_description": meta_description,
-        "h1_tags": h1_tags
-    }
-    # Accessibility checks
-    images_without_alt = await page.evaluate('''() => {
-        return Array.from(document.querySelectorAll('img:not([alt])'))
-            .map(img => img.src);
-    }''')
-    result["accessibility"] = {
-        "missing_alt_tags": len(images_without_alt),
-        "images_without_alt": images_without_alt
-    }
-    # Compliance checks
-    has_cookie_banner = await page.evaluate('''() => {
-        const keywords = ['cookie', 'gdpr', 'privacy', 'consent'];
-        const elements = document.querySelectorAll('*');
-        for (let el of elements) {
-            const text = el.textContent.toLowerCase();
-            if (keywords.some(kw => text.includes(kw))) {
-                return true;
-            }
-        }
-        return false;
-    }''')
-    result["compliance"] = {
-        "has_cookie_banner": has_cookie_banner
-    }
-    # Technical checks
-    tech_stack = []
-    # Check for common JS libraries
-    libraries = await page.evaluate('''() => {
-        const libs = [];
-        if (window.jQuery) libs.push('jQuery');
-        if (window.React) libs.push('React');
-        if (window.Vue) libs.push('Vue');
-        if (window.angular) libs.push('Angular');
-        return libs;
-    }''')
-    tech_stack.extend(libraries)
-    # Check for analytics
-    has_analytics = await page.evaluate('''() => {
-        return !!document.querySelector('script[src*="google-analytics.com"], script[src*="googletagmanager.com"]');
-    }''')
-    is_https = url.startswith('https://')
-    result["technical"] = {
-        "tech_stack": tech_stack,
-        "is_https": is_https,
-        "has_analytics": has_analytics
-    }
-    # Broken links check (if requested)
-    if options.check_broken_links and options.depth > 0:
-        links = await page.evaluate('''() => {
-            return Array.from(document.querySelectorAll('a[href]')).map(a => ({
-                href: a.href,
-                text: a.textContent.trim()
-            }));
-        }''')
-        # Filter out external links and non-http links
-        domain = urlparse(url).netloc
-        internal_links = [
-            link for link in links
-            if link['href'].startswith('http') and domain in link['href']
-        ][:10]  # Limit to 10 links for demo purposes
-        broken_links = []
-        for link in internal_links:
-            try:
-                response = await page.goto(link['href'], wait_until="domcontentloaded")
-                status = response.status if response else None
-                if status and status >= 400:
-                    broken_links.append({
-                        "url": link['href'],
-                        "status": status,
-                        "text": link['text']
-                    })
-            except Exception as e:
-                broken_links.append({
-                    "url": link['href'],
-                    "status": None,
-                    "text": link['text']
-                })
-        result["broken_links"] = broken_links
-    return result
-@app.post("/analyze", response_model=AnalysisResponse)
-async def analyze_website(request: AnalysisRequest):
-    """Analyze a website for quality and compliance metrics"""
-    async with async_playwright() as p:
         try:
-            browser = await p.chromium.launch()
-            context = await browser.new_context()
-            page = await context.new_page()
-            # Start timing
-            start_time = asyncio.get_event_loop().time()
-            # Navigate to the page
-            response = await page.goto(request.url, wait_until="domcontentloaded")
-            if not response or response.status >= 400:
-                raise HTTPException(status_code=400, detail=f"Failed to load page. Status: {response.status if response else 'unknown'}")
-            # Mobile test if requested
-            if request.mobile_test:
-                mobile_viewport = {'width': 375, 'height': 667}
-                await page.set_viewport_size(mobile_viewport)
-                result = await analyze_page(page, request.url, request)
-                result["mobile_friendly"] = True  # Basic check - would need more sophisticated testing
-            else:
-                result = await analyze_page(page, request.url, request)
-            # Screenshot if requested
-            if request.screenshot:
-                screenshot = await page.screenshot(full_page=True)
-                result["screenshot_base64"] = base64.b64encode(screenshot).decode('utf-8')
-            # Calculate load time
-            end_time = asyncio.get_event_loop().time()
-            result["load_time"] = end_time - start_time
-            await browser.close()
-            return result
-        except Exception as e:
-            logger.error(f"Error analyzing website: {str(e)}")
-            await browser.close()
-            raise HTTPException(status_code=500, detail=str(e))
-@app.get("/analyze", response_model=AnalysisResponse)
-async def analyze_website_get(
-    url: str = Query(..., description="URL to analyze"),
-    screenshot: bool = Query(False, description="Include screenshot"),
-    mobile_test: bool = Query(False, description="Test mobile responsiveness"),
-    check_broken_links: bool = Query(False, description="Check for broken links"),
-    depth: int = Query(1, description="Depth for broken links check")
-):
-    """GET endpoint for website analysis"""
-    request = AnalysisRequest(
-        url=url,
-        screenshot=screenshot,
-        mobile_test=mobile_test,
-        check_broken_links=check_broken_links,
-        depth=depth
-    )
-    return await analyze_website(request)
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+from fastapi import FastAPI
 from playwright.async_api import async_playwright
+# ASGI application object; the Dockerfile CMD serves it with uvicorn as "test1:app".
+app = FastAPI()
+async def scrape_google(query: str) -> list:
+    """Scrape Google search results for *query* with headless Chromium.
+
+    Args:
+        query: Search terms. A literal ``+`` is passed through unchanged so
+            callers may pre-join words with ``+``; every other reserved
+            character (``&``, ``#``, ``%``, spaces, ...) is percent-encoded
+            so it cannot break out of the ``q`` parameter.
+
+    Returns:
+        A list of ``{"title": ..., "link": ...}`` dicts, one for each
+        ``<h3>`` element on the results page that sits inside an ``<a>``.
+
+    Raises:
+        playwright.async_api.Error: If navigation fails or no ``<h3>``
+            appears before Playwright's selector timeout expires.
+    """
+    # Function-scope imports keep this block self-contained.
+    from urllib.parse import quote
+
+    from playwright.async_api import Error as PlaywrightError
+
+    url = (
+        "https://www.google.com/search"
+        f"?q={quote(query, safe='+')}"
+        # NOTE(review): this looks like a session token copied from a
+        # browser; kept as-is to preserve behavior -- confirm it is needed.
+        "&sxsrf=AE3TifOZcTbH54cOkE27wqRqSVEmaqb7fw%3A1750003707838"
+    )
+    async with async_playwright() as pw:
+        browser = await pw.chromium.launch(headless=True)
+        try:
+            context = await browser.new_context()
+            page = await context.new_page()
+            await page.goto(url, wait_until="domcontentloaded")
+            # Dismiss the cookie/consent dialog if one is shown. This has to
+            # run after navigation: on the initial blank page the button can
+            # never exist, so the wait would only burn its full timeout.
+            try:
+                btn = await page.wait_for_selector(
+                    'button:has-text("I agree")', timeout=5000
+                )
+                await btn.click()
+            except PlaywrightError:
+                # Best effort: no dialog (timeout) is the normal case.
+                pass
+            await page.wait_for_selector("h3")
+            results = []
+            for h in await page.query_selector_all("h3"):
+                try:
+                    # Return null rather than throwing when the heading is
+                    # not wrapped in a link.
+                    link = await h.evaluate(
+                        "(e) => { const a = e.closest('a');"
+                        " return a ? a.href : null; }"
+                    )
+                    if link is None:
+                        continue
+                    title = await h.inner_text()
+                except PlaywrightError:
+                    # Element detached or page changed mid-iteration; skip it.
+                    continue
+                results.append({"title": title, "link": link})
+            return results
+        finally:
+            # Always release the browser, including when a wait times out.
+            await browser.close()
+@app.get("/search")
+async def search(query: str):
+    """Handle GET /search: scrape Google for *query* and return the hits.
+
+    The response echoes the original query next to the scraped results.
+    """
+    # Spaces become "+" before the terms are placed in the search URL.
+    plus_joined = query.replace(" ", "+")
+    hits = await scrape_google(plus_joined)
+    return {"query": query, "results": hits}