Commit 2850050
Parent(s): 9a5d887

Tool for scraping contacts

Files changed:
- Dockerfile +1 -1
- contacts.py +181 -0
Dockerfile CHANGED

@@ -53,4 +53,4 @@ RUN python -m playwright install chromium
 EXPOSE 7860
 
 # Run the FastAPI application
-CMD ["python", "-m", "uvicorn", "
+CMD ["python", "-m", "uvicorn", "contacts:app", "--host", "0.0.0.0", "--port", "7860"]
contacts.py CHANGED

@@ -0,0 +1,181 @@
from fastapi import FastAPI, Query, HTTPException
from pydantic import BaseModel
from playwright.async_api import async_playwright
from urllib.parse import urlparse
import base64
import logging
from typing import List, Optional, Set
import re

app = FastAPI(title="Lead Generation Web Scraper API")

# -------------------- Logging Setup --------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# -------------------- Pydantic Models --------------------
class LinkInfo(BaseModel):
    text: str
    href: str

class ContactInfo(BaseModel):
    emails: List[str] = []
    phones: List[str] = []
    social_media: List[str] = []
    contact_forms: List[str] = []

class BusinessInfo(BaseModel):
    company_name: Optional[str] = None
    address: Optional[str] = None
    description: Optional[str] = None
    industry_keywords: List[str] = []

class LeadData(BaseModel):
    contact_info: ContactInfo
    business_info: BusinessInfo
    lead_score: int = 0
    technologies: List[str] = []

class ScrapedPage(BaseModel):
    url: str
    page_title: Optional[str]
    meta_description: Optional[str]
    screenshot: Optional[str]
    links: Optional[List[LinkInfo]]
    lead_data: Optional[LeadData]

class CrawlResponse(BaseModel):
    pages_visited: int
    results: List[ScrapedPage]

# -------------------- Utility Functions --------------------
async def extract_page_data(page, url: str, take_screenshot: bool) -> ScrapedPage:
    await page.goto(url, timeout=30000)
    title = await page.title()

    meta_desc = await page.evaluate("""
        () => {
            const meta = document.querySelector('meta[name="description"]');
            return meta ? meta.getAttribute('content') : null;
        }
    """)

    screenshot = None
    if take_screenshot:
        shot = await page.screenshot(full_page=True)
        screenshot = base64.b64encode(shot).decode('utf-8')

    links = await page.evaluate("""
        () => Array.from(document.querySelectorAll('a[href]')).map(a => ({
            text: a.innerText.trim().substring(0, 200),
            href: a.href
        }))
    """)

    text = await page.evaluate("document.body.innerText")

    # Extract contact details from the visible page text (results are capped to keep responses small)
    emails = list(set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)))[:10]
    phones = list(set(re.findall(r"(?:\+?1?[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text)))[:5]

    social_links = list(set(
        l['href'] for l in links if re.search(r"facebook|linkedin|twitter|instagram|youtube", l['href'], re.I)
    ))[:10]

    contact_forms = list(set(await page.evaluate("""
        () => Array.from(document.querySelectorAll('form')).map(f => f.action || location.href)
    """)))[:5]

    company_name = await page.evaluate("""
        () => document.querySelector('meta[property="og:site_name"]')?.content ||
              document.querySelector('meta[name="application-name"]')?.content ||
              document.querySelector('h1')?.innerText ||
              document.title?.split('|')[0]?.split('-')[0]?.trim()
    """)

    address_matches = re.findall(r"\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)[^\n,]*", text)
    address = address_matches[0] if address_matches else None

    html = await page.content()
    tech_stack = [t for t in ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics'] if t in html.lower()]

    industry_keywords = [k for k in ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'education'] if k in text.lower()]

    # Heuristic lead score, capped at 100
    lead_score = min(100, sum([
        30 if emails else 0,
        25 if phones else 0,
        15 if social_links else 0,
        10 if company_name else 0,
        15 if address else 0,
        10 if tech_stack else 0,
        5 if industry_keywords else 0
    ]))

    return ScrapedPage(
        url=url,
        page_title=title,
        meta_description=meta_desc,
        screenshot=screenshot,
        links=[LinkInfo(**l) for l in links],
        lead_data=LeadData(
            contact_info=ContactInfo(
                emails=emails,
                phones=phones,
                social_media=social_links,
                contact_forms=contact_forms,
            ),
            business_info=BusinessInfo(
                company_name=company_name,
                address=address,
                description=meta_desc,
                industry_keywords=industry_keywords
            ),
            lead_score=lead_score,
            technologies=tech_stack
        )
    )

# -------------------- API Endpoint --------------------
@app.get("/crawl-leads", response_model=CrawlResponse)
async def crawl_leads(
    website: str = Query(..., description="Base website URL to crawl"),
    max_depth: int = Query(1, ge=1, le=3),
    screenshot: bool = Query(False, description="Take full page screenshots")
):
    visited: Set[str] = set()
    results: List[ScrapedPage] = []

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            # Breadth-first crawl starting from the base URL, limited to same-domain links
            queue = [(website, 0)]

            while queue:
                url, depth = queue.pop(0)
                if url in visited or depth > max_depth:
                    continue
                visited.add(url)

                try:
                    result = await extract_page_data(page, url, screenshot)
                    results.append(result)

                    if depth < max_depth:
                        links = [link.href for link in result.links if urlparse(link.href).netloc == urlparse(website).netloc]
                        for link in links:
                            if link not in visited:
                                queue.append((link, depth + 1))
                except Exception as e:
                    logger.warning(f"Failed to scrape {url}: {e}")

            await browser.close()

        return CrawlResponse(
            pages_visited=len(visited),
            results=results
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
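
A minimal client sketch for the new /crawl-leads endpoint. It assumes the image is running locally with port 7860 published (matching EXPOSE and the uvicorn CMD above); the base URL, the example.com target, and the use of the requests library are illustrative assumptions, not part of this commit.

import requests

# Query the crawler; the parameters mirror the Query(...) definitions in crawl_leads.
resp = requests.get(
    "http://localhost:7860/crawl-leads",    # assumed local deployment, port from the Dockerfile
    params={
        "website": "https://example.com",   # hypothetical target site
        "max_depth": 1,                     # the endpoint allows 1-3
        "screenshot": "false",              # "true" adds base64 full-page screenshots
    },
    timeout=120,                            # headless crawling can take a while
)
resp.raise_for_status()
data = resp.json()

print("pages visited:", data["pages_visited"])
for page in data["results"]:
    print(page["url"], "score:", page["lead_data"]["lead_score"])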