Spaces:

apexherbert200
/

playwright-scraper-clean

Paused

App Files Files Community

apexherbert200 committed on Jun 15

Commit

c9ef318

1 Parent(s): f089137

Working / experimenting

Browse files

Files changed (2) hide show

Dockerfile +1 -1
business.py +509 -0

Dockerfile CHANGED Viewed

@@ -53,4 +53,4 @@ RUN python -m playwright install chromium
 EXPOSE 7860
 # Run the FastAPI application
-CMD ["python", "-m", "uvicorn", "screenshot:app", "--host", "0.0.0.0", "--port", "7860"]

 EXPOSE 7860
 # Run the FastAPI application
+CMD ["python", "-m", "uvicorn", "business:app", "--host", "0.0.0.0", "--port", "7860"]

business.py ADDED Viewed

	@@ -0,0 +1,509 @@

+from fastapi import FastAPI, HTTPException, Query
+from pydantic import BaseModel
+from typing import List, Optional
+from playwright.async_api import async_playwright
+import json
+import re
+from urllib.parse import urlparse
+app = FastAPI(  # ASGI application object; the Dockerfile CMD in this commit serves it as "business:app"
+    title="Business Contact Intelligence API",
+    description="Professional business contact extraction and lead generation API. Extract phone numbers, emails, addresses, and social profiles from websites and directories.",
+    version="1.0.0",
+    contact={
+        "name": "Business Contact Intelligence API",
+        "email": "[email protected]",  # NOTE(review): looks like an email-obfuscation placeholder left by the page scrape - confirm the real address
+    },
+    license_info={
+        "name": "Commercial License",
+    },
+)
+class BusinessContact(BaseModel):  # One directory listing; element type of SearchResponse.results
+    business_name: str  # /search falls back to "Unknown Business" when no name element is found
+    phone: Optional[str] = None  # first number extract_phone_numbers() finds in the listing HTML
+    email: Optional[str] = None  # first address extract_emails() finds in the listing HTML
+    website: Optional[str] = None  # first external link in the listing (yellowpages/maps links are skipped)
+    address: Optional[str] = None
+    industry: Optional[str] = None
+    social_profiles: Optional[dict] = None  # /search currently always passes an empty dict
+    source_url: str  # the directory search URL the listing was scraped from
+    confidence_score: Optional[float] = None  # 0.5 base, +0.2 phone, +0.2 email, +0.1 website
+class ContactExtractionResult(BaseModel):  # Contact data scraped from a single website (/extract-from-url and /bulk-extract)
+    business_name: str  # derived from the page title (or JSON-LD "name" in /extract-from-url)
+    phones: List[str] = []  # at most 5 entries, as capped by extract_phone_numbers()
+    emails: List[str] = []  # at most 5 entries, as capped by extract_emails()
+    website: str  # the normalized URL returned by validate_url()
+    social_profiles: dict = {}  # platform name -> profile link href
+    address: Optional[str] = None  # /bulk-extract always leaves this as None
+    industry: Optional[str] = None  # /bulk-extract always leaves this as None
+class SearchResponse(BaseModel):  # Response envelope returned by GET /search
+    total_found: int  # len(results) for this request
+    results: List[BusinessContact]
+    search_query: str  # the query string exactly as received
+    source: str  # the requested directory source, echoed back
+def validate_url(url: str) -> str:
+    """Validate and normalize a caller-supplied URL.
+    Surrounding whitespace is stripped and ``https://`` is prepended when the
+    value does not already start with an http(s) scheme.
+    Args:
+        url: Raw URL or bare host name, e.g. ``"example.com"``.
+    Returns:
+        The normalized URL; its network location is non-empty.
+    Raises:
+        HTTPException: 400 when the URL is empty or has no host component.
+    """
+    url = url.strip() if url else url
+    if not url:
+        raise HTTPException(status_code=400, detail="URL is required")
+    # Compare the scheme case-insensitively so "HTTP://host" is not turned
+    # into "https://HTTP://host".
+    if not url.lower().startswith(('http://', 'https://')):
+        url = 'https://' + url
+    # urlparse only raises ValueError (e.g. malformed IPv6 brackets); catching
+    # just that keeps the HTTPException below out of the handler.
+    try:
+        parsed = urlparse(url)
+    except ValueError:
+        raise HTTPException(status_code=400, detail="Invalid URL format") from None
+    if not parsed.netloc:
+        raise HTTPException(status_code=400, detail="Invalid URL format")
+    return url
+def extract_phone_numbers(text: str) -> List[str]:
+    """Extract up to five distinct phone numbers from ``text``.
+    Patterns are tried from most to least specific. A stretch of text that an
+    earlier pattern already accepted is not matched again, so
+    ``+1 (555) 123-4567`` is reported once rather than also as ``5551234567``.
+    Candidates must not be embedded in a longer run of digits, which keeps
+    slices of numeric IDs out of the result.
+    Args:
+        text: Arbitrary text or HTML to scan.
+    Returns:
+        Numbers reduced to digits and ``+``, each at least 10 characters long,
+        de-duplicated, limited to five entries.
+    """
+    patterns = [
+        r'(?<![\d+])\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}(?!\d)',  # International
+        r'\(\d{3}\)[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)',  # US format (123) 456-7890
+        r'(?<!\d)\d{3}[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)',  # US format 123-456-7890
+        r'(?<!\d)\d{10,15}(?!\d)',  # Simple digit sequence
+    ]
+    claimed_spans = []  # (start, end) of text already accepted as a number
+    cleaned_phones = []
+    for pattern in patterns:
+        for match in re.finditer(pattern, text):
+            start, end = match.span()
+            # Skip candidates overlapping a number found by a more specific pattern
+            if any(start < c_end and c_start < end for c_start, c_end in claimed_spans):
+                continue
+            # Remove non-digits except +
+            cleaned = re.sub(r'[^\d+]', '', match.group(0))
+            if len(cleaned) < 10:
+                continue
+            claimed_spans.append((start, end))
+            if cleaned not in cleaned_phones:
+                cleaned_phones.append(cleaned)
+    return cleaned_phones[:5]  # Limit to 5 most likely numbers
+def extract_emails(text: str) -> List[str]:
+    """Extract up to five distinct email addresses from ``text``.
+    Args:
+        text: Arbitrary text or HTML to scan.
+    Returns:
+        Addresses in order of first appearance, de-duplicated
+        case-insensitively (the first spelling is kept), limited to five.
+        Placeholder domains and image file names such as ``logo@2x.png`` are
+        dropped.
+    """
+    # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal "|".
+    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+    # Filter out common false positives
+    exclude_domains = {'example.com', 'test.com', 'placeholder.com'}
+    asset_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')
+    filtered_emails = []
+    seen = set()
+    for email in re.findall(pattern, text):
+        lowered = email.lower()
+        domain = lowered.rpartition('@')[2]
+        if domain in exclude_domains or lowered.endswith(asset_suffixes):
+            continue
+        if lowered not in seen:
+            seen.add(lowered)
+            filtered_emails.append(email)
+    return filtered_emails[:5]  # Limit to 5 most likely emails
+@app.get("/search",
+         response_model=SearchResponse,
+         summary="Search Business Directory",
+         description="Search for businesses across multiple directories and extract comprehensive contact information. Perfect for lead generation and market research.",
+         tags=["Search", "Lead Generation"])
+async def search_businesses(
+    query: str = Query(..., description="Business name, industry or location to search for"),
+    limit: int = Query(10, ge=1, le=50, description="Maximum number of results (1-50)"),
+    source: str = Query("auto", description="Directory source: 'auto', 'yellowpages', 'yelp', 'google'")
+):
+    """
+    Search a business directory and return contact details for each listing.
+    Only the Yellow Pages scraper is implemented; it runs when ``source`` is
+    'auto' or 'yellowpages'. Any other ``source`` yields an empty result set.
+    Directory and per-listing failures are logged and skipped, so a failed
+    scrape is reported as zero results rather than as an error.
+    Args:
+        query: Search terms; must contain at least 2 non-blank characters.
+        limit: Maximum number of listings to return (1-50).
+        source: Directory selector, echoed back in the response.
+    Returns:
+        SearchResponse with one BusinessContact per scraped listing.
+    Raises:
+        HTTPException: 400 for a too-short query, 500 for unexpected failures.
+    """
+    import logging
+    from urllib.parse import quote_plus
+    logger = logging.getLogger(__name__)
+    if not query or len(query.strip()) < 2:
+        raise HTTPException(status_code=400, detail="Query must be at least 2 characters")
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        try:
+            # Opened inside the try so the browser is closed even if this fails
+            page = await browser.new_page()
+            businesses = []
+            if source in ["auto", "yellowpages"]:
+                # quote_plus escapes '&', '#', '?' etc. in addition to turning
+                # spaces into '+', so the query cannot corrupt the URL.
+                search_url = f"https://www.yellowpages.com/search?search_terms={quote_plus(query.strip())}"
+                try:
+                    await page.goto(search_url, timeout=30000)
+                    await page.wait_for_load_state("networkidle", timeout=10000)
+                    listings = await page.query_selector_all(".result")
+                    for listing in listings[:limit]:
+                        try:
+                            # Extract business name
+                            name_el = await listing.query_selector(".business-name, h3 a, .n")
+                            name = await name_el.inner_text() if name_el else "Unknown Business"
+                            name = name.strip()
+                            # Phones and emails are pulled from the listing's raw HTML
+                            content = await listing.inner_html()
+                            phones = extract_phone_numbers(content)
+                            primary_phone = phones[0] if phones else None
+                            emails = extract_emails(content)
+                            primary_email = emails[0] if emails else None
+                            # Website: first absolute link that is not a directory or map link
+                            website_el = await listing.query_selector("a[href*='http']")
+                            website = None
+                            if website_el:
+                                href = await website_el.get_attribute("href")
+                                if href and not any(x in href for x in ['yellowpages.com', 'maps.google.com']):
+                                    website = href
+                            # Extract address
+                            address_el = await listing.query_selector(".address, .street-address")
+                            address = (await address_el.inner_text()).strip() if address_el else None
+                            # Extract industry/category
+                            category_el = await listing.query_selector(".categories, .category")
+                            industry = (await category_el.inner_text()).strip() if category_el else None
+                            # Confidence grows with the amount of contact data found
+                            confidence = 0.5  # Base score
+                            if primary_phone:
+                                confidence += 0.2
+                            if primary_email:
+                                confidence += 0.2
+                            if website:
+                                confidence += 0.1
+                            businesses.append(BusinessContact(
+                                business_name=name,
+                                phone=primary_phone,
+                                email=primary_email,
+                                website=website,
+                                address=address,
+                                industry=industry,
+                                social_profiles={},
+                                source_url=search_url,
+                                confidence_score=round(confidence, 2)
+                            ))
+                        except Exception:
+                            # One bad listing must not discard the rest
+                            logger.warning("Skipping listing that failed to parse", exc_info=True)
+                            continue
+                except Exception:
+                    # Best effort: a failed directory yields no results, but leave a trace
+                    logger.warning("Directory lookup failed for %s", search_url, exc_info=True)
+            return SearchResponse(
+                total_found=len(businesses),
+                results=businesses,
+                search_query=query,
+                source=source
+            )
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}") from e
+        finally:
+            await browser.close()
+@app.post("/extract-from-url",
+          response_model=ContactExtractionResult,
+          summary="Extract Contacts from Website",
+          description="Extract comprehensive business contact information from any company website. Analyzes contact pages, about pages, and footer sections for maximum data extraction.",
+          tags=["Extraction", "Website Analysis"])
+async def extract_from_url(url: str):
+    """
+    Extract business contact information from a single web page.
+    The page at ``url`` is loaded once in headless Chromium and its HTML is
+    scanned for phone numbers, emails, social profile links, a street address
+    and industry keywords. The business name is the first JSON-LD ``name`` if
+    present, otherwise the page title, cut at the first " - " or " | ".
+    Args:
+        url: Website URL or bare host name; normalized by validate_url().
+    Returns:
+        ContactExtractionResult for the page.
+    Raises:
+        HTTPException: 400 for an invalid URL, 500 when loading or parsing fails.
+    """
+    url = validate_url(url)
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        try:
+            # Opened inside the try so the browser is closed even if this fails
+            page = await browser.new_page()
+            await page.goto(url, wait_until="networkidle", timeout=30000)
+            business_name = await page.title()
+            # Prefer the name from structured data when it is a usable string
+            try:
+                schema_script = await page.query_selector("script[type='application/ld+json']")
+                if schema_script:
+                    schema_data = json.loads(await schema_script.inner_text())
+                    schema_name = schema_data.get("name") if isinstance(schema_data, dict) else None
+                    if isinstance(schema_name, str) and schema_name.strip():
+                        business_name = schema_name
+            except Exception:
+                # Deliberately best effort: malformed JSON-LD falls back to the
+                # title. Not a bare except, so task cancellation still propagates.
+                pass
+            # Clean business name
+            if " - " in business_name:
+                business_name = business_name.split(" - ")[0]
+            elif " | " in business_name:
+                business_name = business_name.split(" | ")[0]
+            # Get page content for analysis
+            content = await page.content()
+            phones = extract_phone_numbers(content)
+            emails = extract_emails(content)
+            # Social profiles: first link found for each platform
+            social_domains = {
+                "linkedin": "linkedin.com",
+                "facebook": "facebook.com",
+                "twitter": "twitter.com",
+                "instagram": "instagram.com",
+                "youtube": "youtube.com",
+            }
+            social_profiles = {}
+            for platform, domain in social_domains.items():
+                try:
+                    links = await page.query_selector_all(f"a[href*='{domain}']")
+                    for link in links:
+                        href = await link.get_attribute("href")
+                        if href:
+                            social_profiles[platform] = href
+                            break
+                except Exception:
+                    # A failing lookup for one platform must not block the others
+                    continue
+            # Try to extract address. The trailing \b stops the street suffix
+            # from matching inside a longer word (e.g. "1 first" via "St").
+            address = None
+            address_patterns = [
+                r'\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way|Court|Ct)\b',
+                r'\d+\s+[A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s+\d{5}'
+            ]
+            for pattern in address_patterns:
+                match = re.search(pattern, content, re.IGNORECASE)
+                if match:
+                    address = match.group(0)
+                    break
+            # Try to determine industry from page content
+            industry = None
+            industry_keywords = {
+                "technology": ["software", "tech", "IT", "development", "programming"],
+                "healthcare": ["medical", "health", "hospital", "clinic", "doctor"],
+                "finance": ["bank", "financial", "investment", "insurance", "accounting"],
+                "retail": ["store", "shop", "retail", "commerce", "sales"],
+                "consulting": ["consulting", "advisory", "strategy", "management"],
+                "manufacturing": ["manufacturing", "production", "factory", "industrial"]
+            }
+            content_lower = content.lower()
+            def mentions(keyword):
+                # Upper-case acronyms such as "IT" can never be found in the
+                # lower-cased text, and lower-casing them would match the word
+                # "it"; match them case-sensitively as whole words instead.
+                if keyword.isupper():
+                    return re.search(rf'\b{re.escape(keyword)}\b', content) is not None
+                return keyword in content_lower
+            for industry_name, keywords in industry_keywords.items():
+                if any(mentions(keyword) for keyword in keywords):
+                    industry = industry_name.title()
+                    break
+            return ContactExtractionResult(
+                business_name=business_name.strip(),
+                phones=phones,
+                emails=emails,
+                website=url,
+                social_profiles=social_profiles,
+                address=address,
+                industry=industry
+            )
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e
+        finally:
+            await browser.close()
+class BulkExtractionRequest(BaseModel):  # Request body for POST /bulk-extract
+    urls: List[str]  # the endpoint rejects more than 20 entries with a 400
+    extract_social: bool = True  # when true, LinkedIn/Facebook links are collected per page
+    extract_address: bool = True  # NOTE(review): not read by the bulk endpoint code visible in this file
+    extract_industry: bool = True  # NOTE(review): not read by the bulk endpoint code visible in this file
+class BulkExtractionResult(BaseModel):  # Outcome for one URL of a bulk request
+    url: str  # the URL exactly as submitted (before normalization)
+    status: str  # "success" or "error"
+    error_message: Optional[str] = None  # set to "Extraction failed: ..." on error
+    contact_data: Optional[ContactExtractionResult] = None  # set on success
+class BulkExtractionResponse(BaseModel):  # Response envelope returned by POST /bulk-extract
+    total_urls: int  # number of URLs submitted
+    successful: int  # count of results with status "success"
+    failed: int  # count of results with status "error"
+    results: List[BulkExtractionResult]  # one entry per submitted URL, in request order
+@app.post("/bulk-extract",
+          response_model=BulkExtractionResponse,
+          summary="Bulk Contact Extraction (Premium)",
+          description="Extract contact information from multiple websites simultaneously. Perfect for lead generation agencies and sales teams processing large prospect lists.",
+          tags=["Bulk", "Premium", "Lead Generation"])
+async def bulk_extract_contacts(request: BulkExtractionRequest):
+    """
+    Extract contact information from up to 20 websites in a single request.
+    URLs are processed one after another in a shared headless browser. A
+    failure on one URL is recorded in its result entry and does not stop the
+    remaining URLs.
+    Args:
+        request: URLs plus extraction options. Only ``extract_social`` is
+            honoured; ``address`` and ``industry`` are always returned as None.
+    Returns:
+        BulkExtractionResponse with per-URL results and success/failure counts.
+    Raises:
+        HTTPException: 400 when more than 20 URLs are submitted.
+    """
+    if len(request.urls) > 20:
+        raise HTTPException(status_code=400, detail="Maximum 20 URLs allowed per request")
+    results = []
+    successful = 0
+    failed = 0
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        try:
+            for url in request.urls:
+                page = None
+                try:
+                    validated_url = validate_url(url)
+                    page = await browser.new_page()
+                    # Set shorter timeout for bulk processing
+                    await page.goto(validated_url, wait_until="networkidle", timeout=20000)
+                    # Extract basic contact info (simplified for speed)
+                    title = await page.title()
+                    business_name = title.split(" - ")[0] if " - " in title else title
+                    content = await page.content()
+                    phones = extract_phone_numbers(content)
+                    emails = extract_emails(content)
+                    social_profiles = {}
+                    # TODO(review): extract_address / extract_industry are accepted
+                    # in the request but not implemented here.
+                    address = None
+                    industry = None
+                    if request.extract_social:
+                        try:
+                            social_links = await page.query_selector_all("a[href*='linkedin.com'], a[href*='facebook.com']")
+                            for link in social_links:
+                                href = await link.get_attribute("href")
+                                if not href:
+                                    # get_attribute may return None; skip rather than fail
+                                    continue
+                                if "linkedin.com" in href:
+                                    social_profiles.setdefault("linkedin", href)
+                                elif "facebook.com" in href:
+                                    social_profiles.setdefault("facebook", href)
+                                if len(social_profiles) == 2:
+                                    # Both networks found; stop scanning for performance
+                                    break
+                        except Exception:
+                            # Social links are optional; keep whatever was collected.
+                            # Not a bare except, so task cancellation still propagates.
+                            pass
+                    contact_data = ContactExtractionResult(
+                        business_name=business_name.strip(),
+                        phones=phones,
+                        emails=emails,
+                        website=validated_url,
+                        social_profiles=social_profiles,
+                        address=address,
+                        industry=industry
+                    )
+                    results.append(BulkExtractionResult(
+                        url=url,
+                        status="success",
+                        contact_data=contact_data
+                    ))
+                    successful += 1
+                except Exception as e:
+                    results.append(BulkExtractionResult(
+                        url=url,
+                        status="error",
+                        error_message=f"Extraction failed: {str(e)}"
+                    ))
+                    failed += 1
+                finally:
+                    if page:
+                        try:
+                            await page.close()
+                        except Exception:
+                            # A page that fails to close must not abort the batch
+                            pass
+        finally:
+            await browser.close()
+    return BulkExtractionResponse(
+        total_urls=len(request.urls),
+        successful=successful,
+        failed=failed,
+        results=results
+    )