apexherbert200 committed on
Commit 2850050 · 1 Parent(s): 9a5d887

Tool for scraping contacts

Files changed (2)
  1. Dockerfile +1 -1
  2. contacts.py +181 -0
Dockerfile CHANGED
@@ -53,4 +53,4 @@ RUN python -m playwright install chromium
  EXPOSE 7860
 
  # Run the FastAPI application
- CMD ["python", "-m", "uvicorn", "scrapeAPI2:app", "--host", "0.0.0.0", "--port", "7860"]
+ CMD ["python", "-m", "uvicorn", "contacts:app", "--host", "0.0.0.0", "--port", "7860"]
contacts.py CHANGED
@@ -0,0 +1,181 @@
+ from fastapi import FastAPI, Query, HTTPException
+ from pydantic import BaseModel
+ from playwright.async_api import async_playwright
+ from urllib.parse import urlparse
+ import base64
+ import logging
+ from typing import List, Optional, Set
+ import re
+
+ app = FastAPI(title="Lead Generation Web Scraper API")
+
+ # -------------------- Logging Setup --------------------
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # -------------------- Pydantic Models --------------------
+ class LinkInfo(BaseModel):
+     text: str
+     href: str
+
+ class ContactInfo(BaseModel):
+     emails: List[str] = []
+     phones: List[str] = []
+     social_media: List[str] = []
+     contact_forms: List[str] = []
+
+ class BusinessInfo(BaseModel):
+     company_name: Optional[str] = None
+     address: Optional[str] = None
+     description: Optional[str] = None
+     industry_keywords: List[str] = []
+
+ class LeadData(BaseModel):
+     contact_info: ContactInfo
+     business_info: BusinessInfo
+     lead_score: int = 0
+     technologies: List[str] = []
+
+ class ScrapedPage(BaseModel):
+     url: str
+     page_title: Optional[str]
+     meta_description: Optional[str]
+     screenshot: Optional[str]
+     links: Optional[List[LinkInfo]]
+     lead_data: Optional[LeadData]
+
+ class CrawlResponse(BaseModel):
+     pages_visited: int
+     results: List[ScrapedPage]
+
+ # -------------------- Utility Functions --------------------
+ async def extract_page_data(page, url: str, take_screenshot: bool) -> ScrapedPage:
+     await page.goto(url, timeout=30000)
+     title = await page.title()
+
+     meta_desc = await page.evaluate("""
+         () => {
+             const meta = document.querySelector('meta[name="description"]');
+             return meta ? meta.getAttribute('content') : null;
+         }
+     """)
+
+     screenshot = None
+     if take_screenshot:
+         shot = await page.screenshot(full_page=True)
+         screenshot = base64.b64encode(shot).decode('utf-8')
+
+     links = await page.evaluate("""
+         () => Array.from(document.querySelectorAll('a[href]')).map(a => ({
+             text: a.innerText.trim().substring(0, 200),
+             href: a.href
+         }))
+     """)
+
+     text = await page.evaluate("document.body.innerText")
+
+     emails = list(set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)))[:10]
+     phones = list(set(re.findall(r"(?:\+?1?[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text)))[:5]
+
+     social_links = list(set(
+         l['href'] for l in links if re.search(r"facebook|linkedin|twitter|instagram|youtube", l['href'], re.I)
+     ))[:10]
+
+     contact_forms = list(set(await page.evaluate("""
+         () => Array.from(document.querySelectorAll('form')).map(f => f.action || location.href)
+     """)))[:5]
+
+     company_name = await page.evaluate("""
+         () => document.querySelector('meta[property="og:site_name"]')?.content ||
+               document.querySelector('meta[name="application-name"]')?.content ||
+               document.querySelector('h1')?.innerText ||
+               document.title?.split('|')[0]?.split('-')[0]?.trim()
+     """)
+
+     address_matches = re.findall(r"\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)[^\n,]*", text)
+     address = address_matches[0] if address_matches else None
+
+     html = await page.content()
+     tech_stack = [t for t in ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics'] if t in html.lower()]
+
+     industry_keywords = [k for k in ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'education'] if k in text.lower()]
+
+     lead_score = min(100, sum([
+         30 if emails else 0,
+         25 if phones else 0,
+         15 if social_links else 0,
+         10 if company_name else 0,
+         15 if address else 0,
+         10 if tech_stack else 0,
+         5 if industry_keywords else 0
+     ]))
+
+     return ScrapedPage(
+         url=url,
+         page_title=title,
+         meta_description=meta_desc,
+         screenshot=screenshot,
+         links=[LinkInfo(**l) for l in links],
+         lead_data=LeadData(
+             contact_info=ContactInfo(
+                 emails=emails,
+                 phones=phones,
+                 social_media=social_links,
+                 contact_forms=contact_forms,
+             ),
+             business_info=BusinessInfo(
+                 company_name=company_name,
+                 address=address,
+                 description=meta_desc,
+                 industry_keywords=industry_keywords
+             ),
+             lead_score=lead_score,
+             technologies=tech_stack
+         )
+     )
+
+ # -------------------- API Endpoint --------------------
+ @app.get("/crawl-leads", response_model=CrawlResponse)
+ async def crawl_leads(
+     website: str = Query(..., description="Base website URL to crawl"),
+     max_depth: int = Query(1, ge=1, le=3),
+     screenshot: bool = Query(False, description="Take full page screenshots")
+ ):
+     visited: Set[str] = set()
+     results: List[ScrapedPage] = []
+
+     try:
+         async with async_playwright() as p:
+             browser = await p.chromium.launch(headless=True)
+             context = await browser.new_context()
+             page = await context.new_page()
+
+             queue = [(website, 0)]
+
+             while queue:
+                 url, depth = queue.pop(0)
+                 if url in visited or depth > max_depth:
+                     continue
+                 visited.add(url)
+
+                 try:
+                     result = await extract_page_data(page, url, screenshot)
+                     results.append(result)
+
+                     if depth < max_depth:
+                         links = [link.href for link in result.links if urlparse(link.href).netloc == urlparse(website).netloc]
+                         for link in links:
+                             if link not in visited:
+                                 queue.append((link, depth + 1))
+                 except Exception as e:
+                     logger.warning(f"Failed to scrape {url}: {e}")
+
+             await browser.close()
+
+             return CrawlResponse(
+                 pages_visited=len(visited),
+                 results=results
+             )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
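
Once the service is running (for example via the container's CMD on port 7860), the new endpoint can be exercised with a plain HTTP GET. A minimal client sketch, assuming the API is reachable at localhost:7860 and that the requests package is available on the client side (it is not a dependency added by this commit):

    # query_leads.py (hypothetical client, not included in this commit)
    import requests

    resp = requests.get(
        "http://localhost:7860/crawl-leads",
        params={
            "website": "https://example.com",  # base URL to crawl
            "max_depth": 1,                    # validated range is 1-3
            "screenshot": False,               # skip base64 screenshots to keep the payload small
        },
        timeout=120,  # crawling several pages can take a while
    )
    resp.raise_for_status()
    data = resp.json()

    print("pages visited:", data["pages_visited"])
    for page in data["results"]:
        lead = page["lead_data"]
        print(page["url"], "score:", lead["lead_score"], "emails:", lead["contact_info"]["emails"])

The response mirrors the CrawlResponse model above: pages_visited plus one ScrapedPage entry per crawled URL, each carrying contact_info, business_info, lead_score, and detected technologies.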