apexherbert200 committed
Commit 79508b3 · 1 Parent(s): 2850050

Tool for scraping contacts

Files changed (1):
  1. contacts.py +185 -143
contacts.py CHANGED
@@ -1,19 +1,18 @@
-from fastapi import FastAPI, Query, HTTPException
+from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel
 from playwright.async_api import async_playwright
-from urllib.parse import urlparse
+import asyncio
 import base64
 import logging
-from typing import List, Optional, Set
-import re
-
-app = FastAPI(title="Lead Generation Web Scraper API")
+from typing import List, Optional
+from urllib.parse import urlparse
 
-# -------------------- Logging Setup --------------------
+# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# -------------------- Pydantic Models --------------------
+app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")
+
 class LinkInfo(BaseModel):
     text: str
     href: str
@@ -24,6 +23,11 @@ class ContactInfo(BaseModel):
     social_media: List[str] = []
     contact_forms: List[str] = []
 
+class ScriptInfo(BaseModel):
+    src: str
+    script_type: Optional[str] = None
+    is_external: bool = False
+
 class BusinessInfo(BaseModel):
     company_name: Optional[str] = None
     address: Optional[str] = None
@@ -36,146 +40,184 @@ class LeadData(BaseModel):
     lead_score: int = 0
     technologies: List[str] = []
 
-class ScrapedPage(BaseModel):
-    url: str
-    page_title: Optional[str]
-    meta_description: Optional[str]
-    screenshot: Optional[str]
-    links: Optional[List[LinkInfo]]
-    lead_data: Optional[LeadData]
-
-class CrawlResponse(BaseModel):
-    pages_visited: int
-    results: List[ScrapedPage]
-
-# -------------------- Utility Functions --------------------
-async def extract_page_data(page, url: str, take_screenshot: bool) -> ScrapedPage:
-    await page.goto(url, timeout=30000)
-    title = await page.title()
-
-    meta_desc = await page.evaluate("""
-        () => {
-            const meta = document.querySelector('meta[name="description"]');
-            return meta ? meta.getAttribute('content') : null;
+class ScrapeResponse(BaseModel):
+    body_content: Optional[str] = None
+    screenshot: Optional[str] = None
+    links: Optional[List[LinkInfo]] = None
+    scripts: Optional[List[ScriptInfo]] = None
+    page_title: Optional[str] = None
+    meta_description: Optional[str] = None
+    lead_data: Optional[LeadData] = None
+
+visited_urls = set()
+
+@app.get("/")
+async def root():
+    return {
+        "message": "🚀 Lead Generation Web Scraper API",
+        "tagline": "Turn any website into qualified leads",
+        "endpoints": {
+            "/scrape": "Extract leads, contacts, and business data from any website",
+            "/docs": "API documentation"
         }
-    """)
-
-    screenshot = None
-    if take_screenshot:
-        shot = await page.screenshot(full_page=True)
-        screenshot = base64.b64encode(shot).decode('utf-8')
-
-    links = await page.evaluate("""
-        () => Array.from(document.querySelectorAll('a[href]')).map(a => ({
-            text: a.innerText.trim().substring(0, 200),
-            href: a.href
-        }))
-    """)
-
-    text = await page.evaluate("document.body.innerText")
-
-    emails = list(set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", text)))[:10]
-    phones = list(set(re.findall(r"(\+?1?[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text)))[:5]
-
-    social_links = list(set(
-        l['href'] for l in links if re.search(r"facebook|linkedin|twitter|instagram|youtube", l['href'], re.I)
-    ))[:10]
-
-    contact_forms = list(set(await page.evaluate("""
-        () => Array.from(document.querySelectorAll('form')).map(f => f.action || location.href)
-    """)))[:5]
-
-    company_name = await page.evaluate("""
-        () => document.querySelector('meta[property="og:site_name"]')?.content ||
-              document.querySelector('meta[name="application-name"]')?.content ||
-              document.querySelector('h1')?.innerText ||
-              document.title?.split('|')[0]?.split('-')[0]?.trim()
-    """)
-
-    address_matches = re.findall(r"\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)[^\n,]*", text)
-    address = address_matches[0] if address_matches else None
-
-    html = await page.content()
-    tech_stack = [t for t in ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics'] if t in html.lower()]
-
-    industry_keywords = [k for k in ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'education'] if k in text.lower()]
-
-    lead_score = min(100, sum([
-        30 if emails else 0,
-        25 if phones else 0,
-        15 if social_links else 0,
-        10 if company_name else 0,
-        15 if address else 0,
-        10 if tech_stack else 0,
-        5 if industry_keywords else 0
-    ]))
-
-    return ScrapedPage(
-        url=url,
-        page_title=title,
-        meta_description=meta_desc,
-        screenshot=screenshot,
-        links=[LinkInfo(**l) for l in links],
-        lead_data=LeadData(
-            contact_info=ContactInfo(
-                emails=emails,
-                phones=phones,
-                social_media=social_links,
-                contact_forms=contact_forms,
-            ),
-            business_info=BusinessInfo(
-                company_name=company_name,
-                address=address,
-                description=meta_desc,
-                industry_keywords=industry_keywords
-            ),
-            lead_score=lead_score,
-            technologies=tech_stack
-        )
-    )
-
-# -------------------- API Endpoint --------------------
-@app.get("/crawl-leads", response_model=CrawlResponse)
-async def crawl_leads(
-    website: str = Query(..., description="Base website URL to crawl"),
-    max_depth: int = Query(1, ge=1, le=3),
-    screenshot: bool = Query(False, description="Take full page screenshots")
+    }
+
+def normalize_url(url):
+    parsed = urlparse(url)
+    return parsed._replace(fragment='', query='').geturl().rstrip('/')
+
+@app.get("/scrape")
+async def scrape_page(
+    url: str = Query(..., description="URL to scrape"),
+    lead_generation: bool = Query(True, description="Extract lead generation data (emails, phones, business info)"),
+    screenshot: bool = Query(True, description="Take a full page screenshot"),
+    get_links: bool = Query(True, description="Extract all links from the page"),
+    get_body: bool = Query(False, description="Extract body tag content (can be large)")
 ):
-    visited: Set[str] = set()
-    results: List[ScrapedPage] = []
+    norm_url = normalize_url(url)
+    if norm_url in visited_urls:
+        raise HTTPException(status_code=400, detail="URL already scraped")
 
+    visited_urls.add(norm_url)
+    logger.info(f"Starting scrape for URL: {norm_url}")
     try:
         async with async_playwright() as p:
             browser = await p.chromium.launch(headless=True)
-            context = await browser.new_context()
-            page = await context.new_page()
-
-            queue = [(website, 0)]
-
-            while queue:
-                url, depth = queue.pop(0)
-                if url in visited or depth > max_depth:
-                    continue
-                visited.add(url)
-
-                try:
-                    result = await extract_page_data(page, url, screenshot)
-                    results.append(result)
-
-                    if depth < max_depth:
-                        links = [link.href for link in result.links if urlparse(link.href).netloc == urlparse(website).netloc]
-                        for link in links:
-                            if link not in visited:
-                                queue.append((link, depth + 1))
-                except Exception as e:
-                    logger.warning(f"Failed to scrape {url}: {e}")
-
-            await browser.close()
-
-            return CrawlResponse(
-                pages_visited=len(visited),
-                results=results
-            )
+            page = await browser.new_page()
+
+            try:
+                await page.goto(norm_url, wait_until="domcontentloaded", timeout=60000)
+
+                response = ScrapeResponse()
+                response.page_title = await page.title()
+
+                response.meta_description = await page.evaluate("""
+                    () => {
+                        const meta = document.querySelector('meta[name="description"]');
+                        return meta ? meta.getAttribute('content') : null;
+                    }
+                """)
+
+                if get_body:
+                    response.body_content = await page.evaluate("""
+                        () => {
+                            const body = document.querySelector('body');
+                            if (!body) return null;
+                            const scripts = body.querySelectorAll('script, style, noscript');
+                            scripts.forEach(el => el.remove());
+                            return body.innerText.trim();
+                        }
+                    """)
+
+                if screenshot:
+                    screenshot_bytes = await page.screenshot(full_page=True)
+                    response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
+
+                if get_links:
+                    links = await page.evaluate("""
+                        () => {
+                            return Array.from(document.querySelectorAll('a[href]')).map(a => {
+                                const text = a.innerText.trim() || a.getAttribute('aria-label') || a.getAttribute('title') || a.href;
+                                const href = a.href;
+                                if (href && href.startsWith('http')) {
+                                    return { text: text.substring(0, 200), href: href };
+                                }
+                                return null;
+                            }).filter(link => link !== null);
+                        }
+                    """)
+                    response.links = [LinkInfo(**link) for link in links]
+
+                if lead_generation:
+                    lead_data_raw = await page.evaluate("""
+                        () => {
+                            const result = {
+                                emails: [], phones: [], social_media: [], contact_forms: [],
+                                company_name: null, address: null, technologies: [], industry_keywords: []
+                            };
+
+                            const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
+                            const pageText = document.body.innerText;
+                            const emails = pageText.match(emailRegex) || [];
+                            const mailtoEmails = Array.from(document.querySelectorAll('a[href^="mailto:"]'))
+                                .map(a => a.href.replace(/^mailto:/, '').split('?')[0]);
+                            result.emails = [...new Set([...emails, ...mailtoEmails])].slice(0, 10);
+
+                            const phoneRegex = /(\+?1?[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})/g;
+                            const phones = pageText.match(phoneRegex) || [];
+                            const telPhones = Array.from(document.querySelectorAll('a[href^="tel:"]'))
+                                .map(a => a.href.replace(/^tel:/, '').split('?')[0]);
+                            result.phones = [...new Set([...phones, ...telPhones])].slice(0, 5);
+
+                            const socialLinks = Array.from(document.querySelectorAll('a[href]'))
+                                .map(a => a.href).filter(href => /facebook|twitter|linkedin|instagram|youtube|tiktok/i.test(href));
+                            result.social_media = [...new Set(socialLinks)].slice(0, 10);
+
+                            const forms = Array.from(document.querySelectorAll('form')).map(form => form.action || window.location.href);
+                            result.contact_forms = [...new Set(forms)].slice(0, 5);
+
+                            result.company_name =
+                                document.querySelector('meta[property="og:site_name"]')?.content ||
+                                document.querySelector('meta[name="application-name"]')?.content ||
+                                document.querySelector('h1')?.innerText?.trim() ||
+                                document.title?.split('|')[0]?.split('-')[0]?.trim();
+
+                            const addressRegex = /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)\s*,?\s*[A-Za-z\s]+,?\s*[A-Z]{2}\s*\d{5}/g;
+                            const addresses = pageText.match(addressRegex) || [];
+                            result.address = addresses[0] || null;
+
+                            const techKeywords = ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics', 'facebook pixel'];
+                            const htmlContent = document.documentElement.outerHTML.toLowerCase();
+                            result.technologies = techKeywords.filter(tech => htmlContent.includes(tech));
+
+                            const industryKeywords = ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'real estate', 'education', 'retail', 'manufacturing', 'legal', 'restaurant', 'fitness', 'beauty', 'automotive'];
+                            const lowerPageText = pageText.toLowerCase();
+                            result.industry_keywords = industryKeywords.filter(keyword => lowerPageText.includes(keyword));
+
+                            return result;
+                        }
+                    """)
+
+                    lead_score = 0
+                    if lead_data_raw['emails']: lead_score += 30
+                    if lead_data_raw['phones']: lead_score += 25
+                    if lead_data_raw['contact_forms']: lead_score += 15
+                    if lead_data_raw['social_media']: lead_score += 10
+                    if lead_data_raw['company_name']: lead_score += 10
+                    if lead_data_raw['address']: lead_score += 10
+                    if lead_data_raw['technologies']: lead_score += 5
+                    if lead_data_raw['industry_keywords']: lead_score += 5
+
+                    contact_info = ContactInfo(
+                        emails=lead_data_raw['emails'],
+                        phones=lead_data_raw['phones'],
+                        social_media=lead_data_raw['social_media'],
+                        contact_forms=lead_data_raw['contact_forms']
+                    )
+
+                    business_info = BusinessInfo(
+                        company_name=lead_data_raw['company_name'],
+                        address=lead_data_raw['address'],
+                        description=response.meta_description,
+                        industry_keywords=lead_data_raw['industry_keywords']
+                    )
+
+                    response.lead_data = LeadData(
+                        contact_info=contact_info,
+                        business_info=business_info,
+                        lead_score=min(lead_score, 100),
+                        technologies=lead_data_raw['technologies']
+                    )
+
+                await browser.close()
+                logger.info("Scraping completed successfully")
+                return response
+
+            except Exception as e:
+                logger.error(f"Error during scraping: {str(e)}")
+                await browser.close()
+                raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")
 
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+        logger.error(f"Error launching browser: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Browser launch error: {str(e)}")
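
For reference, a minimal sketch of how the new /scrape endpoint could be exercised once this commit is deployed. It assumes the app is served locally with uvicorn on port 8000 and uses example.com as a stand-in target; the host, port, and target URL are illustrative, not part of the commit.

# Hypothetical smoke test for the /scrape endpoint added in this commit.
# Assumptions: the app is running via `uvicorn contacts:app --port 8000`,
# and https://example.com is only a placeholder target URL.
import requests

resp = requests.get(
    "http://localhost:8000/scrape",
    params={
        "url": "https://example.com",
        "lead_generation": True,   # emails, phones, business info, lead score
        "screenshot": False,       # skip the base64 screenshot to keep the response small
        "get_links": True,
        "get_body": False,
    },
    timeout=120,  # page.goto alone is allowed up to 60 s
)
resp.raise_for_status()
data = resp.json()
print(data["page_title"], data["lead_data"]["lead_score"])

# Note: because visited_urls is a module-level set, a second request for the same
# normalized URL returns HTTP 400 ("URL already scraped") until the app restarts.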