Commit 79508b3 · Parent(s): 2850050

Tool for scraping contacts

contacts.py CHANGED (+185 -143)
@@ -1,19 +1,18 @@
-from fastapi import FastAPI,
+from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel
 from playwright.async_api import async_playwright
-
+import asyncio
 import base64
 import logging
-from typing import List, Optional
-import
-
-app = FastAPI(title="Lead Generation Web Scraper API")
+from typing import List, Optional
+from urllib.parse import urlparse
 
-#
+# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-
+app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")
+
 class LinkInfo(BaseModel):
     text: str
     href: str
@@ -24,6 +23,11 @@ class ContactInfo(BaseModel):
     social_media: List[str] = []
     contact_forms: List[str] = []
 
+class ScriptInfo(BaseModel):
+    src: str
+    script_type: Optional[str] = None
+    is_external: bool = False
+
 class BusinessInfo(BaseModel):
     company_name: Optional[str] = None
     address: Optional[str] = None
@@ -36,146 +40,184 @@ class LeadData(BaseModel):
     lead_score: int = 0
     technologies: List[str] = []
 
-class
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        const meta = document.querySelector('meta[name="description"]');
-        return meta ? meta.getAttribute('content') : null;
+class ScrapeResponse(BaseModel):
+    body_content: Optional[str] = None
+    screenshot: Optional[str] = None
+    links: Optional[List[LinkInfo]] = None
+    scripts: Optional[List[ScriptInfo]] = None
+    page_title: Optional[str] = None
+    meta_description: Optional[str] = None
+    lead_data: Optional[LeadData] = None
+
+visited_urls = set()
+
+@app.get("/")
+async def root():
+    return {
+        "message": "🚀 Lead Generation Web Scraper API",
+        "tagline": "Turn any website into qualified leads",
+        "endpoints": {
+            "/scrape": "Extract leads, contacts, and business data from any website",
+            "/docs": "API documentation"
         }
-
-
-
-
-
-
-
-
-
-
-
-
-""
-
-    text = await page.evaluate("document.body.innerText")
-
-    emails = list(set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)))[:10]
-    phones = list(set(re.findall(r"(\+?1?[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text)))[:5]
-
-    social_links = list(set(
-        l['href'] for l in links if re.search(r"facebook|linkedin|twitter|instagram|youtube", l['href'], re.I)
-    ))[:10]
-
-    contact_forms = list(set(await page.evaluate("""
-        () => Array.from(document.querySelectorAll('form')).map(f => f.action || location.href)
-    """)))[:5]
-
-    company_name = await page.evaluate("""
-        () => document.querySelector('meta[property="og:site_name"]')?.content ||
-            document.querySelector('meta[name="application-name"]')?.content ||
-            document.querySelector('h1')?.innerText ||
-            document.title?.split('|')[0]?.split('-')[0]?.trim()
-    """)
-
-    address_matches = re.findall(r"\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)[^\n,]*", text)
-    address = address_matches[0] if address_matches else None
-
-    html = await page.content()
-    tech_stack = [t for t in ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics'] if t in html.lower()]
-
-    industry_keywords = [k for k in ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'education'] if k in text.lower()]
-
-    lead_score = min(100, sum([
-        30 if emails else 0,
-        25 if phones else 0,
-        15 if social_links else 0,
-        10 if company_name else 0,
-        15 if address else 0,
-        10 if tech_stack else 0,
-        5 if industry_keywords else 0
-    ]))
-
-    return ScrapedPage(
-        url=url,
-        page_title=title,
-        meta_description=meta_desc,
-        screenshot=screenshot,
-        links=[LinkInfo(**l) for l in links],
-        lead_data=LeadData(
-            contact_info=ContactInfo(
-                emails=emails,
-                phones=phones,
-                social_media=social_links,
-                contact_forms=contact_forms,
-            ),
-            business_info=BusinessInfo(
-                company_name=company_name,
-                address=address,
-                description=meta_desc,
-                industry_keywords=industry_keywords
-            ),
-            lead_score=lead_score,
-            technologies=tech_stack
-        )
-    )
-
-# -------------------- API Endpoint --------------------
-@app.get("/crawl-leads", response_model=CrawlResponse)
-async def crawl_leads(
-    website: str = Query(..., description="Base website URL to crawl"),
-    max_depth: int = Query(1, ge=1, le=3),
-    screenshot: bool = Query(False, description="Take full page screenshots")
+    }
+
+def normalize_url(url):
+    parsed = urlparse(url)
+    return parsed._replace(fragment='', query='').geturl().rstrip('/')
+
+@app.get("/scrape")
+async def scrape_page(
+    url: str = Query(..., description="URL to scrape"),
+    lead_generation: bool = Query(True, description="Extract lead generation data (emails, phones, business info)"),
+    screenshot: bool = Query(True, description="Take a full page screenshot"),
+    get_links: bool = Query(True, description="Extract all links from the page"),
+    get_body: bool = Query(False, description="Extract body tag content (can be large)")
 ):
-
-
+    norm_url = normalize_url(url)
+    if norm_url in visited_urls:
+        raise HTTPException(status_code=400, detail="URL already scraped")
 
+    visited_urls.add(norm_url)
+    logger.info(f"Starting scrape for URL: {norm_url}")
     try:
         async with async_playwright() as p:
             browser = await p.chromium.launch(headless=True)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            page = await browser.new_page()
+
+            try:
+                await page.goto(norm_url, wait_until="domcontentloaded", timeout=60000)
+
+                response = ScrapeResponse()
+                response.page_title = await page.title()
+
+                response.meta_description = await page.evaluate("""
+                    () => {
+                        const meta = document.querySelector('meta[name="description"]');
+                        return meta ? meta.getAttribute('content') : null;
+                    }
+                """)
+
+                if get_body:
+                    response.body_content = await page.evaluate("""
+                        () => {
+                            const body = document.querySelector('body');
+                            if (!body) return null;
+                            const scripts = body.querySelectorAll('script, style, noscript');
+                            scripts.forEach(el => el.remove());
+                            return body.innerText.trim();
+                        }
+                    """)
+
+                if screenshot:
+                    screenshot_bytes = await page.screenshot(full_page=True)
+                    response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
+
+                if get_links:
+                    links = await page.evaluate("""
+                        () => {
+                            return Array.from(document.querySelectorAll('a[href]')).map(a => {
+                                const text = a.innerText.trim() || a.getAttribute('aria-label') || a.getAttribute('title') || a.href;
+                                const href = a.href;
+                                if (href && href.startsWith('http')) {
+                                    return { text: text.substring(0, 200), href: href };
+                                }
+                                return null;
+                            }).filter(link => link !== null);
+                        }
+                    """)
+                    response.links = [LinkInfo(**link) for link in links]
+
+                if lead_generation:
+                    lead_data_raw = await page.evaluate("""
+                        () => {
+                            const result = {
+                                emails: [], phones: [], social_media: [], contact_forms: [],
+                                company_name: null, address: null, technologies: [], industry_keywords: []
+                            };
+
+                            const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
+                            const pageText = document.body.innerText;
+                            const emails = pageText.match(emailRegex) || [];
+                            const mailtoEmails = Array.from(document.querySelectorAll('a[href^="mailto:"]'))
+                                .map(a => a.href.replace(/^mailto:/, '').split('?')[0]);
+                            result.emails = [...new Set([...emails, ...mailtoEmails])].slice(0, 10);
+
+                            const phoneRegex = /(\+?1?[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})/g;
+                            const phones = pageText.match(phoneRegex) || [];
+                            const telPhones = Array.from(document.querySelectorAll('a[href^="tel:"]'))
+                                .map(a => a.href.replace(/^tel:/, '').split('?')[0]);
+                            result.phones = [...new Set([...phones, ...telPhones])].slice(0, 5);
+
+                            const socialLinks = Array.from(document.querySelectorAll('a[href]'))
+                                .map(a => a.href).filter(href => /facebook|twitter|linkedin|instagram|youtube|tiktok/i.test(href));
+                            result.social_media = [...new Set(socialLinks)].slice(0, 10);
+
+                            const forms = Array.from(document.querySelectorAll('form')).map(form => form.action || window.location.href);
+                            result.contact_forms = [...new Set(forms)].slice(0, 5);
+
+                            result.company_name =
+                                document.querySelector('meta[property="og:site_name"]')?.content ||
+                                document.querySelector('meta[name="application-name"]')?.content ||
+                                document.querySelector('h1')?.innerText?.trim() ||
+                                document.title?.split('|')[0]?.split('-')[0]?.trim();
+
+                            const addressRegex = /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)\s*,?\s*[A-Za-z\s]+,?\s*[A-Z]{2}\s*\d{5}/g;
+                            const addresses = pageText.match(addressRegex) || [];
+                            result.address = addresses[0] || null;
+
+                            const techKeywords = ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics', 'facebook pixel'];
+                            const htmlContent = document.documentElement.outerHTML.toLowerCase();
+                            result.technologies = techKeywords.filter(tech => htmlContent.includes(tech));
+
+                            const industryKeywords = ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'real estate', 'education', 'retail', 'manufacturing', 'legal', 'restaurant', 'fitness', 'beauty', 'automotive'];
+                            const lowerPageText = pageText.toLowerCase();
+                            result.industry_keywords = industryKeywords.filter(keyword => lowerPageText.includes(keyword));
+
+                            return result;
+                        }
+                    """)
+
+                    lead_score = 0
+                    if lead_data_raw['emails']: lead_score += 30
+                    if lead_data_raw['phones']: lead_score += 25
+                    if lead_data_raw['contact_forms']: lead_score += 15
+                    if lead_data_raw['social_media']: lead_score += 10
+                    if lead_data_raw['company_name']: lead_score += 10
+                    if lead_data_raw['address']: lead_score += 10
+                    if lead_data_raw['technologies']: lead_score += 5
+                    if lead_data_raw['industry_keywords']: lead_score += 5
+
+                    contact_info = ContactInfo(
+                        emails=lead_data_raw['emails'],
+                        phones=lead_data_raw['phones'],
+                        social_media=lead_data_raw['social_media'],
+                        contact_forms=lead_data_raw['contact_forms']
+                    )
+
+                    business_info = BusinessInfo(
+                        company_name=lead_data_raw['company_name'],
+                        address=lead_data_raw['address'],
+                        description=response.meta_description,
+                        industry_keywords=lead_data_raw['industry_keywords']
+                    )
+
+                    response.lead_data = LeadData(
+                        contact_info=contact_info,
+                        business_info=business_info,
+                        lead_score=min(lead_score, 100),
+                        technologies=lead_data_raw['technologies']
+                    )
+
+                await browser.close()
+                logger.info("Scraping completed successfully")
+                return response
+
+            except Exception as e:
+                logger.error(f"Error during scraping: {str(e)}")
+                await browser.close()
+                raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")
 
     except Exception as e:
-
+        logger.error(f"Error launching browser: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Browser launch error: {str(e)}")
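
A note on the new dedup logic: normalize_url drops the query string and fragment, then any trailing slash, so cosmetic variants of the same page collapse to a single visited_urls key. A minimal standalone sketch of that behavior (the example URLs are made up):

    from urllib.parse import urlparse

    def normalize_url(url):
        # Same logic as the diff: strip query + fragment, then the trailing slash.
        parsed = urlparse(url)
        return parsed._replace(fragment='', query='').geturl().rstrip('/')

    # Both print "https://example.com/contact", so a second /scrape request
    # for either variant would get the 400 "URL already scraped" response.
    print(normalize_url("https://example.com/contact/?utm_source=x#form"))
    print(normalize_url("https://example.com/contact"))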
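
The new lead score is a capped weighted sum: emails 30, phones 25, contact forms 15, social links 10, company name 10, address 10, technologies 5, industry keywords 5, for an uncapped maximum of 110 that is clamped to 100. The same arithmetic as a standalone Python sketch, applied to a hypothetical extraction result:

    # Weights copied from the scoring block in the diff above.
    WEIGHTS = {
        "emails": 30, "phones": 25, "contact_forms": 15, "social_media": 10,
        "company_name": 10, "address": 10, "technologies": 5, "industry_keywords": 5,
    }

    # Hypothetical extraction result: emails, a phone, and a contact form found.
    extracted = {
        "emails": ["info@example.com"], "phones": ["555-123-4567"],
        "contact_forms": ["https://example.com/contact"], "social_media": [],
        "company_name": None, "address": None, "technologies": [], "industry_keywords": [],
    }

    # A field contributes its weight whenever it is non-empty, mirroring the
    # chain of "if lead_data_raw[...]" checks in the endpoint.
    score = min(100, sum(w for field, w in WEIGHTS.items() if extracted[field]))
    print(score)  # 30 + 25 + 15 = 70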
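
Finally, an end-to-end usage sketch, assuming the file is saved as contacts.py, the dependencies are installed, and the app is served with "uvicorn contacts:app" on localhost:8000; the requests client, target URL, and output filename are placeholders, not part of the commit:

    import base64
    import requests

    resp = requests.get(
        "http://localhost:8000/scrape",
        params={"url": "https://example.com", "screenshot": "true", "lead_generation": "true"},
        timeout=120,  # page.goto alone is allowed up to 60s, so leave headroom
    )
    resp.raise_for_status()
    data = resp.json()

    print(data["page_title"], data["lead_data"]["lead_score"])

    # The screenshot field is a base64-encoded full-page PNG.
    if data.get("screenshot"):
        with open("page.png", "wb") as f:
            f.write(base64.b64decode(data["screenshot"]))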