apexherbert200 committed on
Commit
35b24cc
·
1 Parent(s): 9e4b598

🚀 Transform scraper into Lead Generation powerhouse

Browse files

MAJOR FEATURES ADDED:
📧 Email extraction with regex pattern matching
📞 Phone number detection and formatting
🏢 Company name identification (multiple methods)
📍 Address extraction with US format regex
🔗 Social media profile discovery
📝 Contact form detection
⚡ Technology stack identification
🎯 Industry keyword classification
📊 Intelligent lead scoring system (0-100)

LEAD SCORING ALGORITHM (points are summed, then capped at 100):
- Emails found: +30 points
- Phone numbers: +25 points
- Contact forms: +20 points
- Social media: +15 points
- Address: +15 points
- Company name: +10 points
- Technologies: +10 points
- Industry keywords: +5 points

BUSINESS APPLICATIONS:
- B2B lead generation
- Sales prospecting
- Market research
- Competitor analysis
- Contact discovery

Example: /scrape?url=https://company.com&lead_generation=true

Files changed (1) hide show
  1. scrape.py +144 -9
scrape.py CHANGED
@@ -16,36 +16,73 @@ class LinkInfo(BaseModel):
16
  text: str
17
  href: str
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  class ScrapeResponse(BaseModel):
20
  body_content: Optional[str] = None
21
  screenshot: Optional[str] = None
22
  links: Optional[List[LinkInfo]] = None
23
  page_title: Optional[str] = None
24
  meta_description: Optional[str] = None
 
25
 
26
  @app.get("/")
27
  async def root():
28
  return {
29
- "message": "Playwright Web Scraper API - Body, Links & Images",
 
30
  "endpoints": {
31
- "/scrape": "Scrape webpage body content, links, and take screenshot",
32
  "/docs": "API documentation"
33
  },
34
- "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_body=true",
35
- "features": [
36
- "Extract body tag content (clean text)",
37
- "Get all links with text and URLs",
38
- "Take full page screenshot",
39
- "Extract page title and meta description"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  ]
41
  }
42
 
43
  @app.get("/scrape")
44
  async def scrape_page(
45
  url: str = Query(..., description="URL to scrape"),
 
46
  screenshot: bool = Query(True, description="Take a full page screenshot"),
47
  get_links: bool = Query(True, description="Extract all links from the page"),
48
- get_body: bool = Query(True, description="Extract body tag content")
49
  ):
50
  logger.info(f"Starting scrape for URL: {url}")
51
  try:
@@ -128,6 +165,104 @@ async def scrape_page(
128
  """)
129
  response.links = [LinkInfo(**link) for link in links]
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  await browser.close()
132
  logger.info("Scraping completed successfully")
133
  return response
 
16
  text: str
17
  href: str
18
 
19
class ContactInfo(BaseModel):
    # Contact channels collected from a scraped page. Every field defaults
    # to an empty list, so the model can be built with no arguments.

    # Email addresses matched in the page's visible text.
    emails: List[str] = []

    # Phone-number strings matched in the page's visible text.
    phones: List[str] = []

    # Link URLs that mention a known social network name.
    social_media: List[str] = []

    # Submission URLs of the <form> elements found on the page.
    contact_forms: List[str] = []
24
+
25
class BusinessInfo(BaseModel):
    # Descriptive facts about the business behind a scraped page. All
    # scalar fields are optional because any of them may be undetectable.

    # Best-guess company name, or None when nothing usable was found.
    company_name: Optional[str] = None

    # First postal address matched in the page text, or None.
    address: Optional[str] = None

    # Free-text description of the business (the scraper passes the page's
    # meta description here).
    description: Optional[str] = None

    # Industry terms from the scraper's fixed keyword list that occur in
    # the page text.
    industry_keywords: List[str] = []
30
+
31
class LeadData(BaseModel):
    # Aggregated lead-generation result for a single page.

    # Required: contact channels found on the page.
    contact_info: ContactInfo

    # Required: business details found on the page.
    business_info: BusinessInfo

    # Lead quality score; the scraper caps the value it stores at 100.
    lead_score: int = 0

    # Technology names from the scraper's fixed list that were detected in
    # the page's HTML.
    technologies: List[str] = []
36
+
37
  class ScrapeResponse(BaseModel):
38
  body_content: Optional[str] = None
39
  screenshot: Optional[str] = None
40
  links: Optional[List[LinkInfo]] = None
41
  page_title: Optional[str] = None
42
  meta_description: Optional[str] = None
43
+ lead_data: Optional[LeadData] = None
44
 
45
  @app.get("/")
46
  async def root():
47
  return {
48
+ "message": "πŸš€ Lead Generation Web Scraper API",
49
+ "tagline": "Turn any website into qualified leads",
50
  "endpoints": {
51
+ "/scrape": "Extract leads, contacts, and business data from any website",
52
  "/docs": "API documentation"
53
  },
54
+ "example": "/scrape?url=https://example.com&lead_generation=true&screenshot=true",
55
+ "lead_generation_features": [
56
+ "πŸ“§ Extract email addresses and contact forms",
57
+ "πŸ“ž Find phone numbers and contact info",
58
+ "🏒 Identify company names and addresses",
59
+ "πŸ”— Discover social media profiles",
60
+ "⚑ Detect technologies and tools used",
61
+ "πŸ“Š Calculate lead quality scores",
62
+ "🎯 Industry keyword extraction"
63
+ ],
64
+ "basic_features": [
65
+ "πŸ“„ Clean body text extraction",
66
+ "πŸ”— Smart link filtering",
67
+ "πŸ“Έ Full page screenshots",
68
+ "πŸ“‹ Page metadata extraction"
69
+ ],
70
+ "use_cases": [
71
+ "B2B lead generation",
72
+ "Sales prospecting",
73
+ "Market research",
74
+ "Competitor analysis",
75
+ "Contact discovery"
76
  ]
77
  }
78
 
79
  @app.get("/scrape")
80
  async def scrape_page(
81
  url: str = Query(..., description="URL to scrape"),
82
+ lead_generation: bool = Query(True, description="Extract lead generation data (emails, phones, business info)"),
83
  screenshot: bool = Query(True, description="Take a full page screenshot"),
84
  get_links: bool = Query(True, description="Extract all links from the page"),
85
+ get_body: bool = Query(False, description="Extract body tag content (can be large)")
86
  ):
87
  logger.info(f"Starting scrape for URL: {url}")
88
  try:
 
165
  """)
166
  response.links = [LinkInfo(**link) for link in links]
167
 
168
+ # Lead Generation Extraction
169
+ if lead_generation:
170
+ logger.info("Extracting lead generation data...")
171
+ lead_data_raw = await page.evaluate("""
172
+ () => {
173
+ const result = {
174
+ emails: [],
175
+ phones: [],
176
+ social_media: [],
177
+ contact_forms: [],
178
+ company_name: null,
179
+ address: null,
180
+ technologies: [],
181
+ industry_keywords: []
182
+ };
183
+
184
+ // Extract emails
185
+ const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
186
+ const pageText = document.body.innerText;
187
+ const emails = pageText.match(emailRegex) || [];
188
+ result.emails = [...new Set(emails)].slice(0, 10); // Unique emails, max 10
189
+
190
+ // Extract phone numbers
191
+ const phoneRegex = /(\+?1?[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})/g;
192
+ const phones = pageText.match(phoneRegex) || [];
193
+ result.phones = [...new Set(phones)].slice(0, 5); // Unique phones, max 5
194
+
195
+ // Extract social media links
196
+ const socialLinks = Array.from(document.querySelectorAll('a[href]')).map(a => a.href)
197
+ .filter(href => /facebook|twitter|linkedin|instagram|youtube|tiktok/i.test(href));
198
+ result.social_media = [...new Set(socialLinks)].slice(0, 10);
199
+
200
+ // Find contact forms
201
+ const forms = Array.from(document.querySelectorAll('form')).map(form => {
202
+ const action = form.action || window.location.href;
203
+ return action;
204
+ });
205
+ result.contact_forms = [...new Set(forms)].slice(0, 5);
206
+
207
+ // Extract company name (try multiple methods)
208
+ result.company_name =
209
+ document.querySelector('meta[property="og:site_name"]')?.content ||
210
+ document.querySelector('meta[name="application-name"]')?.content ||
211
+ document.querySelector('h1')?.innerText?.trim() ||
212
+ document.title?.split('|')[0]?.split('-')[0]?.trim();
213
+
214
+ // Extract address
215
+ const addressRegex = /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)\s*,?\s*[A-Za-z\s]+,?\s*[A-Z]{2}\s*\d{5}/g;
216
+ const addresses = pageText.match(addressRegex) || [];
217
+ result.address = addresses[0] || null;
218
+
219
+ // Detect technologies
220
+ const techKeywords = ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics', 'facebook pixel'];
221
+ const htmlContent = document.documentElement.outerHTML.toLowerCase();
222
+ result.technologies = techKeywords.filter(tech => htmlContent.includes(tech));
223
+
224
+ // Industry keywords
225
+ const industryKeywords = ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'real estate', 'education', 'retail', 'manufacturing', 'legal', 'restaurant', 'fitness', 'beauty', 'automotive'];
226
+ const lowerPageText = pageText.toLowerCase();
227
+ result.industry_keywords = industryKeywords.filter(keyword => lowerPageText.includes(keyword));
228
+
229
+ return result;
230
+ }
231
+ """)
232
+
233
+ # Calculate lead score
234
+ lead_score = 0
235
+ if lead_data_raw['emails']: lead_score += 30
236
+ if lead_data_raw['phones']: lead_score += 25
237
+ if lead_data_raw['contact_forms']: lead_score += 20
238
+ if lead_data_raw['social_media']: lead_score += 15
239
+ if lead_data_raw['company_name']: lead_score += 10
240
+ if lead_data_raw['address']: lead_score += 15
241
+ if lead_data_raw['technologies']: lead_score += 10
242
+ if lead_data_raw['industry_keywords']: lead_score += 5
243
+
244
+ # Create lead data object
245
+ contact_info = ContactInfo(
246
+ emails=lead_data_raw['emails'],
247
+ phones=lead_data_raw['phones'],
248
+ social_media=lead_data_raw['social_media'],
249
+ contact_forms=lead_data_raw['contact_forms']
250
+ )
251
+
252
+ business_info = BusinessInfo(
253
+ company_name=lead_data_raw['company_name'],
254
+ address=lead_data_raw['address'],
255
+ description=response.meta_description,
256
+ industry_keywords=lead_data_raw['industry_keywords']
257
+ )
258
+
259
+ response.lead_data = LeadData(
260
+ contact_info=contact_info,
261
+ business_info=business_info,
262
+ lead_score=min(lead_score, 100), # Cap at 100
263
+ technologies=lead_data_raw['technologies']
264
+ )
265
+
266
  await browser.close()
267
  logger.info("Scraping completed successfully")
268
  return response