apexherbert200 committed on
Commit 43614ad · 1 Parent(s): a8c57d0

Changed timeouts

Files changed (2):
  1. scrape.py  +2 -2
  2. test1.py   +323 -49
scrape.py CHANGED
@@ -302,7 +302,7 @@ async def search_leads(
 
             # Accept cookies if present (optional, depends on region)
             try:
-                await page.click('button[aria-label="Accept all"]', timeout=3000)
+                await page.click('button[aria-label="Accept all"]', timeout=180000)
             except:
                 pass
 
@@ -311,7 +311,7 @@ async def search_leads(
             await page.click('button#searchbox-searchbutton')
 
             # Wait for search results to load - selector for listings container
-            await page.wait_for_selector('div[role="article"]', timeout=10000)
+            await page.wait_for_selector('div[role="article"]', timeout=180000)
 
             # Scroll results container to load more items (optional)
             # For now, scrape the visible ones
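
Both edits raise a per-call timeout to 180000 ms (three minutes): the optional cookie-consent click from 3000 ms and the results-container wait from 10000 ms. Because the consent click sits in a try/except that swallows failures, the new value is also how long the search stalls when no consent button appears. If one ceiling should apply to every waiting call, Playwright's set_default_timeout sets it once per page; a minimal sketch (not part of this commit, reusing the selectors from search_leads with a placeholder query):

import asyncio
from playwright.async_api import async_playwright

async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # One default ceiling for click/fill/wait_for_selector instead of
        # repeating timeout=180000 on every call.
        page.set_default_timeout(180_000)
        await page.goto("https://www.google.com/maps")
        try:
            # A shorter per-call timeout still overrides the default, so the
            # optional consent banner cannot block for the full three minutes.
            await page.click('button[aria-label="Accept all"]', timeout=3_000)
        except Exception:
            pass
        await page.fill('input#searchboxinput', "coffee shops in Berlin")
        await page.click('button#searchbox-searchbutton')
        await page.wait_for_selector('div[role="article"]')  # uses the 180 s default
        await browser.close()

asyncio.run(main())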
test1.py CHANGED
@@ -1,72 +1,346 @@
-from fastapi import Query, FastAPI
-
-
-
-app = FastAPI(title="LeadGen Scraper", description="A Lead generation scraper")
-
-
-@app.get("/search_leads")
-async def search_leads(
-    query: str = Query(..., description="Search term for business leads")
-):
-    logger.info(f"Searching Google Maps for: {query}")
-
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=True)
-        page = await browser.new_page()
-
-        try:
-            # Go to Google Maps
-            await page.goto("https://www.google.com/maps", wait_until="networkidle")
-
-            # Accept cookies if present (optional, depends on region)
-            try:
-                await page.click('button[aria-label="Accept all"]', timeout=3000)
-            except:
-                pass
-
-            # Type the query in the search box and press Enter
-            await page.fill('input#searchboxinput', query)
-            await page.click('button#searchbox-searchbutton')
-
-            # Wait for search results to load - selector for listings container
-            await page.wait_for_selector('div[role="article"]', timeout=10000)
-
-            # Scroll results container to load more items (optional)
-            # For now, scrape the visible ones
-
-            # Extract data from listings
-            results = await page.evaluate("""
-                () => {
-                    const listings = [];
-                    const elements = document.querySelectorAll('div[role="article"]');
-                    elements.forEach(el => {
-                        const nameEl = el.querySelector('h3 span');
-                        const name = nameEl ? nameEl.innerText : null;
-
-                        const addressEl = el.querySelector('[data-tooltip="Address"]');
-                        const address = addressEl ? addressEl.innerText : null;
-
-                        const phoneEl = el.querySelector('button[data-tooltip="Copy phone number"]');
-                        const phone = phoneEl ? phoneEl.getAttribute('aria-label')?.replace('Copy phone number ', '') : null;
-
-                        const websiteEl = el.querySelector('a[aria-label*="Website"]');
-                        const website = websiteEl ? websiteEl.href : null;
-
-                        listings.push({name, address, phone, website});
-                    });
-                    return listings;
-                }
-            """)
-
-            await browser.close()
-
-            # Filter out empty entries
-            filtered = [r for r in results if r['name']]
-
-            return {"query": query, "results_count": len(filtered), "results": filtered}
-
-        except Exception as e:
-            await browser.close()
-            logger.error(f"Error during Google Maps search scraping: {str(e)}")
-            raise HTTPException(status_code=500, detail=f"Search scraping error: {str(e)}")
+from fastapi import FastAPI, HTTPException, Query
+from pydantic import BaseModel
+from playwright.async_api import async_playwright
+import asyncio
+import base64
+import logging
+from typing import List, Optional
+from urllib.parse import urlparse, parse_qs
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI(title="Playwright Web Scraper", description="Scrape websites based on search queries")
+
+class LinkInfo(BaseModel):
+    text: str
+    href: str
+
+class ContactInfo(BaseModel):
+    emails: List[str] = []
+    phones: List[str] = []
+    social_media: List[str] = []
+    contact_forms: List[str] = []
+
+class ScriptInfo(BaseModel):
+    src: str
+    script_type: Optional[str] = None
+    is_external: bool = False
+
+class BusinessInfo(BaseModel):
+    company_name: Optional[str] = None
+    address: Optional[str] = None
+    description: Optional[str] = None
+    industry_keywords: List[str] = []
+
+class LeadData(BaseModel):
+    contact_info: ContactInfo
+    business_info: BusinessInfo
+    lead_score: int = 0
+    technologies: List[str] = []
+
+class ScrapeResponse(BaseModel):
+    body_content: Optional[str] = None
+    screenshot: Optional[str] = None
+    links: Optional[List[LinkInfo]] = None
+    scripts: Optional[List[ScriptInfo]] = None
+    page_title: Optional[str] = None
+    meta_description: Optional[str] = None
+    lead_data: Optional[LeadData] = None
+    source_url: Optional[str] = None
+
+@app.get("/")
+async def root():
+    return {
+        "message": "🚀 Query-Based Web Scraper API",
+        "tagline": "Search and scrape websites based on queries",
+        "endpoints": {
+            "/scrape": "Search Google for the query and scrape the top result",
+            "/docs": "API documentation"
+        },
+        "example": "/scrape?query=plumbers+near+me&lead_generation=true&screenshot=true",
+        "note": "Now accepts search queries instead of direct URLs"
+    }
+
+async def get_top_search_result(query: str):
+    """Perform Google search and return top result URL"""
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(
+            user_agent=user_agent,
+            locale='en-US',
+            viewport={'width': 1920, 'height': 1080}
+        )
+        page = await context.new_page()
+
+        try:
+            logger.info(f"Searching Google for: {query}")
+            await page.goto("https://www.google.com", timeout=60000)
+
+            # Handle consent form if it appears
+            try:
+                consent_button = await page.wait_for_selector('button:has-text("Accept all")', timeout=5000)
+                if consent_button:
+                    await consent_button.click()
+                    logger.info("Accepted Google consent form")
+                    await page.wait_for_load_state("networkidle")
+            except:
+                pass  # Consent form didn't appear
+
+            # Perform search
+            await page.fill('textarea[name="q"]', query)
+            await page.keyboard.press("Enter")
+            await page.wait_for_selector("#search", timeout=30000)
+
+            # Extract top results
+            results = await page.query_selector_all('.g')
+            if not results:
+                raise Exception("No search results found")
+
+            urls = []
+            for result in results[:3]:  # Check top 3 results
+                try:
+                    link = await result.query_selector('a')
+                    if not link:
+                        continue
+
+                    # Extract both data-href and href attributes
+                    data_href = await link.get_attribute('data-href')
+                    href = await link.get_attribute('href')
+                    target_url = data_href or href
+
+                    if target_url and target_url.startswith('/url?q='):
+                        target_url = f"https://www.google.com{target_url}"
+
+                    if target_url and target_url.startswith('https://www.google.com/url?'):
+                        parsed = urlparse(target_url)
+                        qs = parse_qs(parsed.query)
+                        target_url = qs.get('q', [target_url])[0]
+
+                    if target_url and target_url.startswith('http'):
+                        urls.append(target_url)
+                        logger.info(f"Found search result: {target_url}")
+                except Exception as e:
+                    logger.warning(f"Error processing result: {str(e)}")
+
+            if not urls:
+                raise Exception("No valid URLs found in search results")
+
+            await browser.close()
+            return urls[0]  # Return top result
+
+        except Exception as e:
+            logger.error(f"Search failed: {str(e)}")
+            await browser.close()
+            raise
+
+@app.get("/scrape")
+async def scrape_page(
+    query: str = Query(..., description="Search query to find a website"),
+    lead_generation: bool = Query(True, description="Extract lead generation data"),
+    screenshot: bool = Query(True, description="Take a full page screenshot"),
+    get_links: bool = Query(True, description="Extract all links from the page"),
+    get_body: bool = Query(False, description="Extract body tag content")
+):
+    logger.info(f"Starting scrape for query: {query}")
+
+    try:
+        # Get top search result URL
+        target_url = await get_top_search_result(query)
+        logger.info(f"Scraping top result: {target_url}")
+    except Exception as e:
+        logger.error(f"Search error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")
+
+    try:
+        async with async_playwright() as p:
+            logger.info("Launching browser...")
+            browser = await p.chromium.launch(
+                headless=True,
+                args=[
+                    '--no-sandbox',
+                    '--disable-setuid-sandbox',
+                    '--disable-dev-shm-usage',
+                    '--disable-accelerated-2d-canvas',
+                    '--no-first-run',
+                    '--no-zygote',
+                    '--disable-gpu'
+                ]
+            )
+            page = await browser.new_page()
+
+            try:
+                logger.info(f"Navigating to {target_url}...")
+                await page.goto(target_url, wait_until="networkidle")
+                response = ScrapeResponse(source_url=target_url)
+
+                # Always get page title and meta description
+                logger.info("Getting page metadata...")
+                response.page_title = await page.title()
+
+                meta_desc = await page.evaluate("""
+                    () => {
+                        const meta = document.querySelector('meta[name="description"]');
+                        return meta ? meta.getAttribute('content') : null;
+                    }
+                """)
+                response.meta_description = meta_desc
+
+                # Get body content (clean text)
+                if get_body:
+                    logger.info("Extracting body content...")
+                    body_content = await page.evaluate("""
+                        () => {
+                            const body = document.querySelector('body');
+                            if (!body) return null;
+
+                            // Remove script and style elements
+                            const scripts = body.querySelectorAll('script, style, noscript');
+                            scripts.forEach(el => el.remove());
+
+                            // Get clean text content
+                            return body.innerText.trim();
+                        }
+                    """)
+                    response.body_content = body_content
+
+                # Get screenshot (full page)
+                if screenshot:
+                    logger.info("Taking full page screenshot...")
+                    screenshot_bytes = await page.screenshot(full_page=True)
+                    response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')
+
+                # Get links with better filtering
+                if get_links:
+                    logger.info("Extracting links...")
+                    links = await page.evaluate("""
+                        () => {
+                            return Array.from(document.querySelectorAll('a[href]')).map(a => {
+                                const text = a.innerText.trim();
+                                const href = a.href;
+
+                                // Only include links with meaningful text and valid URLs
+                                if (text && href && href.startsWith('http')) {
+                                    return {
+                                        text: text.substring(0, 200), // Limit text length
+                                        href: href
+                                    }
+                                }
+                                return null;
+                            }).filter(link => link !== null);
+                        }
+                    """)
+                    response.links = [LinkInfo(**link) for link in links]
+
+                # Lead Generation Extraction
+                if lead_generation:
+                    logger.info("Extracting lead generation data...")
+                    lead_data_raw = await page.evaluate("""
+                        () => {
+                            const result = {
+                                emails: [],
+                                phones: [],
+                                social_media: [],
+                                contact_forms: [],
+                                company_name: null,
+                                address: null,
+                                technologies: [],
+                                industry_keywords: []
+                            };
+
+                            // Extract emails
+                            const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
+                            const pageText = document.body.innerText;
+                            const emails = pageText.match(emailRegex) || [];
+                            result.emails = [...new Set(emails)].slice(0, 10); // Unique emails, max 10
+
+                            // Extract phone numbers
+                            const phoneRegex = /(\+?1?[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})/g;
+                            const phones = pageText.match(phoneRegex) || [];
+                            result.phones = [...new Set(phones)].slice(0, 5); // Unique phones, max 5
+
+                            // Extract social media links
+                            const socialLinks = Array.from(document.querySelectorAll('a[href]')).map(a => a.href)
+                                .filter(href => /facebook|twitter|linkedin|instagram|youtube|tiktok/i.test(href));
+                            result.social_media = [...new Set(socialLinks)].slice(0, 10);
+
+                            // Find contact forms
+                            const forms = Array.from(document.querySelectorAll('form')).map(form => {
+                                const action = form.action || window.location.href;
+                                return action;
+                            });
+                            result.contact_forms = [...new Set(forms)].slice(0, 5);
+
+                            // Extract company name (try multiple methods)
+                            result.company_name =
+                                document.querySelector('meta[property="og:site_name"]')?.content ||
+                                document.querySelector('meta[name="application-name"]')?.content ||
+                                document.querySelector('h1')?.innerText?.trim() ||
+                                document.title?.split('|')[0]?.split('-')[0]?.trim();
+
+                            // Extract address
+                            const addressRegex = /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)\s*,?\s*[A-Za-z\s]+,?\s*[A-Z]{2}\s*\d{5}/g;
+                            const addresses = pageText.match(addressRegex) || [];
+                            result.address = addresses[0] || null;
+
+                            // Detect technologies
+                            const techKeywords = ['wordpress', 'shopify', 'react', 'angular', 'vue', 'bootstrap', 'jquery', 'google analytics', 'facebook pixel'];
+                            const htmlContent = document.documentElement.outerHTML.toLowerCase();
+                            result.technologies = techKeywords.filter(tech => htmlContent.includes(tech));
+
+                            // Industry keywords
+                            const industryKeywords = ['consulting', 'marketing', 'software', 'healthcare', 'finance', 'real estate', 'education', 'retail', 'manufacturing', 'legal', 'restaurant', 'fitness', 'beauty', 'automotive'];
+                            const lowerPageText = pageText.toLowerCase();
+                            result.industry_keywords = industryKeywords.filter(keyword => lowerPageText.includes(keyword));
+
+                            return result;
+                        }
+                    """)
+
+                    # Calculate lead score
+                    lead_score = 0
+                    if lead_data_raw['emails']: lead_score += 30
+                    if lead_data_raw['phones']: lead_score += 25
+                    if lead_data_raw['contact_forms']: lead_score += 20
+                    if lead_data_raw['social_media']: lead_score += 15
+                    if lead_data_raw['company_name']: lead_score += 10
+                    if lead_data_raw['address']: lead_score += 15
+                    if lead_data_raw['technologies']: lead_score += 10
+                    if lead_data_raw['industry_keywords']: lead_score += 5
+
+                    # Create lead data object
+                    contact_info = ContactInfo(
+                        emails=lead_data_raw['emails'],
+                        phones=lead_data_raw['phones'],
+                        social_media=lead_data_raw['social_media'],
+                        contact_forms=lead_data_raw['contact_forms']
+                    )
+
+                    business_info = BusinessInfo(
+                        company_name=lead_data_raw['company_name'],
+                        address=lead_data_raw['address'],
+                        description=response.meta_description,
+                        industry_keywords=lead_data_raw['industry_keywords']
+                    )
+
+                    response.lead_data = LeadData(
+                        contact_info=contact_info,
+                        business_info=business_info,
+                        lead_score=min(lead_score, 100),  # Cap at 100
+                        technologies=lead_data_raw['technologies']
+                    )
+
+                await browser.close()
+                logger.info("Scraping completed successfully")
+                return response
+
+            except Exception as e:
+                logger.error(f"Error during scraping: {str(e)}")
+                await browser.close()
+                raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")
+
+    except Exception as e:
+        logger.error(f"Error launching browser: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Browser launch error: {str(e)}")
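
For reference, the raw lead score in the rewritten test1.py can total 130 (30 + 25 + 20 + 15 + 10 + 15 + 10 + 5) before min() caps it at 100, so a page does not need every signal to reach a perfect score. A quick way to exercise the new endpoint once the app is served, e.g. via uvicorn test1:app; a sketch only, where the httpx client, host, and port are assumptions and not part of the commit:

import httpx

# /scrape runs the Google search first and then scrapes the top hit,
# so allow generous time given the raised Playwright timeouts.
resp = httpx.get(
    "http://localhost:8000/scrape",  # assumed local dev server
    params={"query": "plumbers near me", "screenshot": False, "get_body": False},
    timeout=300.0,
)
resp.raise_for_status()
data = resp.json()
print(data["source_url"], data["page_title"])
if data["lead_data"]:
    print("lead score:", data["lead_data"]["lead_score"])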