ahmednoorx committed on
Commit
bd0ddb1
·
verified ·
1 Parent(s): baf4fd0

Add scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +279 -0
scraper.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import time
4
+ import re
5
+ from urllib.parse import urlparse, urljoin
6
+ import sqlite3
7
+ from selenium import webdriver
8
+ from selenium.webdriver.chrome.options import Options
9
+ from selenium.webdriver.common.by import By
10
+ from selenium.webdriver.support.ui import WebDriverWait
11
+ from selenium.webdriver.support import expected_conditions as EC
12
+ from webdriver_manager.chrome import ChromeDriverManager
13
+
14
class LinkedInScraper:
    """Scrape company descriptions from LinkedIn pages or company websites.

    Fetched page content is cached in a local SQLite database so repeated
    lookups for the same URL are served without a network round trip.
    Pages are fetched either with ``requests`` + BeautifulSoup (default)
    or, when ``use_selenium`` is set, with a headless Chrome driver for
    JavaScript-rendered content.

    All public methods return strings and never raise: failures come back
    as human-readable "Error ..." messages.
    """

    def __init__(self, timeout=10, use_selenium=False, db_path='leads.db'):
        """
        Args:
            timeout: Per-request / page-load timeout in seconds.
            use_selenium: Drive a headless Chrome browser instead of plain
                HTTP requests when True.
            db_path: Path of the SQLite file backing the scrape cache
                (default matches the previously hard-coded 'leads.db').
        """
        self.timeout = timeout
        self.use_selenium = use_selenium
        self.db_path = db_path
        self.session = requests.Session()
        # Browser-like headers reduce the chance of bot-blocking responses.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        if self.use_selenium:
            self._setup_selenium()

    def _setup_selenium(self):
        """Set up a headless Chrome WebDriver; fall back to requests on failure."""
        try:
            # Selenium 4 removed the positional executable_path argument;
            # the driver binary path must be wrapped in a Service object.
            # Imported locally so the class still loads on setups that
            # never enable Selenium.
            from selenium.webdriver.chrome.service import Service

            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--window-size=1920,1080')

            self.driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options,
            )
        except Exception as e:
            print(f"Error setting up Selenium: {e}")
            # Degrade gracefully: subsequent scrapes use plain requests.
            self.use_selenium = False

    def _get_cached_data(self, url):
        """Return cached content for *url*, or None on a miss or any DB error."""
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()
                # Create the cache table lazily on first use.
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS scraped_cache (
                        url TEXT PRIMARY KEY,
                        content TEXT,
                        scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                ''')
                cursor.execute('SELECT content FROM scraped_cache WHERE url = ?', (url,))
                result = cursor.fetchone()
                return result[0] if result else None
            finally:
                # Always release the connection, even if a query fails.
                conn.close()
        except Exception as e:
            print(f"Cache error: {e}")
            return None

    def _cache_data(self, url, content):
        """Store *content* for *url* in the cache (best effort; errors are logged).

        Relies on _get_cached_data having created the table — every scrape
        path checks the cache before writing to it.
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR REPLACE INTO scraped_cache (url, content)
                    VALUES (?, ?)
                ''', (url, content))
                conn.commit()
            finally:
                conn.close()
        except Exception as e:
            print(f"Cache save error: {e}")

    def scrape_with_requests(self, url):
        """Fetch *url* with requests and extract descriptive text.

        Returns a ' | '-joined string built from the meta description, the
        page title, and the first meaningful "about"/content block found
        (falling back to the first paragraphs), or an error string.
        Never raises.
        """
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            content_parts = []

            # Meta description, if present.
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc:
                content_parts.append(f"Description: {meta_desc.get('content', '')}")

            # Page title, if present.
            title = soup.find('title')
            if title:
                content_parts.append(f"Title: {title.get_text().strip()}")

            # Likely "about"/main-content containers, most specific first.
            about_selectors = [
                '.about-section',
                '.company-description',
                '.about-us',
                '[class*="about"]',
                '.description',
                '.summary',
                'main',
                '.content'
            ]

            # Track how many parts existed before the body scan: the old
            # check `if content_parts: break` aborted the selector loop
            # after the first selector whenever meta/title were already
            # appended, even when that selector matched nothing.
            baseline = len(content_parts)
            for selector in about_selectors:
                for element in soup.select(selector):
                    text = element.get_text().strip()
                    if len(text) > 50:  # Only meaningful content
                        content_parts.append(text[:500])  # Limit length
                        break
                if len(content_parts) > baseline:
                    break  # one body block is enough

            # Fall back to the first few paragraphs if nothing matched.
            if not content_parts:
                paragraphs = soup.find_all('p')
                for p in paragraphs[:3]:  # First 3 paragraphs
                    text = p.get_text().strip()
                    if len(text) > 30:
                        content_parts.append(text[:300])

            return ' | '.join(content_parts) if content_parts else "No content extracted"

        except Exception as e:
            return f"Error scraping {url}: {str(e)}"

    def scrape_with_selenium(self, url):
        """Fetch *url* in the headless browser and extract descriptive text.

        Tries LinkedIn-specific selectors first, then generic content
        containers. Returns a ' | '-joined string or an error string.
        Never raises.
        """
        try:
            self.driver.get(url)
            WebDriverWait(self.driver, self.timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Give client-side rendering a moment to finish.
            time.sleep(2)

            content_parts = []

            # LinkedIn-specific containers first.
            linkedin_selectors = [
                '[data-test-id="about-us-description"]',
                '.company-about-us-description',
                '.about-section',
                '[class*="about"]'
            ]

            for selector in linkedin_selectors:
                try:
                    for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                        text = element.text.strip()
                        if len(text) > 50:
                            content_parts.append(text[:500])
                            break
                except Exception:
                    continue
                if content_parts:
                    # Stop after the first selector that yielded content
                    # (mirrors scrape_with_requests instead of collecting
                    # duplicates from every later selector).
                    break

            # Generic fallbacks for non-LinkedIn pages.
            if not content_parts:
                general_selectors = ['main', '.content', 'article', '.description']
                for selector in general_selectors:
                    try:
                        for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                            text = element.text.strip()
                            if len(text) > 50:
                                content_parts.append(text[:500])
                                break
                    except Exception:
                        continue
                    if content_parts:
                        break

            return ' | '.join(content_parts) if content_parts else "No content extracted"

        except Exception as e:
            return f"Error scraping {url} with Selenium: {str(e)}"

    def scrape_linkedin_profile(self, linkedin_url):
        """Scrape a LinkedIn company profile URL, using the cache when possible.

        Returns the extracted content or a human-readable error string.
        """
        if not linkedin_url or not linkedin_url.strip():
            return "No LinkedIn URL provided"

        # Normalize BEFORE the cache lookup so the lookup key matches the
        # key used when storing. (Previously the raw URL was looked up but
        # the normalized URL was stored, so scheme-less input always missed
        # the cache.)
        linkedin_url = linkedin_url.strip()
        if not linkedin_url.startswith('http'):
            linkedin_url = 'https://' + linkedin_url

        cached_content = self._get_cached_data(linkedin_url)
        if cached_content:
            return cached_content

        try:
            # Use the appropriate scraping backend.
            if self.use_selenium:
                content = self.scrape_with_selenium(linkedin_url)
            else:
                content = self.scrape_with_requests(linkedin_url)

            # Cache the result for next time.
            self._cache_data(linkedin_url, content)

            return content

        except Exception as e:
            return f"Error accessing LinkedIn: {str(e)}"

    def scrape_company_website(self, company_name):
        """Guess the company's website from its name and scrape it.

        Tries <name>.com/.org variants (with and without www), serving and
        populating the cache on the way. Returns content or an error string.
        """
        try:
            # "Acme, Inc." -> "acmeinc": drop punctuation, then whitespace.
            company_clean = re.sub(r'[^\w\s-]', '', company_name.lower())
            company_clean = re.sub(r'\s+', '', company_clean)

            possible_urls = [
                f"https://{company_clean}.com",
                f"https://www.{company_clean}.com",
                f"https://{company_clean}.org",
                f"https://www.{company_clean}.org"
            ]

            for url in possible_urls:
                cached_content = self._get_cached_data(url)
                if cached_content:
                    return cached_content

                try:
                    if self.use_selenium:
                        content = self.scrape_with_selenium(url)
                    else:
                        content = self.scrape_with_requests(url)

                    # NOTE(review): substring test on "Error" is brittle (a
                    # page legitimately containing the word is rejected) but
                    # it is the error convention used by the scrape_* methods
                    # above, so it is kept for compatibility.
                    if "Error" not in content and len(content) > 50:
                        self._cache_data(url, content)
                        return content
                except Exception:
                    continue

            return f"Could not find website for {company_name}"

        except Exception as e:
            return f"Error finding company website: {str(e)}"

    def scrape_linkedin_or_company(self, linkedin_url, company_name):
        """Scrape LinkedIn first; fall back to the company website.

        Returns the content prefixed with its source ("LinkedIn: " or
        "Company Website: ").
        """
        # First try LinkedIn.
        if linkedin_url and linkedin_url.strip():
            linkedin_content = self.scrape_linkedin_profile(linkedin_url)
            if "Error" not in linkedin_content and len(linkedin_content) > 50:
                return f"LinkedIn: {linkedin_content}"

        # Fallback to the guessed company website.
        company_content = self.scrape_company_website(company_name)
        return f"Company Website: {company_content}"

    def __del__(self):
        """Best-effort cleanup of the Selenium driver at garbage collection."""
        # getattr: the attribute never exists when Selenium setup failed
        # or was disabled, and __del__ must never raise.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass