ahmednoorx committed
Commit 178c98f · verified · 1 parent: 9447233

fixed bugs

Files changed (1)
scraper.py: +283 -279
scraper.py CHANGED
@@ -1,279 +1,283 @@
-import requests
-from bs4 import BeautifulSoup
-import time
-import re
-from urllib.parse import urlparse, urljoin
-import sqlite3
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from webdriver_manager.chrome import ChromeDriverManager
-
-class LinkedInScraper:
-    def __init__(self, timeout=10, use_selenium=False):
-        self.timeout = timeout
-        self.use_selenium = use_selenium
-        self.session = requests.Session()
-        self.session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-        })
-
-        if self.use_selenium:
-            self._setup_selenium()
-
-    def _setup_selenium(self):
-        """Setup Selenium WebDriver"""
-        try:
-            chrome_options = Options()
-            chrome_options.add_argument('--headless')
-            chrome_options.add_argument('--no-sandbox')
-            chrome_options.add_argument('--disable-dev-shm-usage')
-            chrome_options.add_argument('--disable-gpu')
-            chrome_options.add_argument('--window-size=1920,1080')
-
-            self.driver = webdriver.Chrome(
-                ChromeDriverManager().install(),
-                options=chrome_options
-            )
-        except Exception as e:
-            print(f"Error setting up Selenium: {e}")
-            self.use_selenium = False
-
-    def _get_cached_data(self, url):
-        """Check if URL data is cached in database"""
-        try:
-            conn = sqlite3.connect('leads.db')
-            cursor = conn.cursor()
-
-            cursor.execute('''
-                CREATE TABLE IF NOT EXISTS scraped_cache (
-                    url TEXT PRIMARY KEY,
-                    content TEXT,
-                    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-                )
-            ''')
-
-            cursor.execute('SELECT content FROM scraped_cache WHERE url = ?', (url,))
-            result = cursor.fetchone()
-            conn.close()
-
-            return result[0] if result else None
-        except Exception as e:
-            print(f"Cache error: {e}")
-            return None
-
-    def _cache_data(self, url, content):
-        """Cache scraped data"""
-        try:
-            conn = sqlite3.connect('leads.db')
-            cursor = conn.cursor()
-
-            cursor.execute('''
-                INSERT OR REPLACE INTO scraped_cache (url, content)
-                VALUES (?, ?)
-            ''', (url, content))
-
-            conn.commit()
-            conn.close()
-        except Exception as e:
-            print(f"Cache save error: {e}")
-
-    def scrape_with_requests(self, url):
-        """Scrape URL using requests and BeautifulSoup"""
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            # Extract various content types
-            content_parts = []
-
-            # Try to get meta description
-            meta_desc = soup.find('meta', attrs={'name': 'description'})
-            if meta_desc:
-                content_parts.append(f"Description: {meta_desc.get('content', '')}")
-
-            # Try to get title
-            title = soup.find('title')
-            if title:
-                content_parts.append(f"Title: {title.get_text().strip()}")
-
-            # Try to get about section or main content
-            about_selectors = [
-                '.about-section',
-                '.company-description',
-                '.about-us',
-                '[class*="about"]',
-                '.description',
-                '.summary',
-                'main',
-                '.content'
-            ]
-
-            for selector in about_selectors:
-                elements = soup.select(selector)
-                for element in elements:
-                    text = element.get_text().strip()
-                    if len(text) > 50:  # Only meaningful content
-                        content_parts.append(text[:500])  # Limit length
-                        break
-                if content_parts:
-                    break
-
-            # If no specific content found, get paragraphs
-            if not content_parts:
-                paragraphs = soup.find_all('p')
-                for p in paragraphs[:3]:  # First 3 paragraphs
-                    text = p.get_text().strip()
-                    if len(text) > 30:
-                        content_parts.append(text[:300])
-
-            return ' | '.join(content_parts) if content_parts else "No content extracted"
-
-        except Exception as e:
-            return f"Error scraping {url}: {str(e)}"
-
-    def scrape_with_selenium(self, url):
-        """Scrape URL using Selenium"""
-        try:
-            self.driver.get(url)
-            WebDriverWait(self.driver, self.timeout).until(
-                EC.presence_of_element_located((By.TAG_NAME, "body"))
-            )
-
-            # Wait a bit for dynamic content
-            time.sleep(2)
-
-            content_parts = []
-
-            # Try different selectors for LinkedIn
-            linkedin_selectors = [
-                '[data-test-id="about-us-description"]',
-                '.company-about-us-description',
-                '.about-section',
-                '[class*="about"]'
-            ]
-
-            for selector in linkedin_selectors:
-                try:
-                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
-                    for element in elements:
-                        text = element.text.strip()
-                        if len(text) > 50:
-                            content_parts.append(text[:500])
-                            break
-                except:
-                    continue
-
-            # If no LinkedIn-specific content, try general selectors
-            if not content_parts:
-                general_selectors = ['main', '.content', 'article', '.description']
-                for selector in general_selectors:
-                    try:
-                        elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
-                        for element in elements:
-                            text = element.text.strip()
-                            if len(text) > 50:
-                                content_parts.append(text[:500])
-                                break
-                    except:
-                        continue
-
-            return ' | '.join(content_parts) if content_parts else "No content extracted"
-
-        except Exception as e:
-            return f"Error scraping {url} with Selenium: {str(e)}"
-
-    def scrape_linkedin_profile(self, linkedin_url):
-        """Scrape LinkedIn company profile"""
-        if not linkedin_url or not linkedin_url.strip():
-            return "No LinkedIn URL provided"
-
-        # Check cache first
-        cached_content = self._get_cached_data(linkedin_url)
-        if cached_content:
-            return cached_content
-
-        try:
-            # Clean URL
-            linkedin_url = linkedin_url.strip()
-            if not linkedin_url.startswith('http'):
-                linkedin_url = 'https://' + linkedin_url
-
-            # Use appropriate scraping method
-            if self.use_selenium:
-                content = self.scrape_with_selenium(linkedin_url)
-            else:
-                content = self.scrape_with_requests(linkedin_url)
-
-            # Cache the result
-            self._cache_data(linkedin_url, content)
-
-            return content
-
-        except Exception as e:
-            return f"Error accessing LinkedIn: {str(e)}"
-
-    def scrape_company_website(self, company_name):
-        """Scrape company website as fallback"""
-        try:
-            # Try to construct company website URL
-            company_clean = re.sub(r'[^\w\s-]', '', company_name.lower())
-            company_clean = re.sub(r'\s+', '', company_clean)
-
-            possible_urls = [
-                f"https://{company_clean}.com",
-                f"https://www.{company_clean}.com",
-                f"https://{company_clean}.org",
-                f"https://www.{company_clean}.org"
-            ]
-
-            for url in possible_urls:
-                cached_content = self._get_cached_data(url)
-                if cached_content:
-                    return cached_content
-
-                try:
-                    if self.use_selenium:
-                        content = self.scrape_with_selenium(url)
-                    else:
-                        content = self.scrape_with_requests(url)
-
-                    if "Error" not in content and len(content) > 50:
-                        self._cache_data(url, content)
-                        return content
-                except:
-                    continue
-
-            return f"Could not find website for {company_name}"
-
-        except Exception as e:
-            return f"Error finding company website: {str(e)}"
-
-    def scrape_linkedin_or_company(self, linkedin_url, company_name):
-        """Main method to scrape LinkedIn or fallback to company website"""
-        # First try LinkedIn
-        if linkedin_url and linkedin_url.strip():
-            linkedin_content = self.scrape_linkedin_profile(linkedin_url)
-            if "Error" not in linkedin_content and len(linkedin_content) > 50:
-                return f"LinkedIn: {linkedin_content}"
-
-        # Fallback to company website
-        company_content = self.scrape_company_website(company_name)
-        return f"Company Website: {company_content}"
-
-    def __del__(self):
-        """Clean up Selenium driver"""
-        if hasattr(self, 'driver'):
-            try:
-                self.driver.quit()
-            except:
-                pass
+import requests
+from bs4 import BeautifulSoup
+import time
+import re
+from urllib.parse import urlparse, urljoin
+import sqlite3
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+
+class LinkedInScraper:
+    def __init__(self, timeout=10, use_selenium=False):
+        self.timeout = timeout
+        self.use_selenium = use_selenium
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+        })
+
+        if self.use_selenium:
+            self._setup_selenium()
+
+    def _setup_selenium(self):
+        """Setup Selenium WebDriver"""
+        try:
+            chrome_options = Options()
+            chrome_options.add_argument('--headless')
+            chrome_options.add_argument('--no-sandbox')
+            chrome_options.add_argument('--disable-dev-shm-usage')
+            chrome_options.add_argument('--disable-gpu')
+            chrome_options.add_argument('--window-size=1920,1080')
+
+            self.driver = webdriver.Chrome(
+                ChromeDriverManager().install(),
+                options=chrome_options
+            )
+        except Exception as e:
+            print(f"Error setting up Selenium: {e}")
+            self.use_selenium = False
+
+    def _get_cached_data(self, url):
+        """Check if URL data is cached in database"""
+        try:
+            conn = sqlite3.connect('leads.db')
+            cursor = conn.cursor()
+
+            cursor.execute('''
+                CREATE TABLE IF NOT EXISTS scraped_cache (
+                    url TEXT PRIMARY KEY,
+                    content TEXT,
+                    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                )
+            ''')
+
+            cursor.execute('SELECT content FROM scraped_cache WHERE url = ?', (url,))
+            result = cursor.fetchone()
+            conn.close()
+
+            return result[0] if result else None
+        except Exception as e:
+            print(f"Cache error: {e}")
+            return None
+
+    def _cache_data(self, url, content):
+        """Cache scraped data"""
+        try:
+            conn = sqlite3.connect('leads.db')
+            cursor = conn.cursor()
+
+            cursor.execute('''
+                INSERT OR REPLACE INTO scraped_cache (url, content)
+                VALUES (?, ?)
+            ''', (url, content))
+
+            conn.commit()
+            conn.close()
+        except Exception as e:
+            print(f"Cache save error: {e}")
+
+    def scrape_with_requests(self, url):
+        """Scrape URL using requests and BeautifulSoup"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract various content types
+            content_parts = []
+
+            # Try to get meta description
+            meta_desc = soup.find('meta', attrs={'name': 'description'})
+            if meta_desc:
+                content_parts.append(f"Description: {meta_desc.get('content', '')}")
+
+            # Try to get title
+            title = soup.find('title')
+            if title:
+                content_parts.append(f"Title: {title.get_text().strip()}")
+
+            # Try to get about section or main content
+            about_selectors = [
+                '.about-section',
+                '.company-description',
+                '.about-us',
+                '[class*="about"]',
+                '.description',
+                '.summary',
+                'main',
+                '.content'
+            ]
+
+            for selector in about_selectors:
+                elements = soup.select(selector)
+                for element in elements:
+                    text = element.get_text().strip()
+                    if len(text) > 50:  # Only meaningful content
+                        content_parts.append(text[:500])  # Limit length
+                        break
+                if content_parts:
+                    break
+
+            # If no specific content found, get paragraphs
+            if not content_parts:
+                paragraphs = soup.find_all('p')
+                for p in paragraphs[:3]:  # First 3 paragraphs
+                    text = p.get_text().strip()
+                    if len(text) > 30:
+                        content_parts.append(text[:300])
+
+            return ' | '.join(content_parts) if content_parts else "No content extracted"
+
+        except Exception as e:
+            return f"Error scraping {url}: {str(e)}"
+
+    def scrape_with_selenium(self, url):
+        """Scrape URL using Selenium"""
+        try:
+            self.driver.get(url)
+            WebDriverWait(self.driver, self.timeout).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+
+            # Wait a bit for dynamic content
+            time.sleep(2)
+
+            content_parts = []
+
+            # Try different selectors for LinkedIn
+            linkedin_selectors = [
+                '[data-test-id="about-us-description"]',
+                '.company-about-us-description',
+                '.about-section',
+                '[class*="about"]'
+            ]
+
+            for selector in linkedin_selectors:
+                try:
+                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
+                    for element in elements:
+                        text = element.text.strip()
+                        if len(text) > 50:
+                            content_parts.append(text[:500])
+                            break
+                except:
+                    continue
+
+            # If no LinkedIn-specific content, try general selectors
+            if not content_parts:
+                general_selectors = ['main', '.content', 'article', '.description']
+                for selector in general_selectors:
+                    try:
+                        elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
+                        for element in elements:
+                            text = element.text.strip()
+                            if len(text) > 50:
+                                content_parts.append(text[:500])
+                                break
+                    except:
+                        continue
+
+            return ' | '.join(content_parts) if content_parts else "No content extracted"
+
+        except Exception as e:
+            return f"Error scraping {url} with Selenium: {str(e)}"
+
+    def scrape_linkedin_profile(self, linkedin_url):
+        """Scrape LinkedIn company profile"""
+        if not linkedin_url or not linkedin_url.strip():
+            return "No LinkedIn URL provided"
+
+        # Check cache first
+        cached_content = self._get_cached_data(linkedin_url)
+        if cached_content:
+            return cached_content
+
+        try:
+            # Clean URL
+            linkedin_url = linkedin_url.strip()
+            if not linkedin_url.startswith('http'):
+                linkedin_url = 'https://' + linkedin_url
+
+            # Use appropriate scraping method
+            if self.use_selenium:
+                content = self.scrape_with_selenium(linkedin_url)
+            else:
+                content = self.scrape_with_requests(linkedin_url)
+
+            # Cache the result
+            self._cache_data(linkedin_url, content)
+
+            return content
+
+        except Exception as e:
+            return f"Error accessing LinkedIn: {str(e)}"
+
+    def scrape_linkedin_company(self, linkedin_url):
+        """Alias for scrape_linkedin_profile - for compatibility"""
+        return self.scrape_linkedin_profile(linkedin_url)
+
+    def scrape_company_website(self, company_name):
+        """Scrape company website as fallback"""
+        try:
+            # Try to construct company website URL
+            company_clean = re.sub(r'[^\w\s-]', '', company_name.lower())
+            company_clean = re.sub(r'\s+', '', company_clean)
+
+            possible_urls = [
+                f"https://{company_clean}.com",
+                f"https://www.{company_clean}.com",
+                f"https://{company_clean}.org",
+                f"https://www.{company_clean}.org"
+            ]
+
+            for url in possible_urls:
+                cached_content = self._get_cached_data(url)
+                if cached_content:
+                    return cached_content
+
+                try:
+                    if self.use_selenium:
+                        content = self.scrape_with_selenium(url)
+                    else:
+                        content = self.scrape_with_requests(url)
+
+                    if "Error" not in content and len(content) > 50:
+                        self._cache_data(url, content)
+                        return content
+                except:
+                    continue
+
+            return f"Could not find website for {company_name}"
+
+        except Exception as e:
+            return f"Error finding company website: {str(e)}"
+
+    def scrape_linkedin_or_company(self, linkedin_url, company_name):
+        """Main method to scrape LinkedIn or fallback to company website"""
+        # First try LinkedIn
+        if linkedin_url and linkedin_url.strip():
+            linkedin_content = self.scrape_linkedin_profile(linkedin_url)
+            if "Error" not in linkedin_content and len(linkedin_content) > 50:
+                return f"LinkedIn: {linkedin_content}"
+
+        # Fallback to company website
+        company_content = self.scrape_company_website(company_name)
+        return f"Company Website: {company_content}"
+
+    def __del__(self):
+        """Clean up Selenium driver"""
+        if hasattr(self, 'driver'):
+            try:
+                self.driver.quit()
+            except:
+                pass
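
Below is a minimal usage sketch for the committed class (not part of the diff). It assumes scraper.py is importable from the working directory and that the selenium and webdriver-manager packages are installed, since scraper.py imports them unconditionally; the LinkedIn URL and company name are placeholder values.

    from scraper import LinkedInScraper

    # Requests-only mode: no Chrome/ChromeDriver needed at runtime.
    scraper = LinkedInScraper(timeout=10, use_selenium=False)

    # scrape_linkedin_company() is the alias added by this commit; it simply
    # delegates to scrape_linkedin_profile().
    print(scraper.scrape_linkedin_company("https://www.linkedin.com/company/example-co"))

    # Combined entry point: tries LinkedIn first, then falls back to guessed
    # company-website URLs; results are cached in leads.db in the working directory.
    print(scraper.scrape_linkedin_or_company(
        "https://www.linkedin.com/company/example-co",
        "Example Co",
    ))

One caveat if use_selenium=True: webdriver.Chrome(ChromeDriverManager().install(), options=...) passes the driver path positionally, which older Selenium releases accept but recent Selenium 4 releases have removed; there the path would need to go through selenium.webdriver.chrome.service.Service. As written, _setup_selenium() catches the resulting error and silently falls back to requests-based scraping.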