acecalisto3 committed
Commit 6bdfb9b · verified · 1 Parent(s): df3f48b

Update app.py

Files changed (1)
  1. app.py +88 -48
app.py CHANGED
@@ -73,15 +73,28 @@ class URLProcessor:
             if not validators.url(url):
                 return {'is_valid': False, 'message': 'Invalid URL format'}

-            response = self.session.head(url, timeout=self.timeout)
-            response.raise_for_status()
+            # Some sites block HEAD requests but allow GET
+            try:
+                response = self.session.head(url, timeout=self.timeout)
+                response.raise_for_status()
+            except (requests.exceptions.RequestException, Exception) as e:
+                logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
+                # Try with GET request if HEAD fails
+                response = self.session.get(url, timeout=self.timeout, stream=True)
+                response.raise_for_status()
+                # Close the connection to avoid downloading the entire content
+                response.close()
+
             return {'is_valid': True, 'message': 'URL is valid and accessible'}
         except Exception as e:
+            logger.error(f"URL validation failed for {url}: {str(e)}")
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
+            logger.info(f"Fetching content from: {url}")
+
             # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
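For reference, a minimal standalone sketch of the HEAD-then-GET fallback introduced above, assuming a plain requests.Session and a fixed timeout; the function name check_url is illustrative and not part of app.py.

import requests
import validators

def check_url(url: str, timeout: int = 10) -> dict:
    # Reject malformed URLs before making any network request.
    if not validators.url(url):
        return {'is_valid': False, 'message': 'Invalid URL format'}
    session = requests.Session()
    try:
        try:
            # Some servers reject HEAD; fall back to a streamed GET.
            response = session.head(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            response = session.get(url, timeout=timeout, stream=True)
            response.raise_for_status()
            response.close()  # avoid downloading the whole body
        return {'is_valid': True, 'message': 'URL is valid and accessible'}
    except Exception as e:
        return {'is_valid': False, 'message': f'URL validation failed: {e}'}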
@@ -91,52 +104,37 @@ class URLProcessor:
                 return self._handle_google_calendar(url)

             # Standard HTML processing
-            return self._fetch_html_content(url)
-        except Exception as e:
-            logger.error(f"Content fetch failed: {e}")
-            return None
-
-    def _handle_google_drive(self, url: str) -> Optional[Dict]:
-        """Process Google Drive file links"""
-        try:
-            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if not file_id:
-                logger.error(f"Invalid Google Drive URL: {url}")
-                return None
-
-            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-            response = self.session.get(direct_url, timeout=self.timeout)
-            response.raise_for_status()
-
-            return {
-                'content': response.text,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"Google Drive processing failed: {e}")
-            return None
+            result = self._fetch_html_content(url)

-    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
-        """Process Google Calendar ICS feeds"""
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-            return {
-                'content': response.text,
-                'content_type': 'text/calendar',
-                'timestamp': datetime.now().isoformat()
-            }
+            # Log the result status
+            if result:
+                logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
+            else:
+                logger.error(f"Failed to extract content from {url}")
+
+            return result
         except Exception as e:
-            logger.error(f"Calendar fetch failed: {e}")
+            logger.error(f"Content fetch failed for {url}: {e}")
             return None

     def _fetch_html_content(self, url: str) -> Optional[Dict]:
         """Standard HTML content processing"""
         try:
+            # Try with a different user agent if it's a social media site
+            if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
+                self.session.headers.update({
+                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
+                })
+
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
-
+
+            logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
+
+            # Save the raw HTML for debugging if needed
+            with open(f"debug_raw_{int(time.time())}.html", "w", encoding="utf-8") as f:
+                f.write(response.text)
+
             soup = BeautifulSoup(response.text, 'html.parser')

             # Remove unwanted elements
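As a side note, a small sketch of the per-domain User-Agent override that the new _fetch_html_content applies before requesting social-media pages; fetch_html and the module-level constants are illustrative names, and the UA string is the one used in the diff.

import requests

SOCIAL_DOMAINS = ('facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co')
DESKTOP_UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36')

def fetch_html(session: requests.Session, url: str, timeout: int = 10) -> str:
    # Present a desktop browser UA to sites that tend to block default clients.
    if any(domain in url for domain in SOCIAL_DOMAINS):
        session.headers.update({'User-Agent': DESKTOP_UA})
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text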
@@ -154,12 +152,42 @@ class URLProcessor:
             elif 'instagram.com' in url or 'twitter.com' in url or 'x.com' in url:
                 for element in soup.select('[role="presentation"], [role="banner"], [role="complementary"]'):
                     element.decompose()
-
-            # Extract content using a general approach
-            # First try to find main content containers
-            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=lambda c: c and any(term in c for term in ['content', 'main', 'body', 'post', 'feed', 'timeline']))
+            elif 'huggingface.co' in url:
+                # Special handling for Hugging Face
+                logger.info("Applying special handling for Hugging Face")
+                # Try to find the main content
+                hf_selectors = ['.prose', '.space-content', '.model-description', '.dataset-description', 'article', '.markdown']
+                for selector in hf_selectors:
+                    elements = soup.select(selector)
+                    if elements:
+                        logger.info(f"Found Hugging Face content with selector: {selector}")
+                        break
+
+            # Extract content using a general approach - try multiple strategies
+            # Strategy 1: Look for semantic HTML5 elements
+            main_content = None
+            for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
+                elements = soup.select(selector)
+                if elements:
+                    main_content = elements[0]
+                    logger.info(f"Found content with selector: {selector}")
+                    break
+
+            # Strategy 2: If no semantic elements, try common class names
+            if not main_content or not main_content.get_text(strip=True):
+                for div in soup.find_all('div'):
+                    class_name = div.get('class', [])
+                    id_name = div.get('id', '')
+                    if any(term in ' '.join(class_name).lower() for term in ['content', 'main', 'body', 'article', 'post']):
+                        main_content = div
+                        logger.info(f"Found content with div class: {class_name}")
+                        break
+                    if any(term in id_name.lower() for term in ['content', 'main', 'body', 'article', 'post']):
+                        main_content = div
+                        logger.info(f"Found content with div id: {id_name}")
+                        break

-            # If no specific container found, fall back to body
+            # Strategy 3: Fall back to body
             if not main_content or not main_content.get_text(strip=True):
                 logger.info(f"No main content container found for {url}, using body")
                 main_content = soup.body if soup.body else soup
@@ -167,17 +195,29 @@ class URLProcessor:
             # Extract text with proper spacing
             text_content = main_content.get_text(separator='\n', strip=True)

-            # If content is too short, try a more aggressive approach to get all visible text
+            # Strategy 4: If content is too short, extract all visible text
             if len(text_content) < 100:
-                logger.info(f"Content too short for {url}, using all visible text")
+                logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
                 visible_text = []
                 for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
                     if element.get_text(strip=True):
                         visible_text.append(element.get_text(strip=True))
                 text_content = '\n'.join(visible_text)
+
+            # Strategy 5: Last resort - get all text from the page
+            if len(text_content) < 50:
+                logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
+                text_content = soup.get_text(separator='\n', strip=True)

             # Clean and structure content
             cleaned_content = self.advanced_text_cleaning(text_content)
+
+            logger.info(f"Final content length: {len(cleaned_content)} chars")
+
+            # If we still have no content, this is a failure
+            if len(cleaned_content) < 20:
+                logger.error(f"Failed to extract meaningful content from {url}")
+                return None

             return {
                 'content': cleaned_content,
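A condensed sketch of the extraction cascade built up in the two hunks above (semantic containers, then common class names, then the body, then all visible text, then the whole page); extract_main_text is an illustrative name and the selector list is trimmed, but the 100- and 50-character thresholds mirror the diff.

from bs4 import BeautifulSoup

def extract_main_text(html: str) -> str:
    soup = BeautifulSoup(html, 'html.parser')
    # Strategy 1: semantic containers and common content classes.
    main = None
    for selector in ['main', 'article', 'section', '.content', '.main', '.post']:
        found = soup.select(selector)
        if found:
            main = found[0]
            break
    # Strategy 3 equivalent: fall back to <body> (or the whole document).
    if not main or not main.get_text(strip=True):
        main = soup.body or soup
    text = main.get_text(separator='\n', strip=True)
    # Strategy 4: too little text, so collect every visible text-bearing element.
    if len(text) < 100:
        parts = [el.get_text(strip=True)
                 for el in soup.find_all(['p', 'h1', 'h2', 'h3', 'li', 'span', 'div'])
                 if el.get_text(strip=True)]
        text = '\n'.join(parts)
    # Strategy 5: last resort, all text on the page.
    if len(text) < 50:
        text = soup.get_text(separator='\n', strip=True)
    return text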
@@ -355,7 +395,7 @@ def generate_qr(json_data):
         qr.add_data(json_data)
         qr.make(fit=True)

-        img = qr.make_image(fill_color="black", back_color="white")
+        img = qrcode.make_image(fill_color="black", back_color="white")
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
         img.save(temp_file.name)
         return temp_file.name
@@ -373,7 +413,7 @@ def generate_qr(json_data):
         qr.add_data("Error: Data too large for QR code")
         qr.make(fit=True)

-        img = qr.make_image(fill_color="black", back_color="white")
+        img = qrcode.make_image(fill_color="black", back_color="white")
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
         img.save(temp_file.name)
         return temp_file.name
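For comparison, a minimal sketch of writing QR data to a temporary PNG with the qrcode library, in which make_image() is called on the QRCode instance; qr_to_tempfile and the box_size/border values are illustrative.

import tempfile
import qrcode

def qr_to_tempfile(data: str) -> str:
    # Build the QR code, letting the library pick the smallest fitting version.
    qr = qrcode.QRCode(box_size=10, border=4)
    qr.add_data(data)
    qr.make(fit=True)
    # make_image() belongs to the QRCode instance and returns a PIL-backed image.
    img = qr.make_image(fill_color="black", back_color="white")
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    img.save(tmp.name)
    return tmp.name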
 