Update app.py
app.py
CHANGED
@@ -73,15 +73,28 @@ class URLProcessor:
         if not validators.url(url):
             return {'is_valid': False, 'message': 'Invalid URL format'}

+            # Some sites block HEAD requests but allow GET
+            try:
+                response = self.session.head(url, timeout=self.timeout)
+                response.raise_for_status()
+            except (requests.exceptions.RequestException, Exception) as e:
+                logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
+                # Try with GET request if HEAD fails
+                response = self.session.get(url, timeout=self.timeout, stream=True)
+                response.raise_for_status()
+                # Close the connection to avoid downloading the entire content
+                response.close()
+
             return {'is_valid': True, 'message': 'URL is valid and accessible'}
         except Exception as e:
+            logger.error(f"URL validation failed for {url}: {str(e)}")
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
+            logger.info(f"Fetching content from: {url}")
+
             # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
@@ -91,52 +104,37 @@ class URLProcessor:
                 return self._handle_google_calendar(url)

             # Standard HTML processing
-        except Exception as e:
-            logger.error(f"Content fetch failed: {e}")
-            return None
-
-    def _handle_google_drive(self, url: str) -> Optional[Dict]:
-        """Process Google Drive file links"""
-        try:
-            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if not file_id:
-                logger.error(f"Invalid Google Drive URL: {url}")
-                return None
-
-            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-            response = self.session.get(direct_url, timeout=self.timeout)
-            response.raise_for_status()
-
-            return {
-                'content': response.text,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"Google Drive processing failed: {e}")
-            return None
+            result = self._fetch_html_content(url)

-                'content_type': 'text/calendar',
-                'timestamp': datetime.now().isoformat()
-            }
+            # Log the result status
+            if result:
+                logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
+            else:
+                logger.error(f"Failed to extract content from {url}")
+
+            return result
         except Exception as e:
-            logger.error(f"
+            logger.error(f"Content fetch failed for {url}: {e}")
             return None

     def _fetch_html_content(self, url: str) -> Optional[Dict]:
         """Standard HTML content processing"""
         try:
+            # Try with a different user agent if it's a social media site
+            if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
+                self.session.headers.update({
+                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
+                })
+
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
-
+
+            logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
+
+            # Save the raw HTML for debugging if needed
+            with open(f"debug_raw_{int(time.time())}.html", "w", encoding="utf-8") as f:
+                f.write(response.text)
+
             soup = BeautifulSoup(response.text, 'html.parser')

             # Remove unwanted elements
@@ -154,12 +152,42 @@ class URLProcessor:
             elif 'instagram.com' in url or 'twitter.com' in url or 'x.com' in url:
                 for element in soup.select('[role="presentation"], [role="banner"], [role="complementary"]'):
                     element.decompose()
+            elif 'huggingface.co' in url:
+                # Special handling for Hugging Face
+                logger.info("Applying special handling for Hugging Face")
+                # Try to find the main content
+                hf_selectors = ['.prose', '.space-content', '.model-description', '.dataset-description', 'article', '.markdown']
+                for selector in hf_selectors:
+                    elements = soup.select(selector)
+                    if elements:
+                        logger.info(f"Found Hugging Face content with selector: {selector}")
+                        break
+
+            # Extract content using a general approach - try multiple strategies
+            # Strategy 1: Look for semantic HTML5 elements
+            main_content = None
+            for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
+                elements = soup.select(selector)
+                if elements:
+                    main_content = elements[0]
+                    logger.info(f"Found content with selector: {selector}")
+                    break
+
+            # Strategy 2: If no semantic elements, try common class names
+            if not main_content or not main_content.get_text(strip=True):
+                for div in soup.find_all('div'):
+                    class_name = div.get('class', [])
+                    id_name = div.get('id', '')
+                    if any(term in ' '.join(class_name).lower() for term in ['content', 'main', 'body', 'article', 'post']):
+                        main_content = div
+                        logger.info(f"Found content with div class: {class_name}")
+                        break
+                    if any(term in id_name.lower() for term in ['content', 'main', 'body', 'article', 'post']):
+                        main_content = div
+                        logger.info(f"Found content with div id: {id_name}")
+                        break

+            # Strategy 3: Fall back to body
             if not main_content or not main_content.get_text(strip=True):
                 logger.info(f"No main content container found for {url}, using body")
                 main_content = soup.body if soup.body else soup
@@ -167,17 +195,29 @@ class URLProcessor:
             # Extract text with proper spacing
             text_content = main_content.get_text(separator='\n', strip=True)

-            # If content is too short,
+            # Strategy 4: If content is too short, extract all visible text
             if len(text_content) < 100:
-                logger.info(f"Content too short for {url}, using all visible text")
+                logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
                 visible_text = []
                 for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
                     if element.get_text(strip=True):
                         visible_text.append(element.get_text(strip=True))
                 text_content = '\n'.join(visible_text)
+
+            # Strategy 5: Last resort - get all text from the page
+            if len(text_content) < 50:
+                logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
+                text_content = soup.get_text(separator='\n', strip=True)

             # Clean and structure content
             cleaned_content = self.advanced_text_cleaning(text_content)
+
+            logger.info(f"Final content length: {len(cleaned_content)} chars")
+
+            # If we still have no content, this is a failure
+            if len(cleaned_content) < 20:
+                logger.error(f"Failed to extract meaningful content from {url}")
+                return None

             return {
                 'content': cleaned_content,
@@ -355,7 +395,7 @@ def generate_qr(json_data):
         qr.add_data(json_data)
         qr.make(fit=True)

-        img =
+        img = qr.make_image(fill_color="black", back_color="white")
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
         img.save(temp_file.name)
         return temp_file.name
@@ -373,7 +413,7 @@ def generate_qr(json_data):
         qr.add_data("Error: Data too large for QR code")
         qr.make(fit=True)

-        img =
+        img = qr.make_image(fill_color="black", back_color="white")
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
         img.save(temp_file.name)
         return temp_file.name
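For context, a standalone sketch of the HEAD-then-GET validation pattern this commit introduces in the URL check, assuming the requests and validators packages already used in app.py; the check_url name, the bare Session, and the example URL are illustrative and not part of the commit:

import logging

import requests
import validators

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def check_url(url: str, timeout: int = 10) -> dict:
    """Validate a URL, falling back to GET when the server rejects HEAD."""
    if not validators.url(url):
        return {'is_valid': False, 'message': 'Invalid URL format'}
    session = requests.Session()
    try:
        try:
            # Cheap check first: HEAD avoids downloading the body
            response = session.head(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
            # stream=True defers the body; close() drops the connection early
            response = session.get(url, timeout=timeout, stream=True)
            response.raise_for_status()
            response.close()
        return {'is_valid': True, 'message': 'URL is valid and accessible'}
    except Exception as e:
        return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}


if __name__ == "__main__":
    print(check_url("https://huggingface.co"))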