Update app2.py

app2.py (CHANGED)

@@ -1,11 +1,10 @@
 import json
 import os
 import re
-import time
 import logging
 import mimetypes
 from selenium import webdriver
-from chromedriver_py import binary_path
+from chromedriver_py import binary_path
 import concurrent.futures
 import string
 import zipfile
@@ -17,7 +16,6 @@ from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
-from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
@@ -27,9 +25,6 @@ import nest_asyncio
 nest_asyncio.apply()
 import aiohttp
 
-svc = webdriver.ChromeService(executable_path=binary_path)
-driver = webdriver.Chrome(service=svc)
-
 # Setup logging
 logging.basicConfig(
     level=logging.INFO,
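The two removed lines above created a module-level Chrome driver the moment app2.py was imported, which launches a browser even when no request ever needs Selenium. A minimal sketch of a lazier alternative, assuming chromedriver_py stays available (the context-manager helper below is illustrative, not part of the app):

```python
from contextlib import contextmanager

from selenium import webdriver
from chromedriver_py import binary_path


@contextmanager
def chrome_driver():
    """Create a headless Chrome driver on demand and always quit it."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    service = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        yield driver
    finally:
        driver.quit()
```

Callers would then write `with chrome_driver() as driver: driver.get(url)` only in the code paths that actually need a real browser.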
@@ -49,7 +44,7 @@ class URLProcessor:
         self.session = requests.Session()
         self.timeout = 10  # seconds
         self.session.headers.update({
-            'User-Agent': UserAgent().random,
+            'User -Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
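The added header line appears with a stray space in the rendered diff ('User -Agent'); the intended header name is presumably 'User-Agent'. It is set once from fake_useragent when the session is created. A small illustrative sketch (not the app's code) that instead picks a new value on every request:

```python
import requests
from fake_useragent import UserAgent

ua = UserAgent()
session = requests.Session()


def get_with_fresh_ua(url: str, timeout: int = 10) -> requests.Response:
    """Fetch a URL, choosing a new random User-Agent for each request."""
    session.headers["User-Agent"] = ua.random
    return session.get(url, timeout=timeout)
```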
@@ -77,7 +72,7 @@ class URLProcessor:
             return cleaned_text
         except Exception as e:
             logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text) # Remove control characters
+            text = re.sub(r'[\x00 -\x1F\x7F-\x9F]', '', text) # Remove control characters
             text = text.encode('ascii', 'ignore').decode('ascii') # Remove non-ASCII characters
             text = re.sub(r'\s+', ' ', text) # Normalize whitespace
             return text.strip()
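As above, the added regex seems to have picked up a stray space ('[\x00 -\x1F...'); the character class presumably should read [\x00-\x1F\x7F-\x9F]. The fallback path strips control characters, drops non-ASCII bytes, and collapses whitespace; a standalone sketch of that behaviour:

```python
import re


def fallback_clean(text: str) -> str:
    """Fallback cleaning: drop control chars and non-ASCII, collapse whitespace."""
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)        # remove control characters
    text = text.encode('ascii', 'ignore').decode('ascii')   # remove non-ASCII characters
    text = re.sub(r'\s+', ' ', text)                        # normalize whitespace
    return text.strip()


print(fallback_clean("Héllo\x00   world\t!"))  # -> "Hllo world!"
```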
@@ -97,15 +92,10 @@ class URLProcessor:
     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
-            # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
-
-            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
-
-            # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
@@ -153,12 +143,9 @@ class URLProcessor:
             response.raise_for_status()
 
             soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
 
-            # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
 
             if main_content is None:
@@ -169,7 +156,6 @@ class URLProcessor:
                 'timestamp': datetime.now().isoformat()
             }
 
-            # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
 
@@ -206,7 +192,7 @@ class FileProcessor:
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
-
+            if file_size > self.max_file_size:
                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
 
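The added line gives the warning and early return below it a condition to hang on. A self-contained sketch of the same guard, with an assumed 10 MB limit standing in for FileProcessor's real max_file_size attribute:

```python
import logging
import os

logger = logging.getLogger(__name__)

MAX_FILE_SIZE = 10 * 1024 * 1024  # assumed 10 MB cap; the real limit lives on FileProcessor


def within_size_limit(path: str) -> bool:
    """Return False (and log a warning) when a file exceeds the allowed size."""
    file_size = os.path.getsize(path)
    if file_size > MAX_FILE_SIZE:
        logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
        return False
    return True
```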
@@ -250,21 +236,17 @@ class FileProcessor:
         try:
             file_stat = os.stat(file.name)
 
-            # For very large files, read in chunks and summarize
             if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
                 logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
 
-                # Read first and last 1MB for extremely large files
                 content = ""
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                     content = f.read(1 * 1024 * 1024)  # First 1MB
                     content += "\n...[Content truncated due to large file size]...\n"
 
-                    # Seek to the last 1MB
                     f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
                     content += f.read()  # Last 1MB
             else:
-                # Regular file processing
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                     content = f.read()
 
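For context, the unchanged block above keeps only the first and last megabyte of files larger than 100 MB. A minimal standalone sketch of that head-and-tail strategy (the helper name and threshold default are illustrative):

```python
import os

CHUNK = 1 * 1024 * 1024  # 1 MB


def read_head_and_tail(path: str, threshold: int = 100 * 1024 * 1024) -> str:
    """Read a text file fully, or only its first and last 1 MB when it is very large."""
    size = os.path.getsize(path)
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        if size <= threshold:
            return f.read()
        head = f.read(CHUNK)              # first 1 MB
        f.seek(max(0, size - CHUNK))      # jump to roughly 1 MB before the end
        tail = f.read()                   # last ~1 MB
    return head + "\n...[Content truncated due to large file size]...\n" + tail
```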
@@ -285,13 +267,10 @@ class FileProcessor:
     def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
         """Clean and validate JSON data"""
         try:
-            # If it's a string, try to parse it
             if isinstance(data, str):
-                # Remove any existing content and extra whitespace
                 data = data.strip()
                 data = json.loads(data)
 
-            # Convert to string and back to ensure proper JSON format
             cleaned = json.loads(json.dumps(data))
             return cleaned
         except json.JSONDecodeError as e:
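clean_json parses string input and then round-trips the result through json.dumps/json.loads so that only JSON-serializable data comes back. A compact sketch of the same idea (TypeError is caught here as well, which the original's except clause does not do):

```python
import json
from typing import Dict, Optional, Union


def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
    """Parse JSON strings and round-trip the result to validate serializability."""
    try:
        if isinstance(data, str):
            data = json.loads(data.strip())
        return json.loads(json.dumps(data))  # fails if anything is not JSON-serializable
    except (json.JSONDecodeError, TypeError):
        return None


print(clean_json('{"a": 1}'))   # {'a': 1}
print(clean_json('not json'))   # None
```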
@@ -308,7 +287,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
     output_dir.mkdir(parents=True, exist_ok=True)
 
     if combined:
-        # Generate single QR code for all data
         cleaned_data = clean_json(data)
         if cleaned_data:
             qr = qrcode.QRCode(
@@ -326,7 +304,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
             img.save(str(output_path))
             return [str(output_path)]
     else:
-        # Generate separate QR codes for each item
         if isinstance(data, list):
             paths = []
             for idx, item in enumerate(data):
@@ -339,7 +316,8 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
                     border=4,
                 )
                 json_str = json.dumps(cleaned_item, ensure_ascii=False)
-                qr.add_data(
+                qr.add_data(json
+                _str)
                 qr.make(fit=True)
 
                 img = qr.make_image(fill_color="black", back_color="white")
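The two added lines above are split mid-identifier in the rendered diff (the extraction also shows a stray code-fence fragment there); the intended statement is presumably the single call qr.add_data(json_str). A small sketch of the per-item QR generation the surrounding loop performs, using the qrcode package:

```python
import json

import qrcode


def qr_for_item(item: dict, path: str) -> str:
    """Encode one JSON-serializable item as a QR code image and save it."""
    qr = qrcode.QRCode(version=None, box_size=10, border=4)
    json_str = json.dumps(item, ensure_ascii=False)
    qr.add_data(json_str)   # presumably the intended single-line call
    qr.make(fit=True)
    img = qr.make_image(fill_color="black", back_color="white")
    img.save(path)
    return path


qr_for_item({"url": "https://example.com"}, "item_0.png")
```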
@@ -348,7 +326,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
                 paths.append(str(output_path))
             return paths
         else:
-            # Single item, not combined
             cleaned_item = clean_json(data)
             if cleaned_item:
                 qr = qrcode.QRCode(
@@ -459,10 +436,8 @@ def create_interface():
         try:
             results = []
 
-            # Process text input first (since it's direct JSON)
             if text and text.strip():
                 try:
-                    # Try to parse as JSON
                     json_data = json.loads(text)
                     if isinstance(json_data, list):
                         results.extend(json_data)
@@ -471,7 +446,6 @@ def create_interface():
                 except json.JSONDecodeError as e:
                     return None, [], f"❌ Invalid JSON format: {str(e)}"
 
-            # Process URLs if provided
             if urls and urls.strip():
                 processor = URLProcessor()
                 url_list = re.split(r'[,\n]', urls)
@@ -489,14 +463,12 @@ def create_interface():
                         'timestamp': datetime.now().isoformat()
                     })
 
-            # Process files if provided
             if file:
                 file_processor = FileProcessor()
                 file_results = file_processor.process_file(file)
                 if file_results:
                     results.extend(file_results)
 
-            # Generate QR codes
             if results:
                 qr_paths = generate_qr_code(results, combined=combine)
                 if qr_paths:
@@ -514,7 +486,6 @@ def create_interface():
             logger.error(f"Processing error: {e}")
             return None, [], f"❌ Error: {str(e)}"
 
-        # Set up event handlers
         example_btn.click(load_example, outputs=[text_input])
         clear_btn.click(clear_input, outputs=[text_input])
         process_btn.click(
@@ -542,16 +513,9 @@ def create_interface():
     return interface
 
 def main():
-    # Configure system settings
     mimetypes.init()
-
-    # Create output directories
     Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
-
-    # Create and launch interface
     interface = create_interface()
-
-    # Launch with proper configuration
     interface.launch(
         server_name="0.0.0.0",
         server_port=8000,
@@ -562,6 +526,4 @@ def main():
     )
 
 if __name__ == "__main__":
-    main()
-    app.interface
-
+    main()
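main() binds Gradio to all interfaces on port 8000. If the hosting environment dictates the port instead, a small variation on the same launch call would be (the PORT variable here is hypothetical):

```python
import os

port = int(os.environ.get("PORT", "8000"))  # hypothetical override, falls back to 8000
interface = create_interface()
interface.launch(server_name="0.0.0.0", server_port=port)
```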