Spaces:
Running
Running
Update app2.py
Browse files
app2.py
CHANGED
@@ -18,6 +18,7 @@ import gradio as gr
|
|
18 |
from bs4 import BeautifulSoup
|
19 |
from fake_useragent import UserAgent
|
20 |
from cleantext import clean
|
|
|
21 |
|
22 |
# Setup logging
|
23 |
logging.basicConfig(
|
@@ -33,22 +34,10 @@ logger = logging.getLogger(__name__)
|
|
33 |
# Ensure output directories exist
|
34 |
Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
|
35 |
|
36 |
-
# At the top of the file, remove these imports:
|
37 |
-
# from config import Config
|
38 |
-
# from proxy_handler import ProxyHandler
|
39 |
-
# from robots_handler import RobotsHandler
|
40 |
-
|
41 |
class URLProcessor:
|
42 |
def __init__(self):
|
43 |
self.session = requests.Session()
|
44 |
-
self.timeout = 10
|
45 |
-
self.max_retries = 3
|
46 |
-
self.request_delay = 1.0
|
47 |
-
self.respect_robots = True
|
48 |
-
self.use_proxy = False
|
49 |
-
self.proxy_url = None
|
50 |
-
|
51 |
-
# Update session headers
|
52 |
self.session.headers.update({
|
53 |
'User-Agent': UserAgent().random,
|
54 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
@@ -57,55 +46,6 @@ class URLProcessor:
|
|
57 |
'Connection': 'keep-alive',
|
58 |
'Upgrade-Insecure-Requests': '1'
|
59 |
})
|
60 |
-
|
61 |
-
if self.use_proxy and self.proxy_url:
|
62 |
-
self.session.proxies = {
|
63 |
-
'http': self.proxy_url,
|
64 |
-
'https': self.proxy_url
|
65 |
-
}
|
66 |
-
|
67 |
-
def check_robots_txt(self, url: str) -> bool:
|
68 |
-
"""Check if URL is allowed by robots.txt"""
|
69 |
-
if not self.respect_robots:
|
70 |
-
return True
|
71 |
-
|
72 |
-
try:
|
73 |
-
from urllib.parse import urlparse
|
74 |
-
from urllib.robotparser import RobotFileParser
|
75 |
-
|
76 |
-
parsed_url = urlparse(url)
|
77 |
-
robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
|
78 |
-
|
79 |
-
rp = RobotFileParser()
|
80 |
-
rp.set_url(robots_url)
|
81 |
-
rp.read()
|
82 |
-
|
83 |
-
return rp.can_fetch(self.session.headers['User-Agent'], url)
|
84 |
-
except Exception as e:
|
85 |
-
logger.warning(f"Error checking robots.txt: {e}")
|
86 |
-
return True
|
87 |
-
|
88 |
-
def fetch_content(self, url: str) -> Optional[Dict]:
|
89 |
-
"""Fetch content with built-in rate limiting and robots.txt checking"""
|
90 |
-
if not self.check_robots_txt(url):
|
91 |
-
logger.warning(f"URL {url} is disallowed by robots.txt")
|
92 |
-
return None
|
93 |
-
|
94 |
-
time.sleep(self.request_delay) # Basic rate limiting
|
95 |
-
|
96 |
-
for attempt in range(self.max_retries):
|
97 |
-
try:
|
98 |
-
if 'drive.google.com' in url:
|
99 |
-
return self._handle_google_drive(url)
|
100 |
-
if 'calendar.google.com' in url:
|
101 |
-
return self._handle_google_calendar(url)
|
102 |
-
return self._fetch_html_content(url)
|
103 |
-
except Exception as e:
|
104 |
-
logger.error(f"Attempt {attempt + 1} failed: {e}")
|
105 |
-
if attempt < self.max_retries - 1:
|
106 |
-
time.sleep(self.request_delay * (attempt + 1))
|
107 |
-
|
108 |
-
return None
|
109 |
|
110 |
def advanced_text_cleaning(self, text: str) -> str:
|
111 |
"""Robust text cleaning with version compatibility"""
|
@@ -127,9 +67,9 @@ class URLProcessor:
|
|
127 |
return cleaned_text
|
128 |
except Exception as e:
|
129 |
logger.warning(f"Text cleaning error: {e}. Using fallback method.")
|
130 |
-
text = re.sub(r'[\x00
|
131 |
-
text = text.encode('ascii', 'ignore').decode('ascii')
|
132 |
-
text = re.sub(r'\s+', ' ', text)
|
133 |
return text.strip()
|
134 |
|
135 |
def validate_url(self, url: str) -> Dict:
|
@@ -226,7 +166,7 @@ class URLProcessor:
|
|
226 |
class FileProcessor:
|
227 |
"""Class to handle file processing"""
|
228 |
|
229 |
-
def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):
|
230 |
self.max_file_size = max_file_size
|
231 |
self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
|
232 |
|
@@ -235,7 +175,7 @@ class FileProcessor:
|
|
235 |
try:
|
236 |
mime_type, _ = mimetypes.guess_type(filepath)
|
237 |
return (mime_type and mime_type.startswith('text/')) or \
|
238 |
-
(
|
239 |
except Exception:
|
240 |
return False
|
241 |
|
@@ -280,7 +220,7 @@ class FileProcessor:
|
|
280 |
"source": "file",
|
281 |
"filename": filename,
|
282 |
"content": content,
|
283 |
-
"timestamp": datetime.now
|
284 |
})
|
285 |
except Exception as e:
|
286 |
logger.error(f"Error reading file {filename}: {str(e)}")
|
@@ -319,6 +259,7 @@ class FileProcessor:
|
|
319 |
logger.error(f"File processing error: {e}")
|
320 |
return []
|
321 |
|
|
|
322 |
def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
|
323 |
"""Clean and validate JSON data"""
|
324 |
try:
|
@@ -335,6 +276,7 @@ class FileProcessor:
|
|
335 |
logger.error(f"Unexpected error while cleaning JSON: {e}")
|
336 |
return None
|
337 |
|
|
|
338 |
def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]:
|
339 |
"""Generate QR code(s) from data"""
|
340 |
try:
|
@@ -342,7 +284,7 @@ class FileProcessor:
|
|
342 |
output_dir.mkdir(parents=True, exist_ok=True)
|
343 |
|
344 |
if combined:
|
345 |
-
cleaned_data = clean_json(data)
|
346 |
if cleaned_data:
|
347 |
qr = qrcode.QRCode(
|
348 |
version=None,
|
@@ -359,10 +301,10 @@ class FileProcessor:
|
|
359 |
img.save(str(output_path))
|
360 |
return [str(output_path)]
|
361 |
else:
|
|
|
362 |
if isinstance(data, list):
|
363 |
-
paths = []
|
364 |
for idx, item in enumerate(data):
|
365 |
-
cleaned_item = clean_json(item)
|
366 |
if cleaned_item:
|
367 |
qr = qrcode.QRCode(
|
368 |
version=None,
|
@@ -378,9 +320,8 @@ class FileProcessor:
|
|
378 |
output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
|
379 |
img.save(str(output_path))
|
380 |
paths.append(str(output_path))
|
381 |
-
return paths
|
382 |
else:
|
383 |
-
cleaned_item = clean_json(data)
|
384 |
if cleaned_item:
|
385 |
qr = qrcode.QRCode(
|
386 |
version=None,
|
@@ -395,71 +336,32 @@ class FileProcessor:
|
|
395 |
img = qr.make_image(fill_color="black", back_color="white")
|
396 |
output_path = output_dir / f'single_qr_{int(time.time())}.png'
|
397 |
img.save(str(output_path))
|
398 |
-
|
399 |
-
|
400 |
-
return []
|
401 |
except Exception as e:
|
402 |
logger.error(f"QR generation error: {e}")
|
403 |
return []
|
404 |
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
logger.warning("No QR code found in image")
|
415 |
-
return None
|
416 |
-
except Exception as e:
|
417 |
-
logger.error(f"QR decoding error: {e}")
|
418 |
-
return None
|
419 |
-
|
420 |
-
# Replace the existing decode_qr function with this one
|
421 |
-
def decode_qr(image) -> List[str]:
|
422 |
-
"""Decode all QR codes found in an image"""
|
423 |
-
try:
|
424 |
-
# Convert to PIL Image if needed
|
425 |
-
if not isinstance(image, Image.Image):
|
426 |
-
image = Image.fromarray(image)
|
427 |
-
|
428 |
-
# Convert to grayscale for better detection
|
429 |
-
image = image.convert('L')
|
430 |
-
|
431 |
-
# Decode QR codes
|
432 |
-
decoded_objects = decode(image)
|
433 |
-
results = []
|
434 |
-
|
435 |
-
for obj in decoded_objects:
|
436 |
-
try:
|
437 |
-
decoded_text = obj.data.decode('utf-8')
|
438 |
-
results.append(decoded_text)
|
439 |
-
except UnicodeDecodeError:
|
440 |
-
logger.warning("Failed to decode QR code data as UTF-8")
|
441 |
-
continue
|
442 |
-
|
443 |
-
return results
|
444 |
-
except Exception as e:
|
445 |
-
logger.error(f"QR decoding error: {e}")
|
446 |
-
return []
|
447 |
-
|
448 |
-
raise ValueError("Unable to decode QR code")
|
449 |
-
except Exception as e:
|
450 |
-
logger.error(f"QR decoding error: {e}")
|
451 |
-
return None, None # Return None for both data and resolution in case of error
|
452 |
|
453 |
def datachat_trained(data_input: str, query: str) -> str:
|
454 |
"""Handle trained data interaction logic"""
|
455 |
-
data = clean_json(data_input)
|
456 |
if not data:
|
457 |
return "Invalid JSON data provided."
|
458 |
return f"[Trained Mode]\nData: {json.dumps(data, indent=2)}\nQuery: {query}"
|
459 |
|
460 |
def datachat_simple(data_input: str, query: str) -> str:
|
461 |
"""Handle simple chat interaction logic"""
|
462 |
-
data = clean_json(data_input)
|
463 |
if not data:
|
464 |
return "Invalid JSON data provided."
|
465 |
return f"[Chat Mode]\nData: {json.dumps(data, indent=2)}\nQuestion: {query}"
|
@@ -467,15 +369,14 @@ def datachat_simple(data_input: str, query: str) -> str:
|
|
467 |
def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
|
468 |
"""Interface for DataChat functionality"""
|
469 |
data = None
|
470 |
-
resolution = None # Initialize resolution variable
|
471 |
if data_source == "JSON Input":
|
472 |
data = json_input
|
473 |
elif data_source == "QR Code":
|
474 |
-
|
475 |
-
|
476 |
-
data = decoded_data
|
477 |
-
|
478 |
-
return
|
479 |
else:
|
480 |
return "No valid data source selected."
|
481 |
|
@@ -504,34 +405,37 @@ def create_interface():
|
|
504 |
json_input = gr.Textbox(lines=8, label="JSON Data")
|
505 |
qr_image = gr.Image(label="QR Code Image", type="filepath")
|
506 |
query = gr.Textbox(label="Query")
|
507 |
-
|
508 |
submit_btn = gr.Button("Submit")
|
509 |
output = gr.Textbox(label="Response")
|
510 |
-
|
511 |
submit_btn.click(datachat_interface, [mode, data_source, json_input, qr_image, query], output)
|
512 |
|
513 |
with gr.Tab("QR Generator"):
|
514 |
qr_input = gr.Textbox(lines=8, label="Input JSON for QR")
|
515 |
generate_btn = gr.Button("Generate QR")
|
516 |
qr_output = gr.Image(label="Generated QR Code")
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
525 |
|
526 |
return interface
|
527 |
|
528 |
def main():
|
529 |
mimetypes.init()
|
530 |
Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
|
531 |
-
|
532 |
-
|
533 |
-
iface.launch()
|
534 |
-
|
535 |
|
536 |
if __name__ == "__main__":
|
537 |
-
main()
|
|
|
18 |
from bs4 import BeautifulSoup
|
19 |
from fake_useragent import UserAgent
|
20 |
from cleantext import clean
|
21 |
+
import qrcode # Added missing import
|
22 |
|
23 |
# Setup logging
|
24 |
logging.basicConfig(
|
|
|
34 |
# Ensure output directories exist
|
35 |
Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
|
36 |
|
|
|
|
|
|
|
|
|
|
|
37 |
class URLProcessor:
|
38 |
def __init__(self):
|
39 |
self.session = requests.Session()
|
40 |
+
self.timeout = 10 # seconds
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
self.session.headers.update({
|
42 |
'User-Agent': UserAgent().random,
|
43 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
|
46 |
'Connection': 'keep-alive',
|
47 |
'Upgrade-Insecure-Requests': '1'
|
48 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
def advanced_text_cleaning(self, text: str) -> str:
|
51 |
"""Robust text cleaning with version compatibility"""
|
|
|
67 |
return cleaned_text
|
68 |
except Exception as e:
|
69 |
logger.warning(f"Text cleaning error: {e}. Using fallback method.")
|
70 |
+
text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text) # Fixed regex
|
71 |
+
text = text.encode('ascii', 'ignore').decode('ascii')
|
72 |
+
text = re.sub(r'\s+', ' ', text)
|
73 |
return text.strip()
|
74 |
|
75 |
def validate_url(self, url: str) -> Dict:
|
|
|
166 |
class FileProcessor:
|
167 |
"""Class to handle file processing"""
|
168 |
|
169 |
+
def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):
|
170 |
self.max_file_size = max_file_size
|
171 |
self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
|
172 |
|
|
|
175 |
try:
|
176 |
mime_type, _ = mimetypes.guess_type(filepath)
|
177 |
return (mime_type and mime_type.startswith('text/')) or \
|
178 |
+
(Path(filepath).suffix.lower() in self.supported_text_extensions)
|
179 |
except Exception:
|
180 |
return False
|
181 |
|
|
|
220 |
"source": "file",
|
221 |
"filename": filename,
|
222 |
"content": content,
|
223 |
+
"timestamp": datetime.now().isoformat()
|
224 |
})
|
225 |
except Exception as e:
|
226 |
logger.error(f"Error reading file {filename}: {str(e)}")
|
|
|
259 |
logger.error(f"File processing error: {e}")
|
260 |
return []
|
261 |
|
262 |
+
@staticmethod
|
263 |
def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
|
264 |
"""Clean and validate JSON data"""
|
265 |
try:
|
|
|
276 |
logger.error(f"Unexpected error while cleaning JSON: {e}")
|
277 |
return None
|
278 |
|
279 |
+
@staticmethod
|
280 |
def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]:
|
281 |
"""Generate QR code(s) from data"""
|
282 |
try:
|
|
|
284 |
output_dir.mkdir(parents=True, exist_ok=True)
|
285 |
|
286 |
if combined:
|
287 |
+
cleaned_data = FileProcessor.clean_json(data)
|
288 |
if cleaned_data:
|
289 |
qr = qrcode.QRCode(
|
290 |
version=None,
|
|
|
301 |
img.save(str(output_path))
|
302 |
return [str(output_path)]
|
303 |
else:
|
304 |
+
paths = []
|
305 |
if isinstance(data, list):
|
|
|
306 |
for idx, item in enumerate(data):
|
307 |
+
cleaned_item = FileProcessor.clean_json(item)
|
308 |
if cleaned_item:
|
309 |
qr = qrcode.QRCode(
|
310 |
version=None,
|
|
|
320 |
output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
|
321 |
img.save(str(output_path))
|
322 |
paths.append(str(output_path))
|
|
|
323 |
else:
|
324 |
+
cleaned_item = FileProcessor.clean_json(data)
|
325 |
if cleaned_item:
|
326 |
qr = qrcode.QRCode(
|
327 |
version=None,
|
|
|
336 |
img = qr.make_image(fill_color="black", back_color="white")
|
337 |
output_path = output_dir / f'single_qr_{int(time.time())}.png'
|
338 |
img.save(str(output_path))
|
339 |
+
paths.append(str(output_path))
|
340 |
+
return paths
|
|
|
341 |
except Exception as e:
|
342 |
logger.error(f"QR generation error: {e}")
|
343 |
return []
|
344 |
|
345 |
+
def decode_qr(image_path: str) -> List[str]:
|
346 |
+
"""Decode QR code from image file"""
|
347 |
+
try:
|
348 |
+
image = Image.open(image_path)
|
349 |
+
decoded_objects = decode(image)
|
350 |
+
return [obj.data.decode('utf-8') for obj in decoded_objects]
|
351 |
+
except Exception as e:
|
352 |
+
logger.error(f"QR decoding error: {e}")
|
353 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
|
355 |
def datachat_trained(data_input: str, query: str) -> str:
|
356 |
"""Handle trained data interaction logic"""
|
357 |
+
data = FileProcessor.clean_json(data_input)
|
358 |
if not data:
|
359 |
return "Invalid JSON data provided."
|
360 |
return f"[Trained Mode]\nData: {json.dumps(data, indent=2)}\nQuery: {query}"
|
361 |
|
362 |
def datachat_simple(data_input: str, query: str) -> str:
|
363 |
"""Handle simple chat interaction logic"""
|
364 |
+
data = FileProcessor.clean_json(data_input)
|
365 |
if not data:
|
366 |
return "Invalid JSON data provided."
|
367 |
return f"[Chat Mode]\nData: {json.dumps(data, indent=2)}\nQuestion: {query}"
|
|
|
369 |
def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
|
370 |
"""Interface for DataChat functionality"""
|
371 |
data = None
|
|
|
372 |
if data_source == "JSON Input":
|
373 |
data = json_input
|
374 |
elif data_source == "QR Code":
|
375 |
+
decoded_data = decode_qr(qr_image)
|
376 |
+
if decoded_data:
|
377 |
+
data = decoded_data[0]
|
378 |
+
else:
|
379 |
+
return "Invalid QR code data provided"
|
380 |
else:
|
381 |
return "No valid data source selected."
|
382 |
|
|
|
405 |
json_input = gr.Textbox(lines=8, label="JSON Data")
|
406 |
qr_image = gr.Image(label="QR Code Image", type="filepath")
|
407 |
query = gr.Textbox(label="Query")
|
|
|
408 |
submit_btn = gr.Button("Submit")
|
409 |
output = gr.Textbox(label="Response")
|
|
|
410 |
submit_btn.click(datachat_interface, [mode, data_source, json_input, qr_image, query], output)
|
411 |
|
412 |
with gr.Tab("QR Generator"):
|
413 |
qr_input = gr.Textbox(lines=8, label="Input JSON for QR")
|
414 |
generate_btn = gr.Button("Generate QR")
|
415 |
qr_output = gr.Image(label="Generated QR Code")
|
416 |
+
generate_btn.click(
|
417 |
+
lambda x: FileProcessor.generate_qr_code(x)[0] if x else None,
|
418 |
+
inputs=qr_input,
|
419 |
+
outputs=qr_output
|
420 |
+
)
|
421 |
+
|
422 |
+
with gr.Tab("QR Decoder"):
|
423 |
+
qr_upload = gr.Image(label="Upload QR Code", type="filepath")
|
424 |
+
decode_btn = gr.Button("Decode QR")
|
425 |
+
decoded_output = gr.Textbox(label="Decoded Data")
|
426 |
+
decode_btn.click(
|
427 |
+
lambda x: "\n".join(decode_qr(x)),
|
428 |
+
inputs=qr_upload,
|
429 |
+
outputs=decoded_output
|
430 |
+
)
|
431 |
|
432 |
return interface
|
433 |
|
434 |
def main():
|
435 |
mimetypes.init()
|
436 |
Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
|
437 |
+
interface = create_interface()
|
438 |
+
interface.launch()
|
|
|
|
|
439 |
|
440 |
if __name__ == "__main__":
|
441 |
+
main()
|