Spaces:
Running
Running
Update app2.py
Browse files
app2.py
CHANGED
@@ -457,7 +457,7 @@ def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: s
|
|
457 |
data = json_input
|
458 |
elif data_source == "QR Code":
|
459 |
try:
|
460 |
-
decoded_data = decode_qr_code(qr_image)
|
461 |
data = decoded_data
|
462 |
if not data:
|
463 |
return "No QR code found in the provided image."
|
@@ -485,6 +485,231 @@ def create_interface():
|
|
485 |
with gr.Blocks(css=css, title="Advanced Data Processor & QR Code Generator") as interface:
|
486 |
gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
|
487 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
488 |
with gr.Tab("DataChat"):
|
489 |
mode = gr.Radio(["Trained with Data", "Chat about Data"], label="Mode")
|
490 |
data_source = gr.Radio(["JSON Input", "QR Code"], label="Data Source")
|
@@ -497,15 +722,16 @@ def create_interface():
|
|
497 |
|
498 |
submit_btn.click(datachat_interface, [mode, data_source, json_input, qr_image, query], output)
|
499 |
|
|
|
500 |
with gr.Tab("QR Generator"):
|
501 |
qr_input = gr.Textbox(lines=8, label="Input JSON for QR")
|
502 |
generate_btn = gr.Button("Generate QR")
|
503 |
qr_output = gr.Image(label="Generated QR Code")
|
504 |
|
505 |
def generate_qr(json_data):
|
506 |
-
data = clean_json(json_data)
|
507 |
if data:
|
508 |
-
return generate_qr_code(data)
|
509 |
return None
|
510 |
|
511 |
generate_btn.click(generate_qr, qr_input, qr_output)
|
|
|
457 |
data = json_input
|
458 |
elif data_source == "QR Code":
|
459 |
try:
|
460 |
+
decoded_data = decode_qr_code(qr_image)
|
461 |
data = decoded_data
|
462 |
if not data:
|
463 |
return "No QR code found in the provided image."
|
|
|
485 |
with gr.Blocks(css=css, title="Advanced Data Processor & QR Code Generator") as interface:
|
486 |
gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
|
487 |
|
488 |
+
# URL Extraction Tab
|
489 |
+
with gr.Tab("URL Extraction"):
|
490 |
+
url_input = gr.Textbox(label="URL to Process", placeholder="https://example.com")
|
491 |
+
depth_slider = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Crawl Depth (Higher values may affect performance)")
|
492 |
+
respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
|
493 |
+
extract_btn = gr.Button("Extract Content")
|
494 |
+
url_output = gr.JSON(label="Extracted Data")
|
495 |
+
download_btn = gr.Button("Download Results as ZIP")
|
496 |
+
download_output = gr.File(label="Download")
|
497 |
+
|
498 |
+
# Warning about depth
|
499 |
+
gr.Markdown("""
|
500 |
+
<div class="warning">
|
501 |
+
⚠️ <strong>Warning:</strong> Higher depth values (>2) may significantly increase processing time and resource usage.
|
502 |
+
</div>
|
503 |
+
""")
|
504 |
+
|
505 |
+
# URL processor instance
|
506 |
+
url_processor = URLProcessor()
|
507 |
+
|
508 |
+
def process_url(url, depth, respect_robots):
    """Crawl ``url`` breadth-first and collect the content of each fetched page.

    Args:
        url: Starting URL; checked with ``url_processor.validate_url`` first.
        depth: Maximum link depth to follow. The start page is depth 0 and
            links are only extracted from pages whose depth is below this.
        respect_robots: Value stored on the shared ``url_processor`` before
            the crawl starts.

    Returns:
        A list of ``{"url", "content", "content_type", "timestamp"}`` dicts,
        one per page for which ``fetch_content`` returned something truthy,
        or ``{"error": message}`` if validation fails or an exception occurs.
    """
    # Function-scope imports: hoisted out of the link loop where the original
    # re-executed the import statement for every relative href.
    from collections import deque
    from urllib.parse import urldefrag, urljoin

    url_processor.respect_robots = respect_robots
    results = []
    try:
        # Validate URL
        validation = url_processor.validate_url(url)
        if not validation['is_valid']:
            return {"error": validation['message']}

        # FIFO queue of (url, depth); deque gives O(1) pops from the left.
        urls_to_process = deque([(url, 0)])
        # Every URL ever queued, so each page is fetched at most once.
        queued_urls = {url}

        while urls_to_process:
            current_url, current_depth = urls_to_process.popleft()
            content = url_processor.fetch_content(current_url)
            if not content:
                continue

            page_body = content.get('content', '')
            results.append({
                "url": current_url,
                "content": page_body,
                "content_type": content.get('content_type', ''),
                "timestamp": datetime.now().isoformat()
            })

            # Pages at the maximum depth are recorded but not expanded.
            if current_depth >= depth:
                continue

            soup = BeautifulSoup(page_body, 'html.parser')
            for link in soup.find_all('a', href=True):
                # Resolve every kind of relative href ("/x", "x.html", "../x",
                # "//host/x") against the page it was found on, and drop the
                # fragment so "#section" links do not refetch the same page.
                next_url = urldefrag(urljoin(current_url, link['href']))[0]
                if next_url in queued_urls:
                    continue
                if validators.url(next_url):
                    queued_urls.add(next_url)
                    urls_to_process.append((next_url, current_depth + 1))

        return results
    except Exception as e:
        logger.error(f"URL processing error: {e}")
        return {"error": str(e)}
|
557 |
+
|
558 |
+
def create_download_zip(results):
    """Write crawl results to a temporary ZIP archive and return its path.

    The archive contains ``extracted_data.json`` (the whole ``results`` value
    as indented JSON) plus one ``content_<index>_<unix time>.txt`` entry for
    every dict item that has a ``"content"`` key.

    Args:
        results: The value produced by the extraction step; normally a list
            of dicts, or an ``{"error": ...}`` dict.

    Returns:
        Path of the created ``.zip`` file, or ``None`` when ``results`` is
        empty, is an error dict, or the archive could not be written.
    """
    import os

    if not results or (isinstance(results, dict) and 'error' in results):
        return None

    zip_path = None
    try:
        # Reserve a unique path and close the handle immediately: reopening a
        # still-open NamedTemporaryFile by name is not possible on Windows.
        with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
            zip_path = tmp.name

        # One timestamp for the whole archive keeps entry names consistent.
        stamp = int(time.time())

        with zipfile.ZipFile(zip_path, 'w') as zipf:
            # Add JSON data
            zipf.writestr('extracted_data.json', json.dumps(results, indent=2))

            # Add individual text files for each URL
            for idx, item in enumerate(results):
                if not isinstance(item, dict) or 'content' not in item:
                    continue
                content = item['content']
                if not isinstance(content, (str, bytes)):
                    # writestr only accepts str/bytes; serialise anything else
                    # instead of failing the whole archive.
                    content = json.dumps(content, indent=2)
                zipf.writestr(f'content_{idx}_{stamp}.txt', content)

        return zip_path
    except Exception as e:
        logger.error(f"Error creating ZIP file: {e}")
        # Do not leave a half-written archive behind.
        if zip_path and os.path.exists(zip_path):
            os.remove(zip_path)
        return None
|
578 |
+
|
579 |
+
extract_btn.click(process_url, [url_input, depth_slider, respect_robots], url_output)
|
580 |
+
download_btn.click(create_download_zip, [url_output], download_output)
|
581 |
+
|
582 |
+
# ZIP File Extractor Tab
|
583 |
+
with gr.Tab("ZIP File Extractor"):
|
584 |
+
zip_file_input = gr.File(label="Upload ZIP File")
|
585 |
+
extract_zip_btn = gr.Button("Extract and Process")
|
586 |
+
zip_output = gr.JSON(label="Extracted Data")
|
587 |
+
zip_qr_btn = gr.Button("Generate QR Code")
|
588 |
+
zip_qr_output = gr.Image(label="QR Code")
|
589 |
+
|
590 |
+
file_processor = FileProcessor()
|
591 |
+
|
592 |
+
def process_zip_file(file):
    """Hand an uploaded file to ``file_processor.process_file``.

    Args:
        file: The value supplied by the upload component; a falsy value
            means nothing was uploaded.

    Returns:
        Whatever ``file_processor.process_file`` returns, or an
        ``{"error": message}`` dict when no file was given or processing
        raised an exception.
    """
    if file:
        try:
            return file_processor.process_file(file)
        except Exception as exc:
            logger.error(f"ZIP processing error: {exc}")
            return {"error": str(exc)}
    return {"error": "No file uploaded"}
|
602 |
+
|
603 |
+
def generate_zip_qr(data):
    """Build a combined QR code for ``data`` and return the first result.

    Args:
        data: JSON-like payload; a falsy value or an ``{"error": ...}`` dict
            is treated as "nothing to encode".

    Returns:
        The first element of ``file_processor.generate_qr_code(data,
        combined=True)``, or ``None`` when there is nothing to encode or
        generation fails.
    """
    is_error_payload = isinstance(data, dict) and 'error' in data
    if not data or is_error_payload:
        return None

    try:
        generated = file_processor.generate_qr_code(data, combined=True)
        first_code = generated[0]
    except Exception as exc:
        logger.error(f"QR generation error: {exc}")
        first_code = None
    return first_code
|
612 |
+
|
613 |
+
extract_zip_btn.click(process_zip_file, [zip_file_input], zip_output)
|
614 |
+
zip_qr_btn.click(generate_zip_qr, [zip_output], zip_qr_output)
|
615 |
+
|
616 |
+
# Raw Text to JSON Tab
|
617 |
+
with gr.Tab("Text to JSON"):
|
618 |
+
text_input = gr.Textbox(lines=10, label="Raw Text Input")
|
619 |
+
json_structure = gr.Dropdown(
|
620 |
+
choices=["Simple", "Structured", "Key-Value Pairs"],
|
621 |
+
label="JSON Structure",
|
622 |
+
value="Simple"
|
623 |
+
)
|
624 |
+
convert_btn = gr.Button("Convert to JSON")
|
625 |
+
json_output = gr.JSON(label="JSON Output")
|
626 |
+
combine_json_btn = gr.Button("Combine with Previous JSON")
|
627 |
+
previous_json = gr.Textbox(lines=5, label="Previous JSON (Optional)")
|
628 |
+
combined_output = gr.JSON(label="Combined JSON")
|
629 |
+
text_qr_btn = gr.Button("Generate QR Code")
|
630 |
+
text_qr_output = gr.Image(label="QR Code")
|
631 |
+
|
632 |
+
def convert_text_to_json(text, structure):
    """Convert raw text into a JSON-serialisable dict.

    Args:
        text: Raw input text. ``None``, empty or whitespace-only input is
            rejected.
        structure: One of ``"Simple"``, ``"Structured"`` or
            ``"Key-Value Pairs"``.

            * ``"Simple"`` wraps the text unchanged under ``"text"``.
            * ``"Structured"`` splits the text into paragraphs separated by
              blank lines; the first becomes ``"title"``, the rest
              ``"paragraphs"``.
            * ``"Key-Value Pairs"`` turns each ``key: value`` line into an
              entry, splitting on the first colon only.

    Returns:
        The converted dict, always carrying a generated ``"timestamp"`` (which
        overwrites a user-supplied ``timestamp`` key in key-value mode), or
        ``{"error": message}`` on bad input or failure.
    """
    # Guard against None as well as blank input; the original called
    # text.strip() unconditionally and crashed on None.
    if not text or not text.strip():
        return {"error": "No text provided"}

    try:
        if structure == "Simple":
            return {
                "text": text,
                "timestamp": datetime.now().isoformat()
            }

        if structure == "Structured":
            paragraphs = []
            current_para = []

            # splitlines() also consumes "\r\n", so Windows-style input no
            # longer leaves a stray "\r" at the end of each line.
            for line in text.splitlines():
                if line.strip():
                    current_para.append(line)
                elif current_para:
                    # A blank line closes the paragraph being collected.
                    paragraphs.append(' '.join(current_para))
                    current_para = []

            if current_para:
                paragraphs.append(' '.join(current_para))

            return {
                "title": paragraphs[0] if paragraphs else "",
                "paragraphs": paragraphs[1:],
                "timestamp": datetime.now().isoformat()
            }

        if structure == "Key-Value Pairs":
            pairs = {}
            for line in text.splitlines():
                # Lines without a colon carry no key and are ignored.
                key, sep, value = line.partition(':')
                if sep:
                    pairs[key.strip()] = value.strip()

            pairs["timestamp"] = datetime.now().isoformat()
            return pairs

        return {"error": "Invalid structure selected"}
    except Exception as e:
        logger.error(f"Text to JSON conversion error: {e}")
        return {"error": str(e)}
|
678 |
+
|
679 |
+
def combine_json_data(current, previous):
    """Combine the current JSON value with a previously produced JSON string.

    Args:
        current: Already-parsed JSON value (dict or list). A falsy value or
            an ``{"error": ...}`` dict is rejected.
        previous: JSON text to combine with, or ``None``/blank for "nothing
            to combine".

    Returns:
        * ``current`` unchanged when ``previous`` is ``None`` or blank;
        * a concatenated list when either side is a list (previous first);
        * a merged dict when both are objects, with keys from ``current``
          winning and a ``"combined_timestamp"`` added;
        * ``{"error": message}`` when an input is invalid.
    """
    if not current or (isinstance(current, dict) and 'error' in current):
        return {"error": "No valid current JSON"}

    try:
        # None is treated like an empty textbox instead of failing on .strip().
        if previous is None or not previous.strip():
            return current

        prev_json = json.loads(previous)

        # Determine how to combine based on types
        if isinstance(prev_json, list) and isinstance(current, list):
            return prev_json + current
        if isinstance(prev_json, list):
            return prev_json + [current]
        if isinstance(current, list):
            return [prev_json] + current

        # A dict merge needs a mapping on both sides; report a scalar previous
        # value (e.g. "5" or "true") clearly rather than via a raw TypeError.
        if not isinstance(prev_json, dict) or not isinstance(current, dict):
            return {"error": "Previous JSON must be an object or an array"}

        # Both are objects, merge them (current overrides previous).
        combined = {**prev_json, **current}
        combined["combined_timestamp"] = datetime.now().isoformat()
        return combined
    except json.JSONDecodeError:
        return {"error": "Previous JSON is invalid"}
    except Exception as e:
        logger.error(f"JSON combination error: {e}")
        return {"error": str(e)}
|
707 |
+
|
708 |
+
convert_btn.click(convert_text_to_json, [text_input, json_structure], json_output)
|
709 |
+
combine_json_btn.click(combine_json_data, [json_output, previous_json], combined_output)
|
710 |
+
text_qr_btn.click(generate_zip_qr, [json_output], text_qr_output)
|
711 |
+
|
712 |
+
# DataChat Tab (existing)
|
713 |
with gr.Tab("DataChat"):
|
714 |
mode = gr.Radio(["Trained with Data", "Chat about Data"], label="Mode")
|
715 |
data_source = gr.Radio(["JSON Input", "QR Code"], label="Data Source")
|
|
|
722 |
|
723 |
submit_btn.click(datachat_interface, [mode, data_source, json_input, qr_image, query], output)
|
724 |
|
725 |
+
# QR Generator Tab (existing)
|
726 |
with gr.Tab("QR Generator"):
|
727 |
qr_input = gr.Textbox(lines=8, label="Input JSON for QR")
|
728 |
generate_btn = gr.Button("Generate QR")
|
729 |
qr_output = gr.Image(label="Generated QR Code")
|
730 |
|
731 |
def generate_qr(json_data):
    """Clean the supplied JSON text and render it as a QR code image.

    Args:
        json_data: Raw text from the "Input JSON for QR" textbox.

    Returns:
        A single value suitable for the image output, or ``None`` when
        ``file_processor.clean_json`` yields nothing usable or no QR code
        was produced.
    """
    data = file_processor.clean_json(json_data)
    if not data:
        return None

    qr_result = file_processor.generate_qr_code(data)
    # NOTE(review): generate_zip_qr indexes this helper's result with [0],
    # which suggests it returns a sequence; confirm against FileProcessor.
    # A single image component cannot display a list, so unwrap the first
    # entry when a sequence comes back and pass anything else through as is.
    if isinstance(qr_result, (list, tuple)):
        return qr_result[0] if qr_result else None
    return qr_result
|
736 |
|
737 |
generate_btn.click(generate_qr, qr_input, qr_output)
|