Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -517,4 +517,292 @@ class FileProcessor:
|
|
517 |
file_size = os.path.getsize(file.name)
|
518 |
if file_size > self.max_file_size:
|
519 |
logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
|
520 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
517 |
file_size = os.path.getsize(file.name)
|
518 |
if file_size > self.max_file_size:
|
519 |
logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
|
520 |
+
return []
|
521 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
522 |
+
if zipfile.is_zipfile(file.name):
|
523 |
+
dataset.extend(self._process_zip_file(file.name, temp_dir))
|
524 |
+
else:
|
525 |
+
dataset.extend(self._process_single_file(file))
|
526 |
+
except Exception as e:
|
527 |
+
logger.error(f"Error processing file: {str(e)}")
|
528 |
+
return []
|
529 |
+
return dataset
|
530 |
+
|
531 |
+
def _process_zip_file(self, zip_path, temp_dir):
    """Extract a ZIP archive and return one record per file it contains.

    The archive is unpacked into ``temp_dir`` and every regular file found
    below it -- including files stored in sub-folders of the archive -- is
    read as UTF-8 text with undecodable bytes dropped.

    Args:
        zip_path: Path of the ZIP archive to read.
        temp_dir: Existing directory to extract into. The caller owns its
            lifetime; nothing is deleted here.

    Returns:
        A list of dicts with the keys ``source`` (always
        ``'file_from_zip'``), ``filename`` (path relative to ``temp_dir``;
        a bare name for top-level members), ``content`` and ``timestamp``
        (ISO-8601 string taken when the file was read).

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
        OSError: If extraction or reading a member fails.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    result = []
    # Walk the whole tree: listing only the root directory silently skipped
    # every member stored under a folder inside the archive.
    for root, dirs, files in os.walk(temp_dir):
        dirs.sort()  # in-place sort makes the traversal order deterministic
        for name in sorted(files):
            extracted_file_path = os.path.join(root, name)
            if not os.path.isfile(extracted_file_path):
                continue
            with open(extracted_file_path, 'r', encoding='utf-8', errors='ignore') as f:
                result.append({
                    'source': 'file_from_zip',
                    'filename': os.path.relpath(extracted_file_path, temp_dir),
                    'content': f.read(),
                    'timestamp': datetime.now().isoformat()
                })
    return result
|
547 |
+
|
548 |
+
def _process_single_file(self, file, *, large_file_threshold=100 * 1024 * 1024,
                         excerpt_size=1 * 1024 * 1024) -> List[Dict]:
    """Read one file and return a single-record dataset describing it.

    Files larger than ``large_file_threshold`` bytes are not loaded whole:
    only the first and the last ``excerpt_size`` bytes are kept, joined by a
    truncation marker. Smaller files are read completely as UTF-8 text with
    undecodable bytes dropped.

    Args:
        file: Object whose ``name`` attribute is the path of the file.
        large_file_threshold: Size in bytes above which the file is
            excerpted instead of read in full (default 100 MB).
        excerpt_size: Number of bytes kept from each end of an excerpted
            file (default 1 MB).

    Returns:
        A one-element list holding a dict with the keys ``source``,
        ``filename``, ``file_size``, ``mime_type``, ``created``,
        ``modified``, ``content`` and ``timestamp``; an empty list if
        anything goes wrong (the error is logged, not raised).
    """

    def _decode_excerpt(raw):
        # Mirror what text-mode reading does in the small-file branch:
        # drop invalid UTF-8 and normalise line endings to "\n".
        text = raw.decode('utf-8', errors='ignore')
        return text.replace('\r\n', '\n').replace('\r', '\n')

    try:
        file_stat = os.stat(file.name)
        if file_stat.st_size > large_file_threshold:
            logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
            # Binary mode on purpose: seeking to an arbitrary byte offset is
            # undefined on text-mode streams, and text-mode read(n) counts
            # characters rather than bytes.
            with open(file.name, 'rb') as f:
                head = f.read(excerpt_size)
                f.seek(max(0, file_stat.st_size - excerpt_size))
                tail = f.read()
            content = (
                _decode_excerpt(head)
                + "\n...[Content truncated due to large file size]...\n"
                + _decode_excerpt(tail)
            )
        else:
            # Regular file processing
            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        return [{
            'source': 'file',
            'filename': os.path.basename(file.name),
            'file_size': file_stat.st_size,
            'mime_type': mimetypes.guess_type(file.name)[0],
            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
            'content': content,
            'timestamp': datetime.now().isoformat()
        }]
    except Exception as e:
        # Best-effort by design: a bad upload yields no records, not a crash.
        logger.error(f"File processing error: {e}")
        return []
|
579 |
+
|
580 |
+
|
581 |
+
# process_all_inputs must be defined at module level, outside the FileProcessor class.
|
582 |
+
def process_all_inputs(urls, file, text, notes):
    """Collect content from URLs, an uploaded file and raw text into one JSON file.

    Args:
        urls: String of URLs separated by commas and/or newlines (may be empty).
        file: Uploaded file object handed to ``FileProcessor.process_file``
            (may be falsy).
        text: Raw text to clean and include (may be empty).
        notes: Accepted for the UI wiring but not used by this function.

    Returns:
        A ``(output_path, summary, json_data)`` tuple. ``output_path`` is the
        written JSON file as a string, or ``None`` when nothing was produced
        or an error occurred; ``summary`` is a human-readable status line;
        ``json_data`` is the records serialised as a JSON string ("" on
        failure or when there is nothing to report).
    """
    try:
        url_handler = URLProcessor()
        upload_handler = FileProcessor()
        collected = []

        # URLs: split on commas/newlines, drop blanks, keep only fetchable ones.
        if urls:
            stripped = (piece.strip() for piece in re.split(r'[,\n]', urls))
            for address in [piece for piece in stripped if piece]:
                if not url_handler.validate_url(address).get('is_valid'):
                    continue
                fetched = url_handler.fetch_content(address)
                if not fetched:
                    continue
                collected.append({
                    'source': 'url',
                    'url': address,
                    'content': fetched,
                    'timestamp': datetime.now().isoformat()
                })

        # Uploaded file (single file or archive).
        if file:
            collected.extend(upload_handler.process_file(file))

        # Direct text input.
        if text:
            collected.append({
                'source': 'direct_input',
                'content': url_handler.advanced_text_cleaning(text),
                'timestamp': datetime.now().isoformat()
            })

        if not collected:
            return None, "No valid content to process.", ""

        # Persist under output/<YYYY-MM-DD>/processed_<unix-seconds>.json
        day_folder = Path('output') / datetime.now().strftime('%Y-%m-%d')
        day_folder.mkdir(parents=True, exist_ok=True)
        destination = day_folder / f'processed_{int(time.time())}.json'
        with open(destination, 'w', encoding='utf-8') as handle:
            json.dump(collected, handle, ensure_ascii=False, indent=2)

        status_line = f"Processed {len(collected)} items successfully!"
        # Same records as a string, for the JSON editor / QR code step.
        serialised = json.dumps(collected, indent=2)
        return str(destination), status_line, serialised
    except Exception as e:
        logger.error(f"Processing error: {e}")
        return None, f"Error: {str(e)}", ""
|
632 |
+
|
633 |
+
|
634 |
+
# generate_qr_code must also be defined at module level, outside the FileProcessor class.
|
635 |
+
def generate_qr_code(json_data):
    """Return the file path of a QR code for ``json_data``.

    Delegates to ``generate_qr``. When ``json_data`` is empty or ``None``
    nothing is generated and ``None`` is returned.
    """
    if not json_data:
        return None
    return generate_qr(json_data)
|
639 |
+
|
640 |
+
|
641 |
+
# generate_qr must be defined at module level as well, outside the FileProcessor class.
|
642 |
+
def generate_qr(json_data):
    """Render ``json_data`` as a QR code PNG and return the image's file path.

    The QR version is chosen automatically. If encoding fails for any reason
    (typically because the payload is too large for a QR code), the error is
    logged and a QR code carrying a short error message is produced instead.

    Args:
        json_data: The string to encode.

    Returns:
        Path of a newly created temporary ``.png`` file. The file is not
        deleted automatically; the caller is responsible for it.
    """

    def _render_to_temp_png(payload, **qr_options):
        """Encode ``payload`` and save it to a new temporary ``.png`` file."""
        qr = qrcode.QRCode(
            error_correction=qrcode.constants.ERROR_CORRECT_L,
            box_size=10,
            border=4,
            **qr_options,
        )
        qr.add_data(payload)
        qr.make(fit=True)
        # make_image() is a method of the QRCode instance; the qrcode module
        # has no function of that name, so calling it on the module raised
        # AttributeError on every invocation.
        img = qr.make_image(fill_color="black", back_color="white")
        # Reserve the file name only after the image exists, and close the
        # handle before saving so no descriptor is leaked and the path can be
        # reopened by img.save() on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            png_path = temp_file.name
        img.save(png_path)
        return png_path

    try:
        # Try first with automatic version selection
        return _render_to_temp_png(json_data)
    except Exception as e:
        # If the data is too large for a QR code
        logger.error(f"QR generation error: {e}")
        # Create a simple QR with error message
        return _render_to_temp_png("Error: Data too large for QR code", version=1)
|
676 |
+
|
677 |
+
|
678 |
+
def create_interface():
    """Build the Gradio Blocks UI and return it without launching it.

    The layout has one tab per input kind (URLs, file upload, raw text), a
    JSON editor tab and a scratchpad tab, followed by two buttons:

    * "Process Input" calls ``process_all_inputs`` with the URL, file, text
      and scratchpad values and fills the output file, the status box and
      the JSON editor.
    * "Generate QR Code" calls ``generate_qr_code`` with the JSON editor's
      content and shows the resulting image.

    Returns:
        The ``gr.Blocks`` instance.
    """
    # NOTE(review): nesting below was reconstructed from a listing that had
    # lost its indentation -- confirm that the buttons and outputs are meant
    # to sit outside the tabs.
    css = """
        .container { max-width: 1200px; margin: auto; }
        .warning { background-color: #fff3cd; color: #856404; }
        .error { background-color: #f8d7da; color: #721c24; }
    """
    with gr.Blocks(css=css, title="Advanced Text & URL Processing") as interface:
        gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")

        with gr.Tab("URL Processing"):
            url_input = gr.Textbox(
                label="Enter URLs (comma or newline separated)",
                lines=5,
                placeholder="https://example1.com\nhttps://example2.com"
            )

        with gr.Tab("File Input"):
            file_input = gr.File(
                label="Upload text file or ZIP archive",
                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
            )

        with gr.Tab("Text Input"):
            text_input = gr.Textbox(
                label="Raw Text Input",
                lines=5,
                placeholder="Paste your text here..."
            )

        with gr.Tab("JSON Editor"):
            json_editor = gr.Textbox(
                label="JSON Editor",
                lines=20,
                placeholder="View and edit your JSON data here...",
                interactive=True,
                elem_id="json-editor"  # Optional: for custom styling
            )

        with gr.Tab("Scratchpad"):
            scratchpad = gr.Textbox(
                label="Scratchpad",
                lines=10,
                placeholder="Quick notes or text collections...",
                interactive=True
            )

        process_btn = gr.Button("Process Input", variant="primary")
        qr_btn = gr.Button("Generate QR Code", variant="secondary")

        output_text = gr.Textbox(label="Processing Results", interactive=False)
        output_file = gr.File(label="Processed Output")
        qr_output = gr.Image(label="QR Code", type="filepath")  # To display the generated QR code

        process_btn.click(
            process_all_inputs,
            inputs=[url_input, file_input, text_input, scratchpad],
            # The third output feeds the serialised JSON into the editor tab.
            outputs=[output_file, output_text, json_editor]
        )
        qr_btn.click(
            generate_qr_code,
            inputs=json_editor,
            outputs=qr_output
        )
        # "**Text Input**" previously had a space after the opening "**",
        # which stops Markdown from rendering it as bold.
        gr.Markdown("""
        ### Usage Guidelines
        - **URL Processing**: Enter valid HTTP/HTTPS URLs
        - **File Input**: Upload text files or ZIP archives
        - **Text Input**: Direct text processing
        - **JSON Editor**: View and edit your JSON data
        - **Scratchpad**: Quick notes or text collections
        - Advanced cleaning and validation included
        """)
    return interface
|
752 |
+
|
753 |
+
|
754 |
+
def check_network_connectivity():
    """Probe a few well-known sites to see whether outbound HTTP works.

    Each site is requested once with a 5-second timeout. A site counts as
    "OK" only on HTTP 200; any other status code or any exception is
    recorded as an ``"Error: ..."`` status.

    Returns:
        A ``(ok, results)`` tuple. ``ok`` is ``False`` only when every probe
        ended in an error. ``results`` holds one dict per site with the keys
        ``site``, ``status`` and ``response_time`` (seconds, or ``None`` when
        the request raised).
    """
    probe_urls = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
    report = []

    for target in probe_urls:
        try:
            reply = requests.get(target, timeout=5)
            if reply.status_code == 200:
                state = "OK"
            else:
                state = f"Error: {reply.status_code}"
            entry = {
                "site": target,
                "status": state,
                "response_time": reply.elapsed.total_seconds()
            }
        except Exception as exc:
            entry = {
                "site": target,
                "status": f"Error: {str(exc)}",
                "response_time": None
            }
        report.append(entry)

    # One successful probe is enough to call the network usable.
    reachable = any(not item["status"].startswith("Error") for item in report)
    if not reachable:
        logger.error("Network connectivity issue detected. All test sites failed.")
    return reachable, report
|
779 |
+
|
780 |
+
|
781 |
+
# Application entry point: runs the connectivity check, then builds and launches the UI.
|
782 |
+
def main():
    """Initialise MIME types, report network problems, then launch the UI.

    A failed connectivity check is only logged as warnings; the interface is
    launched regardless, listening on all interfaces at port 7860.
    """
    # Configure system settings
    mimetypes.init()

    # Check network connectivity
    connected, probe_report = check_network_connectivity()
    if not connected:
        logger.warning("Network connectivity issues detected. Some features may not work properly.")
        # NOTE(review): the per-site warnings are assumed to belong to the
        # failure branch; the listing this was rebuilt from had no indentation.
        for probe in probe_report:
            logger.warning(f"Test site {probe['site']}: {probe['status']}")

    # Launch with proper configuration
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "share": False,
        "inbrowser": True,
        "debug": True,
    }
    create_interface().launch(**launch_options)
|
804 |
+
|
805 |
+
|
806 |
+
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|
808 |
+
|