Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -194,134 +194,29 @@ class EnhancedFileProcessor:
|
|
194 |
"""Process uploaded file with enhanced error handling and complete extraction"""
|
195 |
if not file:
|
196 |
return []
|
197 |
-
|
198 |
dataset = []
|
199 |
try:
|
200 |
file_size = os.path.getsize(file.name)
|
201 |
if file_size > self.max_file_size:
|
202 |
-
logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
|
203 |
return []
|
204 |
-
|
205 |
-
if file.name.endswith('.pdf'):
|
206 |
-
dataset.extend(self._process_pdf(file))
|
207 |
-
elif file.name.endswith('.docx'):
|
208 |
-
dataset.extend(self._process_docx(file))
|
209 |
-
elif file.name.endswith('.csv'):
|
210 |
-
dataset.extend(self._process_csv(file))
|
211 |
-
elif file.name.endswith('.json'):
|
212 |
-
dataset.extend(self._process_json(file))
|
213 |
-
elif file.name.endswith('.xml'):
|
214 |
-
dataset.extend(self._process_xml(file))
|
215 |
-
elif file.name.endswith('.md'):
|
216 |
-
dataset.extend(self._process_markdown(file))
|
217 |
-
# Add additional conditions for other file types...
|
218 |
-
|
219 |
-
except Exception as e:
|
220 |
-
logger.error(f"Error processing file: {str(e)}")
|
221 |
-
return []
|
222 |
-
|
223 |
-
return dataset
|
224 |
-
|
225 |
-
def _process_pdf(self, file) -> List[Dict]:
    """Extract the text of every page of a PDF upload.

    Args:
        file: Uploaded-file object; only its ``name`` attribute (used as a
            filesystem path) is read.

    Returns:
        A one-element list holding a record with ``source``, ``filename``,
        ``content`` and ``timestamp`` keys, or an empty list when reading
        or parsing the PDF raises.
    """
    try:
        with open(file.name, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` keeps the join below safe when a page yields a falsy
            # result (e.g. None) instead of a string.
            content_parts = [page.extract_text() or "" for page in reader.pages]
        # Pages are concatenated with no separator between them.
        complete_content = ''.join(content_parts)
        return [{
            'source': 'pdf',
            'filename': os.path.basename(file.name),
            'content': complete_content,
            'timestamp': datetime.now().isoformat()
        }]
    except Exception as e:
        # Best-effort extraction: log the failure and report "no records".
        logger.error(f"PDF processing error: {e}")
        return []
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
return [{
|
253 |
-
'source': 'docx',
|
254 |
-
'filename': os.path.basename(file.name),
|
255 |
-
'content': complete_content,
|
256 |
-
'timestamp': datetime.now().isoformat()
|
257 |
-
}]
|
258 |
-
except Exception as e:
|
259 |
-
logger.error(f"DOCX processing error: {e}")
|
260 |
-
return []
|
261 |
-
|
262 |
-
def _process_csv(self, file) -> List[Dict]:
|
263 |
-
"""Process a CSV file and extract text"""
|
264 |
-
try:
|
265 |
-
import pandas as pd
|
266 |
-
df = pd.read_csv(file.name)
|
267 |
-
content = df.to_string(index=False)
|
268 |
-
return [{
|
269 |
-
'source': 'csv',
|
270 |
-
'filename': os.path.basename(file.name),
|
271 |
-
'content': content,
|
272 |
-
'timestamp': datetime.now().isoformat()
|
273 |
-
}]
|
274 |
-
except Exception as e:
|
275 |
-
logger.error(f"CSV processing error: {e}")
|
276 |
-
return []
|
277 |
-
|
278 |
-
def _process_json(self, file) -> List[Dict]:
|
279 |
-
"""Process a JSON file and extract text"""
|
280 |
-
try:
|
281 |
-
with open(file.name, 'r') as f:
|
282 |
-
content = json.load(f)
|
283 |
-
return [{
|
284 |
-
'source': 'json',
|
285 |
-
'filename': os.path.basename(file.name),
|
286 |
-
'content': json.dumps(content, indent=2),
|
287 |
-
'timestamp': datetime.now().isoformat()
|
288 |
-
}]
|
289 |
-
except Exception as e:
|
290 |
-
logger.error(f"JSON processing error: {e}")
|
291 |
-
return []
|
292 |
|
293 |
-
def _process_xml(self, file) -> List[Dict]:
|
294 |
-
"""Process an XML file and extract text"""
|
295 |
-
try:
|
296 |
-
with open(file.name, 'r') as f:
|
297 |
-
content = f.read()
|
298 |
-
return [{
|
299 |
-
'source': 'xml',
|
300 |
-
'filename': os.path.basename(file.name),
|
301 |
-
'content': content,
|
302 |
-
'timestamp': datetime.now().isoformat()
|
303 |
-
}]
|
304 |
except Exception as e:
|
305 |
-
logger.error(f"
|
306 |
return []
|
307 |
|
308 |
-
|
309 |
-
"""Process a Markdown file and extract text"""
|
310 |
-
try:
|
311 |
-
with open(file.name, 'r') as f:
|
312 |
-
content = f.read()
|
313 |
-
return [{
|
314 |
-
'source': 'markdown',
|
315 |
-
'filename': os.path.basename(file.name),
|
316 |
-
'content': content,
|
317 |
-
'timestamp': datetime.now().isoformat()
|
318 |
-
}]
|
319 |
-
except Exception as e:
|
320 |
-
logger.error(f"Markdown processing error: {e}")
|
321 |
-
return []
|
322 |
|
323 |
-
# Add similar methods for other file types as needed...
|
324 |
-
|
325 |
def _is_archive(self, filepath: str) -> bool:
|
326 |
"""Check if file is an archive"""
|
327 |
return any(filepath.lower().endswith(ext) for ext in [
|
@@ -665,47 +560,7 @@ def create_modern_interface():
|
|
665 |
with gr.Tab("π File Input"):
|
666 |
file_input = gr.File(
|
667 |
label="Upload Files",
|
668 |
-
file_types=[
|
669 |
-
"text/*", # All text files
|
670 |
-
"application/pdf", # PDF files
|
671 |
-
"application/zip", # ZIP files
|
672 |
-
"application/x-zip-compressed", # Compressed ZIP files
|
673 |
-
"application/x-zip", # Another ZIP type
|
674 |
-
"application/x-rar-compressed", # RAR files
|
675 |
-
"application/x-tar", # TAR files
|
676 |
-
"application/gzip", # GZ files
|
677 |
-
"application/x-bzip2", # BZ2 files
|
678 |
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", # DOCX files
|
679 |
-
"application/msword", # DOC files
|
680 |
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # XLSX files
|
681 |
-
"application/vnd.ms-excel", # XLS files
|
682 |
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation", # PPTX files
|
683 |
-
"application/vnd.ms-powerpoint", # PPT files
|
684 |
-
"application/json", # JSON files
|
685 |
-
"application/xml", # XML files
|
686 |
-
"text/csv", # CSV files
|
687 |
-
"text/markdown", # Markdown files
|
688 |
-
"application/octet-stream", # Binary files
|
689 |
-
"application/x-7z-compressed", # 7z files
|
690 |
-
"application/x-iso9660-image", # ISO files
|
691 |
-
"application/x-dosexec", # EXE files
|
692 |
-
"application/x-sh", # Shell script files
|
693 |
-
"application/x-php", # PHP files
|
694 |
-
"application/x-python", # Python files
|
695 |
-
"application/x-java-archive", # JAR files
|
696 |
-
"application/x-asp", # ASP files
|
697 |
-
"application/x-c", # C source files
|
698 |
-
"application/x-c++", # C++ source files
|
699 |
-
"application/x-ruby", # Ruby files
|
700 |
-
"application/x-perl", # Perl files
|
701 |
-
"application/x-go", # Go files
|
702 |
-
"application/x-swift", # Swift files
|
703 |
-
"application/x-xml", # XML files
|
704 |
-
"application/x-yaml", # YAML files
|
705 |
-
"application/x-ini", # INI files
|
706 |
-
"application/x-log", # Log files
|
707 |
-
"application/x-configuration", # Configuration files
|
708 |
-
],
|
709 |
file_count="multiple"
|
710 |
)
|
711 |
|
@@ -746,133 +601,133 @@ def create_modern_interface():
|
|
746 |
)
|
747 |
|
748 |
# Load example data
|
749 |
-
|
750 |
-
|
751 |
-
|
752 |
-
|
753 |
-
|
754 |
-
|
755 |
-
|
756 |
-
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
768 |
}
|
769 |
-
],
|
770 |
-
"metadata": {
|
771 |
-
"timestamp": datetime.now().isoformat(),
|
772 |
-
"version": "2.0",
|
773 |
-
"source": "example"
|
774 |
}
|
775 |
-
|
776 |
-
return json.dumps(example, indent=2)
|
777 |
|
778 |
-
|
779 |
-
|
780 |
|
781 |
-
|
782 |
-
|
783 |
-
|
784 |
-
|
785 |
-
|
786 |
-
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
-
|
791 |
-
|
792 |
-
|
793 |
-
|
794 |
-
|
795 |
-
|
796 |
-
|
797 |
-
|
798 |
-
|
799 |
-
|
800 |
-
|
801 |
-
|
802 |
-
|
803 |
-
|
804 |
-
|
805 |
-
|
806 |
-
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
|
819 |
-
|
820 |
if file_results:
|
821 |
-
|
822 |
-
|
823 |
-
|
824 |
-
|
825 |
-
|
826 |
-
|
827 |
-
|
828 |
-
|
829 |
-
|
830 |
-
|
831 |
-
|
|
|
|
|
832 |
else:
|
833 |
-
return None, [], "
|
834 |
-
|
835 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
836 |
|
837 |
-
|
838 |
-
|
839 |
-
|
840 |
-
|
841 |
-
# Set up event handlers
|
842 |
-
example_btn.click(load_example, outputs=[text_input])
|
843 |
-
clear_btn.click(clear_input, outputs=[text_input])
|
844 |
-
process_btn.click(
|
845 |
-
process_inputs,
|
846 |
-
inputs=[url_input, file_input, text_input, combine_data],
|
847 |
-
outputs=[output_json, output_gallery, output_text]
|
848 |
-
)
|
849 |
-
|
850 |
-
# Add helpful documentation
|
851 |
-
gr.Markdown("""
|
852 |
-
### π Features
|
853 |
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
-
|
858 |
-
|
859 |
|
860 |
-
|
861 |
|
862 |
-
|
863 |
-
|
864 |
-
|
865 |
-
|
866 |
-
|
867 |
|
868 |
-
|
869 |
|
870 |
-
|
871 |
-
|
872 |
-
|
873 |
-
|
874 |
|
875 |
-
return interface
|
876 |
|
877 |
def main():
|
878 |
"""Initialize and launch the application"""
|
|
|
194 |
"""Process uploaded file with enhanced error handling and complete extraction"""
|
195 |
if not file:
|
196 |
return []
|
197 |
+
|
198 |
dataset = []
|
199 |
try:
|
200 |
file_size = os.path.getsize(file.name)
|
201 |
if file_size > self.max_file_size:
|
202 |
+
logger.warning(f"File size ({{file_size}} bytes) exceeds maximum allowed size")
|
203 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
|
205 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
206 |
+
temp_dir_path = Path(temp_dir)
|
207 |
+
|
208 |
+
# Handle different archive types
|
209 |
+
if self._is_archive(file.name):
|
210 |
+
dataset.extend(self._process_archive(file.name, temp_dir_path))
|
211 |
+
else:
|
212 |
+
dataset.extend(self._process_single_file(file))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
except Exception as e:
|
215 |
+
logger.error(f"Error processing file: {{str(e)}}")
|
216 |
return []
|
217 |
|
218 |
+
return dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
|
|
|
|
220 |
def _is_archive(self, filepath: str) -> bool:
|
221 |
"""Check if file is an archive"""
|
222 |
return any(filepath.lower().endswith(ext) for ext in [
|
|
|
560 |
with gr.Tab("π File Input"):
|
561 |
file_input = gr.File(
|
562 |
label="Upload Files",
|
563 |
+
file_types=["*"], # Allow all file types
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
564 |
file_count="multiple"
|
565 |
)
|
566 |
|
|
|
601 |
)
|
602 |
|
603 |
# Load example data
|
604 |
+
def load_example():
    """Return a sample product-catalog payload as pretty-printed JSON text.

    The payload has a ``type``, two ``items`` and a ``metadata`` section
    whose ``timestamp`` is taken from the clock at call time.

    Returns:
        The example serialised with ``json.dumps(..., indent=2)``.
    """
    example = {
        "type": "product_catalog",
        "items": [
            {
                "id": "123",
                "name": "Premium Widget",
                "description": "High-quality widget with advanced features",
                "price": 299.99,
                "category": "electronics",
                "tags": ["premium", "featured", "new"]
            },
            {
                "id": "456",
                "name": "Basic Widget",
                "description": "Reliable widget for everyday use",
                "price": 149.99,
                "category": "electronics",
                "tags": ["basic", "popular"]
            }
        ],
        "metadata": {
            "timestamp": datetime.now().isoformat(),
            "version": "2.0",
            "source": "example"
        }
    }
    return json.dumps(example, indent=2)
|
|
|
632 |
|
633 |
+
def clear_input():
    """Return an empty string, used to blank the bound text component."""
    return ""
|
635 |
|
636 |
+
def process_inputs(urls, files, text, combine):
    """Process all inputs and generate QR codes.

    Args:
        urls: Text containing zero or more URLs separated by commas or
            newlines; blank entries are ignored.
        files: Iterable of uploaded files (or a falsy value); each one is
            handed to ``EnhancedFileProcessor.process_file``.
        text: JSON text. A top-level list is spliced into the results,
            any other JSON value is appended as a single item.
        combine: Forwarded unchanged to ``generate_qr_codes``.

    Returns:
        A 3-tuple ``(results, qr_paths, status)``. On success ``results`` is
        the list of collected items and ``qr_paths`` the generated paths as
        strings; on any failure the first two elements are ``None`` and
        ``[]`` and ``status`` describes the problem.
    """
    try:
        results = []
        url_processor = EnhancedURLProcessor()
        file_processor = EnhancedFileProcessor()

        # Process JSON input
        if text and text.strip():
            try:
                json_data = json.loads(text)
                if isinstance(json_data, list):
                    results.extend(json_data)
                else:
                    results.append(json_data)
            except json.JSONDecodeError as e:
                # Invalid JSON aborts the whole run before URLs/files are touched.
                return None, [], f"❌ Invalid JSON format: {str(e)}"

        # Process URLs
        if urls and urls.strip():
            url_list = re.split(r'[,\n]', urls)
            url_list = [url.strip() for url in url_list if url.strip()]

            for url in url_list:
                validation = url_processor.validate_url(url)
                # URLs that fail validation or yield no content are skipped.
                if validation['is_valid']:
                    content = url_processor.fetch_content(url)
                    if content:
                        results.append({
                            'source': 'url',
                            'url': url,
                            'content': content,
                            'timestamp': datetime.now().isoformat()
                        })

        # Process files
        if files:
            for file in files:
                file_results = file_processor.process_file(file)
                if file_results:
                    results.extend(file_results)

        # Generate QR codes
        if results:
            qr_paths = generate_qr_codes(results, combine)
            if qr_paths:
                return (
                    results,
                    [str(path) for path in qr_paths],
                    f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
                )
            else:
                return None, [], "❌ Failed to generate QR codes"
        else:
            return None, [], "⚠️ No valid content to process"

    except Exception as e:
        # Top-level UI boundary: log and surface the error as a status message.
        logger.error(f"Processing error: {e}")
        return None, [], f"❌ Error: {str(e)}"
|
695 |
+
|
696 |
+
# Set up event handlers
|
697 |
+
example_btn.click(load_example, outputs=[text_input])
|
698 |
+
clear_btn.click(clear_input, outputs=[text_input])
|
699 |
+
process_btn.click(
|
700 |
+
process_inputs,
|
701 |
+
inputs=[url_input, file_input, text_input, combine_data],
|
702 |
+
outputs=[output_json, output_gallery, output_text]
|
703 |
+
)
|
704 |
|
705 |
+
# Add helpful documentation
|
706 |
+
gr.Markdown("""
|
707 |
+
### π Features
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
708 |
|
709 |
+
- **Complete URL Scraping**: Extracts every character from web pages
|
710 |
+
- **Advanced File Processing**: Full content extraction from text files and archives
|
711 |
+
- **Smart JSON Handling**: Processes any size JSON with automatic chunking
|
712 |
+
- **Sequential QR Codes**: Maintains data integrity across multiple codes
|
713 |
+
- **Modern Design**: Clean, responsive interface with visual feedback
|
714 |
|
715 |
+
### 💡 Tips
|
716 |
|
717 |
+
1. **URLs**: Enter multiple URLs separated by commas or newlines
|
718 |
+
2. **Files**: Upload text files or ZIP archives containing text files
|
719 |
+
3. **JSON**: Use the example button to see the expected format
|
720 |
+
4. **QR Codes**: Choose whether to combine data into sequential codes
|
721 |
+
5. **Processing**: Monitor the status for real-time feedback
|
722 |
|
723 |
+
### 🎨 Output
|
724 |
|
725 |
+
- Generated QR codes are saved in the `output/qr_codes` directory
|
726 |
+
- Each QR code contains metadata for proper sequencing
|
727 |
+
- Hover over QR codes in the gallery to see details
|
728 |
+
""")
|
729 |
|
730 |
+
return interface
|
731 |
|
732 |
def main():
|
733 |
"""Initialize and launch the application"""
|