Update app.py
app.py CHANGED
@@ -194,29 +194,134 @@ class EnhancedFileProcessor:
         """Process uploaded file with enhanced error handling and complete extraction"""
         if not file:
             return []
+
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
             if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
+
+            if file.name.endswith('.pdf'):
+                dataset.extend(self._process_pdf(file))
+            elif file.name.endswith('.docx'):
+                dataset.extend(self._process_docx(file))
+            elif file.name.endswith('.csv'):
+                dataset.extend(self._process_csv(file))
+            elif file.name.endswith('.json'):
+                dataset.extend(self._process_json(file))
+            elif file.name.endswith('.xml'):
+                dataset.extend(self._process_xml(file))
+            elif file.name.endswith('.md'):
+                dataset.extend(self._process_markdown(file))
+            # Add additional conditions for other file types...
+
+        except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
+
+        return dataset
+
+    def _process_pdf(self, file) -> List[Dict]:
+        """Process a PDF file and extract text"""
+        try:
+            content_parts = []
+            with open(file.name, 'rb') as f:
+                reader = PyPDF2.PdfReader(f)
+                for page in reader.pages:
+                    content_parts.append(page.extract_text() or "")
+            complete_content = ''.join(content_parts)
+            return [{
+                'source': 'pdf',
+                'filename': os.path.basename(file.name),
+                'content': complete_content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"PDF processing error: {e}")
+            return []
 
+    def _process_docx(self, file) -> List[Dict]:
+        """Process a DOCX file and extract text"""
+        try:
+            content_parts = []
+            doc = docx.Document(file.name)
+            for para in doc.paragraphs:
+                content_parts.append(para.text)
+            complete_content = '\n'.join(content_parts)
+            return [{
+                'source': 'docx',
+                'filename': os.path.basename(file.name),
+                'content': complete_content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"DOCX processing error: {e}")
+            return []
 
+    def _process_csv(self, file) -> List[Dict]:
+        """Process a CSV file and extract text"""
+        try:
+            import pandas as pd
+            df = pd.read_csv(file.name)
+            content = df.to_string(index=False)
+            return [{
+                'source': 'csv',
+                'filename': os.path.basename(file.name),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
         except Exception as e:
+            logger.error(f"CSV processing error: {e}")
             return []
 
+    def _process_json(self, file) -> List[Dict]:
+        """Process a JSON file and extract text"""
+        try:
+            with open(file.name, 'r') as f:
+                content = json.load(f)
+            return [{
+                'source': 'json',
+                'filename': os.path.basename(file.name),
+                'content': json.dumps(content, indent=2),
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"JSON processing error: {e}")
+            return []
+
+    def _process_xml(self, file) -> List[Dict]:
+        """Process an XML file and extract text"""
+        try:
+            with open(file.name, 'r') as f:
+                content = f.read()
+            return [{
+                'source': 'xml',
+                'filename': os.path.basename(file.name),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"XML processing error: {e}")
+            return []
 
+    def _process_markdown(self, file) -> List[Dict]:
+        """Process a Markdown file and extract text"""
+        try:
+            with open(file.name, 'r') as f:
+                content = f.read()
+            return [{
+                'source': 'markdown',
+                'filename': os.path.basename(file.name),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"Markdown processing error: {e}")
+            return []
+
+    # Add similar methods for other file types as needed...
+
     def _is_archive(self, filepath: str) -> bool:
         """Check if file is an archive"""
         return any(filepath.lower().endswith(ext) for ext in [
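The dispatch in process_file() above keys on the filename extension, and every _process_* helper returns records with the same four fields ('source', 'filename', 'content', 'timestamp'). For reference, here is a minimal standalone sketch of that pattern using only the standard library; the names process_json_file, process_text_file and process_path are illustrative and are not part of app.py.

import json
import os
import tempfile
from datetime import datetime
from typing import Dict, List

# Illustrative stand-ins for the _process_* helpers above; each returns records
# shaped like the ones app.py builds: 'source', 'filename', 'content', 'timestamp'.
def process_json_file(path: str) -> List[Dict]:
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [{
        'source': 'json',
        'filename': os.path.basename(path),
        'content': json.dumps(data, indent=2),
        'timestamp': datetime.now().isoformat(),
    }]

def process_text_file(path: str, source: str) -> List[Dict]:
    with open(path, 'r', encoding='utf-8') as f:
        return [{
            'source': source,
            'filename': os.path.basename(path),
            'content': f.read(),
            'timestamp': datetime.now().isoformat(),
        }]

# Extension dispatch, mirroring the endswith() chain in process_file().
HANDLERS = {
    '.json': process_json_file,
    '.md': lambda p: process_text_file(p, 'markdown'),
    '.xml': lambda p: process_text_file(p, 'xml'),
}

def process_path(path: str) -> List[Dict]:
    for ext, handler in HANDLERS.items():
        if path.lower().endswith(ext):
            return handler(path)
    return []  # unknown extension: skip, as process_file() does

if __name__ == '__main__':
    with tempfile.TemporaryDirectory() as tmp:
        sample = os.path.join(tmp, 'sample.json')
        with open(sample, 'w', encoding='utf-8') as f:
            json.dump({'hello': 'world'}, f)
        for record in process_path(sample):
            print(record['source'], record['filename'], len(record['content']))

The PDF, DOCX and CSV branches (which need PyPDF2, python-docx and pandas) and the archive handling hinted at by _is_archive() are deliberately left out to keep the sketch dependency-free.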
@@ -569,7 +674,47 @@ def create_modern_interface():
             with gr.Tab("📁 File Input"):
                 file_input = gr.File(
                     label="Upload Files",
+                    file_types=[
+                        "text/*",  # All text files
+                        "application/pdf",  # PDF files
+                        "application/zip",  # ZIP files
+                        "application/x-zip-compressed",  # Compressed ZIP files
+                        "application/x-zip",  # Another ZIP type
+                        "application/x-rar-compressed",  # RAR files
+                        "application/x-tar",  # TAR files
+                        "application/gzip",  # GZ files
+                        "application/x-bzip2",  # BZ2 files
+                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # DOCX files
+                        "application/msword",  # DOC files
+                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # XLSX files
+                        "application/vnd.ms-excel",  # XLS files
+                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # PPTX files
+                        "application/vnd.ms-powerpoint",  # PPT files
+                        "application/json",  # JSON files
+                        "application/xml",  # XML files
+                        "text/csv",  # CSV files
+                        "text/markdown",  # Markdown files
+                        "application/octet-stream",  # Binary files
+                        "application/x-7z-compressed",  # 7z files
+                        "application/x-iso9660-image",  # ISO files
+                        "application/x-dosexec",  # EXE files
+                        "application/x-sh",  # Shell script files
+                        "application/x-php",  # PHP files
+                        "application/x-python",  # Python files
+                        "application/x-java-archive",  # JAR files
+                        "application/x-asp",  # ASP files
+                        "application/x-c",  # C source files
+                        "application/x-c++",  # C++ source files
+                        "application/x-ruby",  # Ruby files
+                        "application/x-perl",  # Perl files
+                        "application/x-go",  # Go files
+                        "application/x-swift",  # Swift files
+                        "application/x-xml",  # XML files
+                        "application/x-yaml",  # YAML files
+                        "application/x-ini",  # INI files
+                        "application/x-log",  # Log files
+                        "application/x-configuration",  # Configuration files
+                    ],
                     file_count="multiple"
                 )
 
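One thing worth verifying against the Gradio version pinned for this Space: gr.File's file_types parameter is commonly documented as taking file extensions (for example ".pdf") or broad categories (such as "text") rather than MIME strings, so the list above may not filter uploads the way its comments suggest. Below is a minimal sketch of an extension-based configuration under that assumption; it mirrors the component above and is not part of the committed app.py.

import gradio as gr

# Sketch only: assumes a Gradio release whose gr.File(file_types=...) filters by
# extension; check the behaviour against the version actually installed.
with gr.Blocks() as demo:
    file_input = gr.File(
        label="Upload Files",
        file_types=[
            ".txt", ".md", ".csv", ".json", ".xml", ".yaml", ".ini", ".log",
            ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
            ".zip", ".tar", ".gz", ".bz2", ".7z", ".rar",
        ],
        file_count="multiple",
    )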
@@ -610,133 +755,133 @@ def create_modern_interface():
         )
 
         # Load example data
+        def load_example():
+            example = {
+                "type": "product_catalog",
+                "items": [
+                    {
+                        "id": "123",
+                        "name": "Premium Widget",
+                        "description": "High-quality widget with advanced features",
+                        "price": 299.99,
+                        "category": "electronics",
+                        "tags": ["premium", "featured", "new"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Basic Widget",
+                        "description": "Reliable widget for everyday use",
+                        "price": 149.99,
+                        "category": "electronics",
+                        "tags": ["basic", "popular"]
                     }
+                ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "2.0",
+                    "source": "example"
                 }
+            }
+            return json.dumps(example, indent=2)
 
+        def clear_input():
+            return ""
 
+        def process_inputs(urls, files, text, combine):
+            """Process all inputs and generate QR codes"""
+            try:
+                results = []
+                url_processor = EnhancedURLProcessor()
+                file_processor = EnhancedFileProcessor()
+
+                # Process JSON input
+                if text and text.strip():
+                    try:
+                        json_data = json.loads(text)
+                        if isinstance(json_data, list):
+                            results.extend(json_data)
                         else:
+                            results.append(json_data)
+                    except json.JSONDecodeError as e:
+                        return None, [], f"❌ Invalid JSON format: {str(e)}"
+
+                # Process URLs
+                if urls and urls.strip():
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+
+                    for url in url_list:
+                        validation = url_processor.validate_url(url)
+                        if validation['is_valid']:
+                            content = url_processor.fetch_content(url)
+                            if content:
+                                results.append({
+                                    'source': 'url',
+                                    'url': url,
+                                    'content': content,
+                                    'timestamp': datetime.now().isoformat()
+                                })
+
+                # Process files
+                if files:
+                    for file in files:
+                        file_results = file_processor.process_file(file)
+                        if file_results:
+                            results.extend(file_results)
+
+                # Generate QR codes
+                if results:
+                    qr_paths = generate_qr_codes(results, combine)
+                    if qr_paths:
+                        return (
+                            results,
+                            [str(path) for path in qr_paths],
+                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
+                        )
                     else:
+                        return None, [], "❌ Failed to generate QR codes"
+                else:
+                    return None, [], "⚠️ No valid content to process"
 
+            except Exception as e:
+                logger.error(f"Processing error: {e}")
+                return None, [], f"❌ Error: {str(e)}"
+
+        # Set up event handlers
+        example_btn.click(load_example, outputs=[text_input])
+        clear_btn.click(clear_input, outputs=[text_input])
+        process_btn.click(
+            process_inputs,
+            inputs=[url_input, file_input, text_input, combine_data],
+            outputs=[output_json, output_gallery, output_text]
+        )
+
+        # Add helpful documentation
+        gr.Markdown("""
+        ### 🚀 Features
 
+        - **Complete URL Scraping**: Extracts every character from web pages
+        - **Advanced File Processing**: Full content extraction from text files and archives
+        - **Smart JSON Handling**: Processes any size JSON with automatic chunking
+        - **Sequential QR Codes**: Maintains data integrity across multiple codes
+        - **Modern Design**: Clean, responsive interface with visual feedback
 
+        ### 💡 Tips
 
+        1. **URLs**: Enter multiple URLs separated by commas or newlines
+        2. **Files**: Upload text files or ZIP archives containing text files
+        3. **JSON**: Use the example button to see the expected format
+        4. **QR Codes**: Choose whether to combine data into sequential codes
+        5. **Processing**: Monitor the status for real-time feedback
 
+        ### 🎨 Output
 
+        - Generated QR codes are saved in the `output/qr_codes` directory
+        - Each QR code contains metadata for proper sequencing
+        - Hover over QR codes in the gallery to see details
+        """)
 
+        return interface
 
 def main():
     """Initialize and launch the application"""
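The hunk ends at the signature of main(), whose body is unchanged and therefore not shown in this diff. For orientation, here is a minimal sketch of how the interface built by create_modern_interface() is typically launched; the launch arguments are assumptions, not the Space's actual settings.

def main():
    """Initialize and launch the application"""
    interface = create_modern_interface()
    # Assumed launch settings; a Hugging Face Space usually needs nothing more
    # than launch(), so treat these keyword arguments as illustrative.
    interface.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        show_error=True,        # surface tracebacks in the UI while debugging
    )

if __name__ == "__main__":
    main()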