acecalisto3 committed on
Commit
d1233db
·
verified ·
1 Parent(s): 86b88f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +272 -127
app.py CHANGED
@@ -194,29 +194,134 @@ class EnhancedFileProcessor:
194
  """Process uploaded file with enhanced error handling and complete extraction"""
195
  if not file:
196
  return []
197
-
198
  dataset = []
199
  try:
200
  file_size = os.path.getsize(file.name)
201
  if file_size > self.max_file_size:
202
- logger.warning(f"File size ({{file_size}} bytes) exceeds maximum allowed size")
203
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
- with tempfile.TemporaryDirectory() as temp_dir:
206
- temp_dir_path = Path(temp_dir)
207
-
208
- # Handle different archive types
209
- if self._is_archive(file.name):
210
- dataset.extend(self._process_archive(file.name, temp_dir_path))
211
- else:
212
- dataset.extend(self._process_single_file(file))
 
 
 
 
 
 
 
 
 
213
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  except Exception as e:
215
- logger.error(f"Error processing file: {{str(e)}}")
216
  return []
217
 
218
- return dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  def _is_archive(self, filepath: str) -> bool:
221
  """Check if file is an archive"""
222
  return any(filepath.lower().endswith(ext) for ext in [
@@ -569,7 +674,47 @@ def create_modern_interface():
569
  with gr.Tab("📁 File Input"):
570
  file_input = gr.File(
571
  label="Upload Files",
572
- file_types=["*"], # Allow all file types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  file_count="multiple"
574
  )
575
 
@@ -610,133 +755,133 @@ def create_modern_interface():
610
  )
611
 
612
  # Load example data
613
- def load_example():
614
- example = {
615
- "type": "product_catalog",
616
- "items": [
617
- {
618
- "id": "123",
619
- "name": "Premium Widget",
620
- "description": "High-quality widget with advanced features",
621
- "price": 299.99,
622
- "category": "electronics",
623
- "tags": ["premium", "featured", "new"]
624
- },
625
- {
626
- "id": "456",
627
- "name": "Basic Widget",
628
- "description": "Reliable widget for everyday use",
629
- "price": 149.99,
630
- "category": "electronics",
631
- "tags": ["basic", "popular"]
632
- }
633
- ],
634
- "metadata": {
635
- "timestamp": datetime.now().isoformat(),
636
- "version": "2.0",
637
- "source": "example"
638
  }
 
 
 
 
 
639
  }
640
- return json.dumps(example, indent=2)
641
-
642
- def clear_input():
643
- return ""
644
 
645
- def process_inputs(urls, files, text, combine):
646
- """Process all inputs and generate QR codes"""
647
- try:
648
- results = []
649
- url_processor = EnhancedURLProcessor()
650
- file_processor = EnhancedFileProcessor()
651
 
652
- # Process JSON input
653
- if text and text.strip():
654
- try:
655
- json_data = json.loads(text)
656
- if isinstance(json_data, list):
657
- results.extend(json_data)
658
- else:
659
- results.append(json_data)
660
- except json.JSONDecodeError as e:
661
- return None, [], f"❌ Invalid JSON format: {str(e)}"
662
-
663
- # Process URLs
664
- if urls and urls.strip():
665
- url_list = re.split(r'[,\n]', urls)
666
- url_list = [url.strip() for url in url_list if url.strip()]
667
-
668
- for url in url_list:
669
- validation = url_processor.validate_url(url)
670
- if validation['is_valid']:
671
- content = url_processor.fetch_content(url)
672
- if content:
673
- results.append({
674
- 'source': 'url',
675
- 'url': url,
676
- 'content': content,
677
- 'timestamp': datetime.now().isoformat()
678
- })
679
-
680
- # Process files
681
- if files:
682
- for file in files:
683
- file_results = file_processor.process_file(file)
684
- if file_results:
685
- results.extend(file_results)
686
-
687
- # Generate QR codes
688
- if results:
689
- qr_paths = generate_qr_codes(results, combine)
690
- if qr_paths:
691
- return (
692
- results,
693
- [str(path) for path in qr_paths],
694
- f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
695
- )
696
  else:
697
- return None, [], "❌ Failed to generate QR codes"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
  else:
699
- return None, [], "⚠️ No valid content to process"
700
-
701
- except Exception as e:
702
- logger.error(f"Processing error: {e}")
703
- return None, [], f"❌ Error: {str(e)}"
704
-
705
- # Set up event handlers
706
- example_btn.click(load_example, outputs=[text_input])
707
- clear_btn.click(clear_input, outputs=[text_input])
708
- process_btn.click(
709
- process_inputs,
710
- inputs=[url_input, file_input, text_input, combine_data],
711
- outputs=[output_json, output_gallery, output_text]
712
- )
713
 
714
- # Add helpful documentation
715
- gr.Markdown("""
716
- ### 🚀 Features
 
 
 
 
 
 
 
 
 
 
 
 
 
717
 
718
- - **Complete URL Scraping**: Extracts every character from web pages
719
- - **Advanced File Processing**: Full content extraction from text files and archives
720
- - **Smart JSON Handling**: Processes any size JSON with automatic chunking
721
- - **Sequential QR Codes**: Maintains data integrity across multiple codes
722
- - **Modern Design**: Clean, responsive interface with visual feedback
723
 
724
- ### 💡 Tips
725
 
726
- 1. **URLs**: Enter multiple URLs separated by commas or newlines
727
- 2. **Files**: Upload text files or ZIP archives containing text files
728
- 3. **JSON**: Use the example button to see the expected format
729
- 4. **QR Codes**: Choose whether to combine data into sequential codes
730
- 5. **Processing**: Monitor the status for real-time feedback
731
 
732
- ### 🎨 Output
733
 
734
- - Generated QR codes are saved in the `output/qr_codes` directory
735
- - Each QR code contains metadata for proper sequencing
736
- - Hover over QR codes in the gallery to see details
737
- """)
738
 
739
- return interface
740
 
741
  def main():
742
  """Initialize and launch the application"""
 
194
  """Process uploaded file with enhanced error handling and complete extraction"""
195
  if not file:
196
  return []
197
+
198
  dataset = []
199
  try:
200
  file_size = os.path.getsize(file.name)
201
  if file_size > self.max_file_size:
202
+ logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
203
  return []
204
+
205
+ if file.name.endswith('.pdf'):
206
+ dataset.extend(self._process_pdf(file))
207
+ elif file.name.endswith('.docx'):
208
+ dataset.extend(self._process_docx(file))
209
+ elif file.name.endswith('.csv'):
210
+ dataset.extend(self._process_csv(file))
211
+ elif file.name.endswith('.json'):
212
+ dataset.extend(self._process_json(file))
213
+ elif file.name.endswith('.xml'):
214
+ dataset.extend(self._process_xml(file))
215
+ elif file.name.endswith('.md'):
216
+ dataset.extend(self._process_markdown(file))
217
+ # Add additional conditions for other file types...
218
+
219
+ except Exception as e:
220
+ logger.error(f"Error processing file: {str(e)}")
221
+ return []
222
+
223
+ return dataset
224
+
225
+ def _process_pdf(self, file) -> List[Dict]:
226
+ """Process a PDF file and extract text"""
227
+ try:
228
+ content_parts = []
229
+ with open(file.name, 'rb') as f:
230
+ reader = PyPDF2.PdfReader(f)
231
+ for page in reader.pages:
232
+ content_parts.append(page.extract_text() or "")
233
+ complete_content = ''.join(content_parts)
234
+ return [{
235
+ 'source': 'pdf',
236
+ 'filename': os.path.basename(file.name),
237
+ 'content': complete_content,
238
+ 'timestamp': datetime.now().isoformat()
239
+ }]
240
+ except Exception as e:
241
+ logger.error(f"PDF processing error: {e}")
242
+ return []
243
 
244
+ def _process_docx(self, file) -> List[Dict]:
245
+ """Process a DOCX file and extract text"""
246
+ try:
247
+ content_parts = []
248
+ doc = docx.Document(file.name)
249
+ for para in doc.paragraphs:
250
+ content_parts.append(para.text)
251
+ complete_content = '\n'.join(content_parts)
252
+ return [{
253
+ 'source': 'docx',
254
+ 'filename': os.path.basename(file.name),
255
+ 'content': complete_content,
256
+ 'timestamp': datetime.now().isoformat()
257
+ }]
258
+ except Exception as e:
259
+ logger.error(f"DOCX processing error: {e}")
260
+ return []
261
 
262
+ def _process_csv(self, file) -> List[Dict]:
263
+ """Process a CSV file and extract text"""
264
+ try:
265
+ import pandas as pd
266
+ df = pd.read_csv(file.name)
267
+ content = df.to_string(index=False)
268
+ return [{
269
+ 'source': 'csv',
270
+ 'filename': os.path.basename(file.name),
271
+ 'content': content,
272
+ 'timestamp': datetime.now().isoformat()
273
+ }]
274
  except Exception as e:
275
+ logger.error(f"CSV processing error: {e}")
276
  return []
277
 
278
+ def _process_json(self, file) -> List[Dict]:
279
+ """Process a JSON file and extract text"""
280
+ try:
281
+ with open(file.name, 'r') as f:
282
+ content = json.load(f)
283
+ return [{
284
+ 'source': 'json',
285
+ 'filename': os.path.basename(file.name),
286
+ 'content': json.dumps(content, indent=2),
287
+ 'timestamp': datetime.now().isoformat()
288
+ }]
289
+ except Exception as e:
290
+ logger.error(f"JSON processing error: {e}")
291
+ return []
292
+
293
+ def _process_xml(self, file) -> List[Dict]:
294
+ """Process an XML file and extract text"""
295
+ try:
296
+ with open(file.name, 'r') as f:
297
+ content = f.read()
298
+ return [{
299
+ 'source': 'xml',
300
+ 'filename': os.path.basename(file.name),
301
+ 'content': content,
302
+ 'timestamp': datetime.now().isoformat()
303
+ }]
304
+ except Exception as e:
305
+ logger.error(f"XML processing error: {e}")
306
+ return []
307
 
308
+ def _process_markdown(self, file) -> List[Dict]:
309
+ """Process a Markdown file and extract text"""
310
+ try:
311
+ with open(file.name, 'r') as f:
312
+ content = f.read()
313
+ return [{
314
+ 'source': 'markdown',
315
+ 'filename': os.path.basename(file.name),
316
+ 'content': content,
317
+ 'timestamp': datetime.now().isoformat()
318
+ }]
319
+ except Exception as e:
320
+ logger.error(f"Markdown processing error: {e}")
321
+ return []
322
+
323
+ # Add similar methods for other file types as needed...
324
+
325
  def _is_archive(self, filepath: str) -> bool:
326
  """Check if file is an archive"""
327
  return any(filepath.lower().endswith(ext) for ext in [
 
674
  with gr.Tab("📁 File Input"):
675
  file_input = gr.File(
676
  label="Upload Files",
677
+ file_types=[
678
+ "text/*", # All text files
679
+ "application/pdf", # PDF files
680
+ "application/zip", # ZIP files
681
+ "application/x-zip-compressed", # Compressed ZIP files
682
+ "application/x-zip", # Another ZIP type
683
+ "application/x-rar-compressed", # RAR files
684
+ "application/x-tar", # TAR files
685
+ "application/gzip", # GZ files
686
+ "application/x-bzip2", # BZ2 files
687
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # DOCX files
688
+ "application/msword", # DOC files
689
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # XLSX files
690
+ "application/vnd.ms-excel", # XLS files
691
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation", # PPTX files
692
+ "application/vnd.ms-powerpoint", # PPT files
693
+ "application/json", # JSON files
694
+ "application/xml", # XML files
695
+ "text/csv", # CSV files
696
+ "text/markdown", # Markdown files
697
+ "application/octet-stream", # Binary files
698
+ "application/x-7z-compressed", # 7z files
699
+ "application/x-iso9660-image", # ISO files
700
+ "application/x-dosexec", # EXE files
701
+ "application/x-sh", # Shell script files
702
+ "application/x-php", # PHP files
703
+ "application/x-python", # Python files
704
+ "application/x-java-archive", # JAR files
705
+ "application/x-asp", # ASP files
706
+ "application/x-c", # C source files
707
+ "application/x-c++", # C++ source files
708
+ "application/x-ruby", # Ruby files
709
+ "application/x-perl", # Perl files
710
+ "application/x-go", # Go files
711
+ "application/x-swift", # Swift files
712
+ "application/x-xml", # XML files
713
+ "application/x-yaml", # YAML files
714
+ "application/x-ini", # INI files
715
+ "application/x-log", # Log files
716
+ "application/x-configuration", # Configuration files
717
+ ],
718
  file_count="multiple"
719
  )
720
 
 
755
  )
756
 
757
  # Load example data
758
+ def load_example():
759
+ example = {
760
+ "type": "product_catalog",
761
+ "items": [
762
+ {
763
+ "id": "123",
764
+ "name": "Premium Widget",
765
+ "description": "High-quality widget with advanced features",
766
+ "price": 299.99,
767
+ "category": "electronics",
768
+ "tags": ["premium", "featured", "new"]
769
+ },
770
+ {
771
+ "id": "456",
772
+ "name": "Basic Widget",
773
+ "description": "Reliable widget for everyday use",
774
+ "price": 149.99,
775
+ "category": "electronics",
776
+ "tags": ["basic", "popular"]
 
 
 
 
 
 
777
  }
778
+ ],
779
+ "metadata": {
780
+ "timestamp": datetime.now().isoformat(),
781
+ "version": "2.0",
782
+ "source": "example"
783
  }
784
+ }
785
+ return json.dumps(example, indent=2)
 
 
786
 
787
+ def clear_input():
788
+ return ""
 
 
 
 
789
 
790
+ def process_inputs(urls, files, text, combine):
791
+ """Process all inputs and generate QR codes"""
792
+ try:
793
+ results = []
794
+ url_processor = EnhancedURLProcessor()
795
+ file_processor = EnhancedFileProcessor()
796
+
797
+ # Process JSON input
798
+ if text and text.strip():
799
+ try:
800
+ json_data = json.loads(text)
801
+ if isinstance(json_data, list):
802
+ results.extend(json_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  else:
804
+ results.append(json_data)
805
+ except json.JSONDecodeError as e:
806
+ return None, [], f"❌ Invalid JSON format: {str(e)}"
807
+
808
+ # Process URLs
809
+ if urls and urls.strip():
810
+ url_list = re.split(r'[,\n]', urls)
811
+ url_list = [url.strip() for url in url_list if url.strip()]
812
+
813
+ for url in url_list:
814
+ validation = url_processor.validate_url(url)
815
+ if validation['is_valid']:
816
+ content = url_processor.fetch_content(url)
817
+ if content:
818
+ results.append({
819
+ 'source': 'url',
820
+ 'url': url,
821
+ 'content': content,
822
+ 'timestamp': datetime.now().isoformat()
823
+ })
824
+
825
+ # Process files
826
+ if files:
827
+ for file in files:
828
+ file_results = file_processor.process_file(file)
829
+ if file_results:
830
+ results.extend(file_results)
831
+
832
+ # Generate QR codes
833
+ if results:
834
+ qr_paths = generate_qr_codes(results, combine)
835
+ if qr_paths:
836
+ return (
837
+ results,
838
+ [str(path) for path in qr_paths],
839
+ f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
840
+ )
841
  else:
842
+ return None, [], "❌ Failed to generate QR codes"
843
+ else:
844
+ return None, [], "⚠️ No valid content to process"
 
 
 
 
 
 
 
 
 
 
 
845
 
846
+ except Exception as e:
847
+ logger.error(f"Processing error: {e}")
848
+ return None, [], f"❌ Error: {str(e)}"
849
+
850
+ # Set up event handlers
851
+ example_btn.click(load_example, outputs=[text_input])
852
+ clear_btn.click(clear_input, outputs=[text_input])
853
+ process_btn.click(
854
+ process_inputs,
855
+ inputs=[url_input, file_input, text_input, combine_data],
856
+ outputs=[output_json, output_gallery, output_text]
857
+ )
858
+
859
+ # Add helpful documentation
860
+ gr.Markdown("""
861
+ ### 🚀 Features
862
 
863
+ - **Complete URL Scraping**: Extracts every character from web pages
864
+ - **Advanced File Processing**: Full content extraction from text files and archives
865
+ - **Smart JSON Handling**: Processes any size JSON with automatic chunking
866
+ - **Sequential QR Codes**: Maintains data integrity across multiple codes
867
+ - **Modern Design**: Clean, responsive interface with visual feedback
868
 
869
+ ### 💡 Tips
870
 
871
+ 1. **URLs**: Enter multiple URLs separated by commas or newlines
872
+ 2. **Files**: Upload text files or ZIP archives containing text files
873
+ 3. **JSON**: Use the example button to see the expected format
874
+ 4. **QR Codes**: Choose whether to combine data into sequential codes
875
+ 5. **Processing**: Monitor the status for real-time feedback
876
 
877
+ ### 🎨 Output
878
 
879
+ - Generated QR codes are saved in the `output/qr_codes` directory
880
+ - Each QR code contains metadata for proper sequencing
881
+ - Hover over QR codes in the gallery to see details
882
+ """)
883
 
884
+ return interface
885
 
886
  def main():
887
  """Initialize and launch the application"""