acecalisto3 committed on
Commit
5909e94
·
verified ·
1 Parent(s): 21309b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -272
app.py CHANGED
@@ -194,134 +194,29 @@ class EnhancedFileProcessor:
194
  """Process uploaded file with enhanced error handling and complete extraction"""
195
  if not file:
196
  return []
197
-
198
  dataset = []
199
  try:
200
  file_size = os.path.getsize(file.name)
201
  if file_size > self.max_file_size:
202
- logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
203
  return []
204
-
205
- if file.name.endswith('.pdf'):
206
- dataset.extend(self._process_pdf(file))
207
- elif file.name.endswith('.docx'):
208
- dataset.extend(self._process_docx(file))
209
- elif file.name.endswith('.csv'):
210
- dataset.extend(self._process_csv(file))
211
- elif file.name.endswith('.json'):
212
- dataset.extend(self._process_json(file))
213
- elif file.name.endswith('.xml'):
214
- dataset.extend(self._process_xml(file))
215
- elif file.name.endswith('.md'):
216
- dataset.extend(self._process_markdown(file))
217
- # Add additional conditions for other file types...
218
-
219
- except Exception as e:
220
- logger.error(f"Error processing file: {str(e)}")
221
- return []
222
-
223
- return dataset
224
-
225
- def _process_pdf(self, file) -> List[Dict]:
226
- """Process a PDF file and extract text"""
227
- try:
228
- content_parts = []
229
- with open(file.name, 'rb') as f:
230
- reader = PyPDF2.PdfReader(f)
231
- for page in reader.pages:
232
- content_parts.append(page.extract_text() or "")
233
- complete_content = ''.join(content_parts)
234
- return [{
235
- 'source': 'pdf',
236
- 'filename': os.path.basename(file.name),
237
- 'content': complete_content,
238
- 'timestamp': datetime.now().isoformat()
239
- }]
240
- except Exception as e:
241
- logger.error(f"PDF processing error: {e}")
242
- return []
243
 
244
- def _process_docx(self, file) -> List[Dict]:
245
- """Process a DOCX file and extract text"""
246
- try:
247
- content_parts = []
248
- doc = docx.Document(file.name)
249
- for para in doc.paragraphs:
250
- content_parts.append(para.text)
251
- complete_content = '\n'.join(content_parts)
252
- return [{
253
- 'source': 'docx',
254
- 'filename': os.path.basename(file.name),
255
- 'content': complete_content,
256
- 'timestamp': datetime.now().isoformat()
257
- }]
258
- except Exception as e:
259
- logger.error(f"DOCX processing error: {e}")
260
- return []
261
-
262
- def _process_csv(self, file) -> List[Dict]:
263
- """Process a CSV file and extract text"""
264
- try:
265
- import pandas as pd
266
- df = pd.read_csv(file.name)
267
- content = df.to_string(index=False)
268
- return [{
269
- 'source': 'csv',
270
- 'filename': os.path.basename(file.name),
271
- 'content': content,
272
- 'timestamp': datetime.now().isoformat()
273
- }]
274
- except Exception as e:
275
- logger.error(f"CSV processing error: {e}")
276
- return []
277
-
278
- def _process_json(self, file) -> List[Dict]:
279
- """Process a JSON file and extract text"""
280
- try:
281
- with open(file.name, 'r') as f:
282
- content = json.load(f)
283
- return [{
284
- 'source': 'json',
285
- 'filename': os.path.basename(file.name),
286
- 'content': json.dumps(content, indent=2),
287
- 'timestamp': datetime.now().isoformat()
288
- }]
289
- except Exception as e:
290
- logger.error(f"JSON processing error: {e}")
291
- return []
292
 
293
- def _process_xml(self, file) -> List[Dict]:
294
- """Process an XML file and extract text"""
295
- try:
296
- with open(file.name, 'r') as f:
297
- content = f.read()
298
- return [{
299
- 'source': 'xml',
300
- 'filename': os.path.basename(file.name),
301
- 'content': content,
302
- 'timestamp': datetime.now().isoformat()
303
- }]
304
  except Exception as e:
305
- logger.error(f"XML processing error: {e}")
306
  return []
307
 
308
- def _process_markdown(self, file) -> List[Dict]:
309
- """Process a Markdown file and extract text"""
310
- try:
311
- with open(file.name, 'r') as f:
312
- content = f.read()
313
- return [{
314
- 'source': 'markdown',
315
- 'filename': os.path.basename(file.name),
316
- 'content': content,
317
- 'timestamp': datetime.now().isoformat()
318
- }]
319
- except Exception as e:
320
- logger.error(f"Markdown processing error: {e}")
321
- return []
322
 
323
- # Add similar methods for other file types as needed...
324
-
325
  def _is_archive(self, filepath: str) -> bool:
326
  """Check if file is an archive"""
327
  return any(filepath.lower().endswith(ext) for ext in [
@@ -665,47 +560,7 @@ def create_modern_interface():
665
  with gr.Tab("📁 File Input"):
666
  file_input = gr.File(
667
  label="Upload Files",
668
- file_types=[
669
- "text/*", # All text files
670
- "application/pdf", # PDF files
671
- "application/zip", # ZIP files
672
- "application/x-zip-compressed", # Compressed ZIP files
673
- "application/x-zip", # Another ZIP type
674
- "application/x-rar-compressed", # RAR files
675
- "application/x-tar", # TAR files
676
- "application/gzip", # GZ files
677
- "application/x-bzip2", # BZ2 files
678
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # DOCX files
679
- "application/msword", # DOC files
680
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # XLSX files
681
- "application/vnd.ms-excel", # XLS files
682
- "application/vnd.openxmlformats-officedocument.presentationml.presentation", # PPTX files
683
- "application/vnd.ms-powerpoint", # PPT files
684
- "application/json", # JSON files
685
- "application/xml", # XML files
686
- "text/csv", # CSV files
687
- "text/markdown", # Markdown files
688
- "application/octet-stream", # Binary files
689
- "application/x-7z-compressed", # 7z files
690
- "application/x-iso9660-image", # ISO files
691
- "application/x-dosexec", # EXE files
692
- "application/x-sh", # Shell script files
693
- "application/x-php", # PHP files
694
- "application/x-python", # Python files
695
- "application/x-java-archive", # JAR files
696
- "application/x-asp", # ASP files
697
- "application/x-c", # C source files
698
- "application/x-c++", # C++ source files
699
- "application/x-ruby", # Ruby files
700
- "application/x-perl", # Perl files
701
- "application/x-go", # Go files
702
- "application/x-swift", # Swift files
703
- "application/x-xml", # XML files
704
- "application/x-yaml", # YAML files
705
- "application/x-ini", # INI files
706
- "application/x-log", # Log files
707
- "application/x-configuration", # Configuration files
708
- ],
709
  file_count="multiple"
710
  )
711
 
@@ -746,133 +601,133 @@ def create_modern_interface():
746
  )
747
 
748
  # Load example data
749
- def load_example():
750
- example = {
751
- "type": "product_catalog",
752
- "items": [
753
- {
754
- "id": "123",
755
- "name": "Premium Widget",
756
- "description": "High-quality widget with advanced features",
757
- "price": 299.99,
758
- "category": "electronics",
759
- "tags": ["premium", "featured", "new"]
760
- },
761
- {
762
- "id": "456",
763
- "name": "Basic Widget",
764
- "description": "Reliable widget for everyday use",
765
- "price": 149.99,
766
- "category": "electronics",
767
- "tags": ["basic", "popular"]
 
 
 
 
 
 
768
  }
769
- ],
770
- "metadata": {
771
- "timestamp": datetime.now().isoformat(),
772
- "version": "2.0",
773
- "source": "example"
774
  }
775
- }
776
- return json.dumps(example, indent=2)
777
 
778
- def clear_input():
779
- return ""
780
 
781
- def process_inputs(urls, files, text, combine):
782
- """Process all inputs and generate QR codes"""
783
- try:
784
- results = []
785
- url_processor = EnhancedURLProcessor()
786
- file_processor = EnhancedFileProcessor()
787
-
788
- # Process JSON input
789
- if text and text.strip():
790
- try:
791
- json_data = json.loads(text)
792
- if isinstance(json_data, list):
793
- results.extend(json_data)
794
- else:
795
- results.append(json_data)
796
- except json.JSONDecodeError as e:
797
- return None, [], f"❌ Invalid JSON format: {str(e)}"
798
-
799
- # Process URLs
800
- if urls and urls.strip():
801
- url_list = re.split(r'[,\n]', urls)
802
- url_list = [url.strip() for url in url_list if url.strip()]
803
-
804
- for url in url_list:
805
- validation = url_processor.validate_url(url)
806
- if validation['is_valid']:
807
- content = url_processor.fetch_content(url)
808
- if content:
809
- results.append({
810
- 'source': 'url',
811
- 'url': url,
812
- 'content': content,
813
- 'timestamp': datetime.now().isoformat()
814
- })
815
-
816
- # Process files
817
- if files:
818
- for file in files:
819
- file_results = file_processor.process_file(file)
820
  if file_results:
821
- results.extend(file_results)
822
-
823
- # Generate QR codes
824
- if results:
825
- qr_paths = generate_qr_codes(results, combine)
826
- if qr_paths:
827
- return (
828
- results,
829
- [str(path) for path in qr_paths],
830
- f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
831
- )
 
 
832
  else:
833
- return None, [], "❌ Failed to generate QR codes"
834
- else:
835
- return None, [], "⚠️ No valid content to process"
 
 
 
 
 
 
 
 
 
 
 
836
 
837
- except Exception as e:
838
- logger.error(f"Processing error: {e}")
839
- return None, [], f"❌ Error: {str(e)}"
840
-
841
- # Set up event handlers
842
- example_btn.click(load_example, outputs=[text_input])
843
- clear_btn.click(clear_input, outputs=[text_input])
844
- process_btn.click(
845
- process_inputs,
846
- inputs=[url_input, file_input, text_input, combine_data],
847
- outputs=[output_json, output_gallery, output_text]
848
- )
849
-
850
- # Add helpful documentation
851
- gr.Markdown("""
852
- ### 🚀 Features
853
 
854
- - **Complete URL Scraping**: Extracts every character from web pages
855
- - **Advanced File Processing**: Full content extraction from text files and archives
856
- - **Smart JSON Handling**: Processes any size JSON with automatic chunking
857
- - **Sequential QR Codes**: Maintains data integrity across multiple codes
858
- - **Modern Design**: Clean, responsive interface with visual feedback
859
 
860
- ### 💡 Tips
861
 
862
- 1. **URLs**: Enter multiple URLs separated by commas or newlines
863
- 2. **Files**: Upload text files or ZIP archives containing text files
864
- 3. **JSON**: Use the example button to see the expected format
865
- 4. **QR Codes**: Choose whether to combine data into sequential codes
866
- 5. **Processing**: Monitor the status for real-time feedback
867
 
868
- ### 🎨 Output
869
 
870
- - Generated QR codes are saved in the `output/qr_codes` directory
871
- - Each QR code contains metadata for proper sequencing
872
- - Hover over QR codes in the gallery to see details
873
- """)
874
 
875
- return interface
876
 
877
  def main():
878
  """Initialize and launch the application"""
 
194
  """Process uploaded file with enhanced error handling and complete extraction"""
195
  if not file:
196
  return []
197
+
198
  dataset = []
199
  try:
200
  file_size = os.path.getsize(file.name)
201
  if file_size > self.max_file_size:
202
+ logger.warning(f"File size ({{file_size}} bytes) exceeds maximum allowed size")
203
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
+ with tempfile.TemporaryDirectory() as temp_dir:
206
+ temp_dir_path = Path(temp_dir)
207
+
208
+ # Handle different archive types
209
+ if self._is_archive(file.name):
210
+ dataset.extend(self._process_archive(file.name, temp_dir_path))
211
+ else:
212
+ dataset.extend(self._process_single_file(file))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
 
 
 
 
 
 
 
 
 
 
 
214
  except Exception as e:
215
+ logger.error(f"Error processing file: {{str(e)}}")
216
  return []
217
 
218
+ return dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
 
 
220
  def _is_archive(self, filepath: str) -> bool:
221
  """Check if file is an archive"""
222
  return any(filepath.lower().endswith(ext) for ext in [
 
560
  with gr.Tab("📁 File Input"):
561
  file_input = gr.File(
562
  label="Upload Files",
563
+ file_types=["*"], # Allow all file types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  file_count="multiple"
565
  )
566
 
 
601
  )
602
 
603
  # Load example data
604
+ def load_example():
605
+ example = {
606
+ "type": "product_catalog",
607
+ "items": [
608
+ {
609
+ "id": "123",
610
+ "name": "Premium Widget",
611
+ "description": "High-quality widget with advanced features",
612
+ "price": 299.99,
613
+ "category": "electronics",
614
+ "tags": ["premium", "featured", "new"]
615
+ },
616
+ {
617
+ "id": "456",
618
+ "name": "Basic Widget",
619
+ "description": "Reliable widget for everyday use",
620
+ "price": 149.99,
621
+ "category": "electronics",
622
+ "tags": ["basic", "popular"]
623
+ }
624
+ ],
625
+ "metadata": {
626
+ "timestamp": datetime.now().isoformat(),
627
+ "version": "2.0",
628
+ "source": "example"
629
  }
 
 
 
 
 
630
  }
631
+ return json.dumps(example, indent=2)
 
632
 
633
+ def clear_input():
634
+ return ""
635
 
636
+ def process_inputs(urls, files, text, combine):
637
+ """Process all inputs and generate QR codes"""
638
+ try:
639
+ results = []
640
+ url_processor = EnhancedURLProcessor()
641
+ file_processor = EnhancedFileProcessor()
642
+
643
+ # Process JSON input
644
+ if text and text.strip():
645
+ try:
646
+ json_data = json.loads(text)
647
+ if isinstance(json_data, list):
648
+ results.extend(json_data)
649
+ else:
650
+ results.append(json_data)
651
+ except json.JSONDecodeError as e:
652
+ return None, [], f"❌ Invalid JSON format: {str(e)}"
653
+
654
+ # Process URLs
655
+ if urls and urls.strip():
656
+ url_list = re.split(r'[,\n]', urls)
657
+ url_list = [url.strip() for url in url_list if url.strip()]
658
+
659
+ for url in url_list:
660
+ validation = url_processor.validate_url(url)
661
+ if validation['is_valid']:
662
+ content = url_processor.fetch_content(url)
663
+ if content:
664
+ results.append({
665
+ 'source': 'url',
666
+ 'url': url,
667
+ 'content': content,
668
+ 'timestamp': datetime.now().isoformat()
669
+ })
670
+
671
+ # Process files
672
+ if files:
673
+ for file in files:
674
+ file_results = file_processor.process_file(file)
675
  if file_results:
676
+ results.extend(file_results)
677
+
678
+ # Generate QR codes
679
+ if results:
680
+ qr_paths = generate_qr_codes(results, combine)
681
+ if qr_paths:
682
+ return (
683
+ results,
684
+ [str(path) for path in qr_paths],
685
+ f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
686
+ )
687
+ else:
688
+ return None, [], "❌ Failed to generate QR codes"
689
  else:
690
+ return None, [], "⚠️ No valid content to process"
691
+
692
+ except Exception as e:
693
+ logger.error(f"Processing error: {e}")
694
+ return None, [], f"❌ Error: {str(e)}"
695
+
696
+ # Set up event handlers
697
+ example_btn.click(load_example, outputs=[text_input])
698
+ clear_btn.click(clear_input, outputs=[text_input])
699
+ process_btn.click(
700
+ process_inputs,
701
+ inputs=[url_input, file_input, text_input, combine_data],
702
+ outputs=[output_json, output_gallery, output_text]
703
+ )
704
 
705
+ # Add helpful documentation
706
+ gr.Markdown("""
707
+ ### 🚀 Features
 
 
 
 
 
 
 
 
 
 
 
 
 
708
 
709
+ - **Complete URL Scraping**: Extracts every character from web pages
710
+ - **Advanced File Processing**: Full content extraction from text files and archives
711
+ - **Smart JSON Handling**: Processes any size JSON with automatic chunking
712
+ - **Sequential QR Codes**: Maintains data integrity across multiple codes
713
+ - **Modern Design**: Clean, responsive interface with visual feedback
714
 
715
+ ### 💡 Tips
716
 
717
+ 1. **URLs**: Enter multiple URLs separated by commas or newlines
718
+ 2. **Files**: Upload text files or ZIP archives containing text files
719
+ 3. **JSON**: Use the example button to see the expected format
720
+ 4. **QR Codes**: Choose whether to combine data into sequential codes
721
+ 5. **Processing**: Monitor the status for real-time feedback
722
 
723
+ ### 🎨 Output
724
 
725
+ - Generated QR codes are saved in the `output/qr_codes` directory
726
+ - Each QR code contains metadata for proper sequencing
727
+ - Hover over QR codes in the gallery to see details
728
+ """)
729
 
730
+ return interface
731
 
732
  def main():
733
  """Initialize and launch the application"""