Spaces:

acecalisto3
/

urld

Running

App Files Community

acecalisto3 committed on Apr 25

Commit

5909e94

verified ·

1 Parent(s): 21309b3

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -272

app.py CHANGED Viewed

@@ -194,134 +194,29 @@ class EnhancedFileProcessor:
         """Process uploaded file with enhanced error handling and complete extraction"""
         if not file:
             return []
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
             if file_size > self.max_file_size:
-                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
-            if file.name.endswith('.pdf'):
-                dataset.extend(self._process_pdf(file))
-            elif file.name.endswith('.docx'):
-                dataset.extend(self._process_docx(file))
-            elif file.name.endswith('.csv'):
-                dataset.extend(self._process_csv(file))
-            elif file.name.endswith('.json'):
-                dataset.extend(self._process_json(file))
-            elif file.name.endswith('.xml'):
-                dataset.extend(self._process_xml(file))
-            elif file.name.endswith('.md'):
-                dataset.extend(self._process_markdown(file))
-            # Add additional conditions for other file types...
-        except Exception as e:
-            logger.error(f"Error processing file: {str(e)}")
-            return []
-        return dataset
-    def _process_pdf(self, file) -> List[Dict]:
-        """Process a PDF file and extract text"""
-        try:
-            content_parts = []
-            with open(file.name, 'rb') as f:
-                reader = PyPDF2.PdfReader(f)
-                for page in reader.pages:
-                    content_parts.append(page.extract_text() or "")
-            complete_content = ''.join(content_parts)
-            return [{
-                'source': 'pdf',
-                'filename': os.path.basename(file.name),
-                'content': complete_content,
-                'timestamp': datetime.now().isoformat()
-            }]
-        except Exception as e:
-            logger.error(f"PDF processing error: {e}")
-            return []
-    def _process_docx(self, file) -> List[Dict]:
-        """Process a DOCX file and extract text"""
-        try:
-            content_parts = []
-            doc = docx.Document(file.name)
-            for para in doc.paragraphs:
-                content_parts.append(para.text)
-            complete_content = '\n'.join(content_parts)
-            return [{
-                'source': 'docx',
-                'filename': os.path.basename(file.name),
-                'content': complete_content,
-                'timestamp': datetime.now().isoformat()
-            }]
-        except Exception as e:
-            logger.error(f"DOCX processing error: {e}")
-            return []
-    def _process_csv(self, file) -> List[Dict]:
-        """Process a CSV file and extract text"""
-        try:
-            import pandas as pd
-            df = pd.read_csv(file.name)
-            content = df.to_string(index=False)
-            return [{
-                'source': 'csv',
-                'filename': os.path.basename(file.name),
-                'content': content,
-                'timestamp': datetime.now().isoformat()
-            }]
-        except Exception as e:
-            logger.error(f"CSV processing error: {e}")
-            return []
-    def _process_json(self, file) -> List[Dict]:
-        """Process a JSON file and extract text"""
-        try:
-            with open(file.name, 'r') as f:
-                content = json.load(f)
-            return [{
-                'source': 'json',
-                'filename': os.path.basename(file.name),
-                'content': json.dumps(content, indent=2),
-                'timestamp': datetime.now().isoformat()
-            }]
-        except Exception as e:
-            logger.error(f"JSON processing error: {e}")
-            return []
-    def _process_xml(self, file) -> List[Dict]:
-        """Process an XML file and extract text"""
-        try:
-            with open(file.name, 'r') as f:
-                content = f.read()
-            return [{
-                'source': 'xml',
-                'filename': os.path.basename(file.name),
-                'content': content,
-                'timestamp': datetime.now().isoformat()
-            }]
         except Exception as e:
-            logger.error(f"XML processing error: {e}")
             return []
-    def _process_markdown(self, file) -> List[Dict]:
-        """Process a Markdown file and extract text"""
-        try:
-            with open(file.name, 'r') as f:
-                content = f.read()
-            return [{
-                'source': 'markdown',
-                'filename': os.path.basename(file.name),
-                'content': content,
-                'timestamp': datetime.now().isoformat()
-            }]
-        except Exception as e:
-            logger.error(f"Markdown processing error: {e}")
-            return []
-    # Add similar methods for other file types as needed...
     def _is_archive(self, filepath: str) -> bool:
         """Check if file is an archive"""
         return any(filepath.lower().endswith(ext) for ext in [
@@ -665,47 +560,7 @@ def create_modern_interface():
         with gr.Tab("📁 File Input"):
             file_input = gr.File(
                 label="Upload Files",
-                file_types=[
-                    "text/*",  # All text files
-                    "application/pdf",  # PDF files
-                    "application/zip",  # ZIP files
-                    "application/x-zip-compressed",  # Compressed ZIP files
-                    "application/x-zip",  # Another ZIP type
-                    "application/x-rar-compressed",  # RAR files
-                    "application/x-tar",  # TAR files
-                    "application/gzip",  # GZ files
-                    "application/x-bzip2",  # BZ2 files
-                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # DOCX files
-                    "application/msword",  # DOC files
-                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # XLSX files
-                    "application/vnd.ms-excel",  # XLS files
-                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # PPTX files
-                    "application/vnd.ms-powerpoint",  # PPT files
-                    "application/json",  # JSON files
-                    "application/xml",  # XML files
-                    "text/csv",  # CSV files
-                    "text/markdown",  # Markdown files
-                    "application/octet-stream",  # Binary files
-                    "application/x-7z-compressed",  # 7z files
-                    "application/x-iso9660-image",  # ISO files
-                    "application/x-dosexec",  # EXE files
-                    "application/x-sh",  # Shell script files
-                    "application/x-php",  # PHP files
-                    "application/x-python",  # Python files
-                    "application/x-java-archive",  # JAR files
-                    "application/x-asp",  # ASP files
-                    "application/x-c",  # C source files
-                    "application/x-c++",  # C++ source files
-                    "application/x-ruby",  # Ruby files
-                    "application/x-perl",  # Perl files
-                    "application/x-go",  # Go files
-                    "application/x-swift",  # Swift files
-                    "application/x-xml",  # XML files
-                    "application/x-yaml",  # YAML files
-                    "application/x-ini",  # INI files
-                    "application/x-log",  # Log files
-                    "application/x-configuration",  # Configuration files
-                ],
                 file_count="multiple"
             )
@@ -746,133 +601,133 @@ def create_modern_interface():
         )
         # Load example data
-    def load_example():
-        example = {
-            "type": "product_catalog",
-            "items": [
-                {
-                    "id": "123",
-                    "name": "Premium Widget",
-                    "description": "High-quality widget with advanced features",
-                    "price": 299.99,
-                    "category": "electronics",
-                    "tags": ["premium", "featured", "new"]
-                },
-                {
-                    "id": "456",
-                    "name": "Basic Widget",
-                    "description": "Reliable widget for everyday use",
-                    "price": 149.99,
-                    "category": "electronics",
-                    "tags": ["basic", "popular"]
                 }
-            ],
-            "metadata": {
-                "timestamp": datetime.now().isoformat(),
-                "version": "2.0",
-                "source": "example"
             }
-        }
-        return json.dumps(example, indent=2)
-    def clear_input():
-        return ""
-    def process_inputs(urls, files, text, combine):
-        """Process all inputs and generate QR codes"""
-        try:
-            results = []
-            url_processor = EnhancedURLProcessor()
-            file_processor = EnhancedFileProcessor()
-            # Process JSON input
-            if text and text.strip():
-                try:
-                    json_data = json.loads(text)
-                    if isinstance(json_data, list):
-                        results.extend(json_data)
-                    else:
-                        results.append(json_data)
-                except json.JSONDecodeError as e:
-                    return None, [], f"❌ Invalid JSON format: {str(e)}"
-            # Process URLs
-            if urls and urls.strip():
-                url_list = re.split(r'[,\n]', urls)
-                url_list = [url.strip() for url in url_list if url.strip()]
-                for url in url_list:
-                    validation = url_processor.validate_url(url)
-                    if validation['is_valid']:
-                        content = url_processor.fetch_content(url)
-                        if content:
-                            results.append({
-                                'source': 'url',
-                                'url': url,
-                                'content': content,
-                                'timestamp': datetime.now().isoformat()
-                            })
-            # Process files
-            if files:
-                for file in files:
-                    file_results = file_processor.process_file(file)
                         if file_results:
-                        results.extend(file_results)
-            # Generate QR codes
-            if results:
-                qr_paths = generate_qr_codes(results, combine)
-                if qr_paths:
-                    return (
-                        results,
-                        [str(path) for path in qr_paths],
-                        f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
-                    )
                 else:
-                    return None, [], "❌ Failed to generate QR codes"
-            else:
-                return None, [], "⚠️ No valid content to process"
-        except Exception as e:
-            logger.error(f"Processing error: {e}")
-            return None, [], f"❌ Error: {str(e)}"
-    # Set up event handlers
-    example_btn.click(load_example, outputs=[text_input])
-    clear_btn.click(clear_input, outputs=[text_input])
-    process_btn.click(
-        process_inputs,
-        inputs=[url_input, file_input, text_input, combine_data],
-        outputs=[output_json, output_gallery, output_text]
-    )
-    # Add helpful documentation
-    gr.Markdown("""
-    ### 🚀 Features
-    - **Complete URL Scraping**: Extracts every character from web pages
-    - **Advanced File Processing**: Full content extraction from text files and archives
-    - **Smart JSON Handling**: Processes any size JSON with automatic chunking
-    - **Sequential QR Codes**: Maintains data integrity across multiple codes
-    - **Modern Design**: Clean, responsive interface with visual feedback
-    ### 💡 Tips
-    1. **URLs**: Enter multiple URLs separated by commas or newlines
-    2. **Files**: Upload text files or ZIP archives containing text files
-    3. **JSON**: Use the example button to see the expected format
-    4. **QR Codes**: Choose whether to combine data into sequential codes
-    5. **Processing**: Monitor the status for real-time feedback
-    ### 🎨 Output
-    - Generated QR codes are saved in the `output/qr_codes` directory
-    - Each QR code contains metadata for proper sequencing
-    - Hover over QR codes in the gallery to see details
-    """)
-return interface
 def main():
     """Initialize and launch the application"""

         """Process uploaded file with enhanced error handling and complete extraction"""
         if not file:
             return []
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
             if file_size > self.max_file_size:
+                logger.warning(f"File size ({{file_size}} bytes) exceeds maximum allowed size")
                 return []
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_dir_path = Path(temp_dir)
+                # Handle different archive types
+                if self._is_archive(file.name):
+                    dataset.extend(self._process_archive(file.name, temp_dir_path))
+                else:
+                    dataset.extend(self._process_single_file(file))
         except Exception as e:
+            logger.error(f"Error processing file: {{str(e)}}")
             return []
+        return dataset
     def _is_archive(self, filepath: str) -> bool:
         """Check if file is an archive"""
         return any(filepath.lower().endswith(ext) for ext in [
         with gr.Tab("📁 File Input"):
             file_input = gr.File(
                 label="Upload Files",
+                file_types=["*"],  # Allow all file types
                 file_count="multiple"
             )
         )
         # Load example data
+        def load_example():
+            example = {
+                "type": "product_catalog",
+                "items": [
+                    {
+                        "id": "123",
+                        "name": "Premium Widget",
+                        "description": "High-quality widget with advanced features",
+                        "price": 299.99,
+                        "category": "electronics",
+                        "tags": ["premium", "featured", "new"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Basic Widget",
+                        "description": "Reliable widget for everyday use",
+                        "price": 149.99,
+                        "category": "electronics",
+                        "tags": ["basic", "popular"]
+                    }
+                ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "2.0",
+                    "source": "example"
                 }
             }
+            return json.dumps(example, indent=2)
+        def clear_input():
+            return ""
+        def process_inputs(urls, files, text, combine):
+            """Process all inputs and generate QR codes"""
+            try:
+                results = []
+                url_processor = EnhancedURLProcessor()
+                file_processor = EnhancedFileProcessor()
+                # Process JSON input
+                if text and text.strip():
+                    try:
+                        json_data = json.loads(text)
+                        if isinstance(json_data, list):
+                            results.extend(json_data)
+                        else:
+                            results.append(json_data)
+                    except json.JSONDecodeError as e:
+                        return None, [], f"❌ Invalid JSON format: {str(e)}"
+                # Process URLs
+                if urls and urls.strip():
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+                    for url in url_list:
+                        validation = url_processor.validate_url(url)
+                        if validation['is_valid']:
+                            content = url_processor.fetch_content(url)
+                            if content:
+                                results.append({
+                                    'source': 'url',
+                                    'url': url,
+                                    'content': content,
+                                    'timestamp': datetime.now().isoformat()
+                                })
+                # Process files
+                if files:
+                    for file in files:
+                        file_results = file_processor.process_file(file)
                         if file_results:
+                            results.extend(file_results)
+                # Generate QR codes
+                if results:
+                    qr_paths = generate_qr_codes(results, combine)
+                    if qr_paths:
+                        return (
+                            results,
+                            [str(path) for path in qr_paths],
+                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
+                        )
+                    else:
+                        return None, [], "❌ Failed to generate QR codes"
                 else:
+                    return None, [], "⚠️ No valid content to process"
+            except Exception as e:
+                logger.error(f"Processing error: {e}")
+                return None, [], f"❌ Error: {str(e)}"
+        # Set up event handlers
+        example_btn.click(load_example, outputs=[text_input])
+        clear_btn.click(clear_input, outputs=[text_input])
+        process_btn.click(
+            process_inputs,
+            inputs=[url_input, file_input, text_input, combine_data],
+            outputs=[output_json, output_gallery, output_text]
+        )
+        # Add helpful documentation
+        gr.Markdown("""
+        ### 🚀 Features
+        - **Complete URL Scraping**: Extracts every character from web pages
+        - **Advanced File Processing**: Full content extraction from text files and archives
+        - **Smart JSON Handling**: Processes any size JSON with automatic chunking
+        - **Sequential QR Codes**: Maintains data integrity across multiple codes
+        - **Modern Design**: Clean, responsive interface with visual feedback
+        ### 💡 Tips
+        1. **URLs**: Enter multiple URLs separated by commas or newlines
+        2. **Files**: Upload text files or ZIP archives containing text files
+        3. **JSON**: Use the example button to see the expected format
+        4. **QR Codes**: Choose whether to combine data into sequential codes
+        5. **Processing**: Monitor the status for real-time feedback
+        ### 🎨 Output
+        - Generated QR codes are saved in the `output/qr_codes` directory
+        - Each QR code contains metadata for proper sequencing
+        - Hover over QR codes in the gallery to see details
+        """)
+    return interface
 def main():
     """Initialize and launch the application"""