Update app.py
app.py CHANGED
@@ -194,29 +194,134 @@ class EnhancedFileProcessor:
         """Process uploaded file with enhanced error handling and complete extraction"""
         if not file:
             return []
+
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
             if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
+
+            if file.name.endswith('.pdf'):
+                dataset.extend(self._process_pdf(file))
+            elif file.name.endswith('.docx'):
+                dataset.extend(self._process_docx(file))
+            elif file.name.endswith('.csv'):
+                dataset.extend(self._process_csv(file))
+            elif file.name.endswith('.json'):
+                dataset.extend(self._process_json(file))
+            elif file.name.endswith('.xml'):
+                dataset.extend(self._process_xml(file))
+            elif file.name.endswith('.md'):
+                dataset.extend(self._process_markdown(file))
+            # Add additional conditions for other file types...
+
+        except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
+
+        return dataset
+
+    def _process_pdf(self, file) -> List[Dict]:
+        """Process a PDF file and extract text"""
+        try:
+            content_parts = []
+            with open(file.name, 'rb') as f:
+                reader = PyPDF2.PdfReader(f)
+                for page in reader.pages:
+                    content_parts.append(page.extract_text() or "")
+            complete_content = ''.join(content_parts)
+            return [{
+                'source': 'pdf',
+                'filename': os.path.basename(file.name),
+                'content': complete_content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"PDF processing error: {e}")
+            return []
 
+    def _process_docx(self, file) -> List[Dict]:
+        """Process a DOCX file and extract text"""
+        try:
+            content_parts = []
+            doc = docx.Document(file.name)
+            for para in doc.paragraphs:
+                content_parts.append(para.text)
+            complete_content = '\n'.join(content_parts)
+            return [{
+                'source': 'docx',
+                'filename': os.path.basename(file.name),
+                'content': complete_content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"DOCX processing error: {e}")
+            return []
 
+    def _process_csv(self, file) -> List[Dict]:
+        """Process a CSV file and extract text"""
+        try:
+            import pandas as pd
+            df = pd.read_csv(file.name)
+            content = df.to_string(index=False)
+            return [{
+                'source': 'csv',
+                'filename': os.path.basename(file.name),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
         except Exception as e:
+            logger.error(f"CSV processing error: {e}")
             return []
 
+    def _process_json(self, file) -> List[Dict]:
+        """Process a JSON file and extract text"""
+        try:
+            with open(file.name, 'r') as f:
+                content = json.load(f)
+            return [{
+                'source': 'json',
+                'filename': os.path.basename(file.name),
+                'content': json.dumps(content, indent=2),
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"JSON processing error: {e}")
+            return []
+
+    def _process_xml(self, file) -> List[Dict]:
+        """Process an XML file and extract text"""
+        try:
+            with open(file.name, 'r') as f:
+                content = f.read()
+            return [{
+                'source': 'xml',
+                'filename': os.path.basename(file.name),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"XML processing error: {e}")
+            return []
 
+    def _process_markdown(self, file) -> List[Dict]:
+        """Process a Markdown file and extract text"""
+        try:
+            with open(file.name, 'r') as f:
+                content = f.read()
+            return [{
+                'source': 'markdown',
+                'filename': os.path.basename(file.name),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
+        except Exception as e:
+            logger.error(f"Markdown processing error: {e}")
+            return []
+
+    # Add similar methods for other file types as needed...
+
     def _is_archive(self, filepath: str) -> bool:
         """Check if file is an archive"""
         return any(filepath.lower().endswith(ext) for ext in [
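The dispatch in process_file() above keys on the filename extension, and every _process_* helper returns records with the same four fields ('source', 'filename', 'content', 'timestamp'). For reference, here is a minimal standalone sketch of that pattern using only the standard library; the names process_json_file, process_text_file and process_path are illustrative and are not part of app.py.

import json
import os
import tempfile
from datetime import datetime
from typing import Dict, List

# Illustrative stand-ins for the _process_* helpers above; each returns records
# shaped like the ones app.py builds: 'source', 'filename', 'content', 'timestamp'.
def process_json_file(path: str) -> List[Dict]:
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [{
        'source': 'json',
        'filename': os.path.basename(path),
        'content': json.dumps(data, indent=2),
        'timestamp': datetime.now().isoformat(),
    }]

def process_text_file(path: str, source: str) -> List[Dict]:
    with open(path, 'r', encoding='utf-8') as f:
        return [{
            'source': source,
            'filename': os.path.basename(path),
            'content': f.read(),
            'timestamp': datetime.now().isoformat(),
        }]

# Extension dispatch, mirroring the endswith() chain in process_file().
HANDLERS = {
    '.json': process_json_file,
    '.md': lambda p: process_text_file(p, 'markdown'),
    '.xml': lambda p: process_text_file(p, 'xml'),
}

def process_path(path: str) -> List[Dict]:
    for ext, handler in HANDLERS.items():
        if path.lower().endswith(ext):
            return handler(path)
    return []  # unknown extension: skip, as process_file() does

if __name__ == '__main__':
    with tempfile.TemporaryDirectory() as tmp:
        sample = os.path.join(tmp, 'sample.json')
        with open(sample, 'w', encoding='utf-8') as f:
            json.dump({'hello': 'world'}, f)
        for record in process_path(sample):
            print(record['source'], record['filename'], len(record['content']))

The PDF, DOCX and CSV branches (which need PyPDF2, python-docx and pandas) and the archive handling hinted at by _is_archive() are deliberately left out to keep the sketch dependency-free.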
@@ -569,7 +674,47 @@ def create_modern_interface():
             with gr.Tab("📁 File Input"):
                 file_input = gr.File(
                     label="Upload Files",
+                    file_types=[
+                        "text/*",  # All text files
+                        "application/pdf",  # PDF files
+                        "application/zip",  # ZIP files
+                        "application/x-zip-compressed",  # Compressed ZIP files
+                        "application/x-zip",  # Another ZIP type
+                        "application/x-rar-compressed",  # RAR files
+                        "application/x-tar",  # TAR files
+                        "application/gzip",  # GZ files
+                        "application/x-bzip2",  # BZ2 files
+                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # DOCX files
+                        "application/msword",  # DOC files
+                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # XLSX files
+                        "application/vnd.ms-excel",  # XLS files
+                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # PPTX files
+                        "application/vnd.ms-powerpoint",  # PPT files
+                        "application/json",  # JSON files
+                        "application/xml",  # XML files
+                        "text/csv",  # CSV files
+                        "text/markdown",  # Markdown files
+                        "application/octet-stream",  # Binary files
+                        "application/x-7z-compressed",  # 7z files
+                        "application/x-iso9660-image",  # ISO files
+                        "application/x-dosexec",  # EXE files
+                        "application/x-sh",  # Shell script files
+                        "application/x-php",  # PHP files
+                        "application/x-python",  # Python files
+                        "application/x-java-archive",  # JAR files
+                        "application/x-asp",  # ASP files
+                        "application/x-c",  # C source files
+                        "application/x-c++",  # C++ source files
+                        "application/x-ruby",  # Ruby files
+                        "application/x-perl",  # Perl files
+                        "application/x-go",  # Go files
+                        "application/x-swift",  # Swift files
+                        "application/x-xml",  # XML files
+                        "application/x-yaml",  # YAML files
+                        "application/x-ini",  # INI files
+                        "application/x-log",  # Log files
+                        "application/x-configuration",  # Configuration files
+                    ],
                     file_count="multiple"
                 )
 
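One thing worth verifying against the Gradio version pinned for this Space: gr.File's file_types parameter is commonly documented as taking file extensions (for example ".pdf") or broad categories (such as "text") rather than MIME strings, so the list above may not filter uploads the way its comments suggest. Below is a minimal sketch of an extension-based configuration under that assumption; it mirrors the component above and is not part of the committed app.py.

import gradio as gr

# Sketch only: assumes a Gradio release whose gr.File(file_types=...) filters by
# extension; check the behaviour against the version actually installed.
with gr.Blocks() as demo:
    file_input = gr.File(
        label="Upload Files",
        file_types=[
            ".txt", ".md", ".csv", ".json", ".xml", ".yaml", ".ini", ".log",
            ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
            ".zip", ".tar", ".gz", ".bz2", ".7z", ".rar",
        ],
        file_count="multiple",
    )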
@@ -610,133 +755,133 @@ def create_modern_interface():
         )
 
         # Load example data
+        def load_example():
+            example = {
+                "type": "product_catalog",
+                "items": [
+                    {
+                        "id": "123",
+                        "name": "Premium Widget",
+                        "description": "High-quality widget with advanced features",
+                        "price": 299.99,
+                        "category": "electronics",
+                        "tags": ["premium", "featured", "new"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Basic Widget",
+                        "description": "Reliable widget for everyday use",
+                        "price": 149.99,
+                        "category": "electronics",
+                        "tags": ["basic", "popular"]
                     }
+                ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "2.0",
+                    "source": "example"
                 }
+            }
+            return json.dumps(example, indent=2)
 
+        def clear_input():
+            return ""
 
+        def process_inputs(urls, files, text, combine):
+            """Process all inputs and generate QR codes"""
+            try:
+                results = []
+                url_processor = EnhancedURLProcessor()
+                file_processor = EnhancedFileProcessor()
+
+                # Process JSON input
+                if text and text.strip():
+                    try:
+                        json_data = json.loads(text)
+                        if isinstance(json_data, list):
+                            results.extend(json_data)
                         else:
+                            results.append(json_data)
+                    except json.JSONDecodeError as e:
+                        return None, [], f"❌ Invalid JSON format: {str(e)}"
+
+                # Process URLs
+                if urls and urls.strip():
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+
+                    for url in url_list:
+                        validation = url_processor.validate_url(url)
+                        if validation['is_valid']:
+                            content = url_processor.fetch_content(url)
+                            if content:
+                                results.append({
+                                    'source': 'url',
+                                    'url': url,
+                                    'content': content,
+                                    'timestamp': datetime.now().isoformat()
+                                })
+
+                # Process files
+                if files:
+                    for file in files:
+                        file_results = file_processor.process_file(file)
+                        if file_results:
+                            results.extend(file_results)
+
+                # Generate QR codes
+                if results:
+                    qr_paths = generate_qr_codes(results, combine)
+                    if qr_paths:
+                        return (
+                            results,
+                            [str(path) for path in qr_paths],
+                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
+                        )
                     else:
+                        return None, [], "❌ Failed to generate QR codes"
+                else:
+                    return None, [], "⚠️ No valid content to process"
 
+            except Exception as e:
+                logger.error(f"Processing error: {e}")
+                return None, [], f"❌ Error: {str(e)}"
+
+        # Set up event handlers
+        example_btn.click(load_example, outputs=[text_input])
+        clear_btn.click(clear_input, outputs=[text_input])
+        process_btn.click(
+            process_inputs,
+            inputs=[url_input, file_input, text_input, combine_data],
+            outputs=[output_json, output_gallery, output_text]
+        )
+
+        # Add helpful documentation
+        gr.Markdown("""
+        ### 🚀 Features
 
+        - **Complete URL Scraping**: Extracts every character from web pages
+        - **Advanced File Processing**: Full content extraction from text files and archives
+        - **Smart JSON Handling**: Processes any size JSON with automatic chunking
+        - **Sequential QR Codes**: Maintains data integrity across multiple codes
+        - **Modern Design**: Clean, responsive interface with visual feedback
 
+        ### 💡 Tips
 
+        1. **URLs**: Enter multiple URLs separated by commas or newlines
+        2. **Files**: Upload text files or ZIP archives containing text files
+        3. **JSON**: Use the example button to see the expected format
+        4. **QR Codes**: Choose whether to combine data into sequential codes
+        5. **Processing**: Monitor the status for real-time feedback
 
+        ### 🎨 Output
 
+        - Generated QR codes are saved in the `output/qr_codes` directory
+        - Each QR code contains metadata for proper sequencing
+        - Hover over QR codes in the gallery to see details
+        """)
 
+        return interface
 
 def main():
     """Initialize and launch the application"""
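The hunk ends at the signature of main(), whose body is unchanged and therefore not shown in this diff. For orientation, here is a minimal sketch of how the interface built by create_modern_interface() is typically launched; the launch arguments are assumptions, not the Space's actual settings.

def main():
    """Initialize and launch the application"""
    interface = create_modern_interface()
    # Assumed launch settings; a Hugging Face Space usually needs nothing more
    # than launch(), so treat these keyword arguments as illustrative.
    interface.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        show_error=True,        # surface tracebacks in the UI while debugging
    )

if __name__ == "__main__":
    main()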