Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Apr 5

Commit

d7b800d

verified ·

1 Parent(s): 89dda1c

Update app.py

Browse files

Files changed (1) hide show

app.py +269 -534

app.py CHANGED Viewed

@@ -1,560 +1,295 @@
-import json
 import os
-import re
-import time
 import logging
-import mimetypes
 import zipfile
 import tempfile
-from datetime import datetime
-from typing import List, Dict, Optional, Union
-from pathlib import Path
-from urllib.parse import urlparse
-import requests
-import validators
-import gradio as gr
-from diskcache import Cache
-from bs4 import BeautifulSoup
-from fake_useragent import UserAgent
-from cleantext import clean
-import qrcode
-# Setup logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler('app.log', encoding='utf-8')
-    ]
-)
-logger = logging.getLogger(__name__)
-# Ensure output directories exist
-Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
-class URLProcessor:
     def __init__(self):
-        self.session = requests.Session()
-        self.timeout = 10  # seconds
-        self.session.headers.update({
-            'User -Agent': UserAgent().random,
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1'
-        })
-    def advanced_text_cleaning(self, text: str) -> str:
-        """Robust text cleaning with version compatibility"""
-        try:
-            cleaned_text = clean(
-                text,
-                fix_unicode=True,
-                to_ascii=True,
-                lower=True,
-                no_line_breaks=True,
-                no_urls=True,
-                no_emails=True,
-                no_phone_numbers=True,
-                no_numbers=False,
-                no_digits=False,
-                no_currency_symbols=True,
-                no_punct=False
-            ).strip()
-            return cleaned_text
-        except Exception as e:
-            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
-            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
-            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
-            return text.strip()
-    def validate_url(self, url: str) -> Dict:
-        """Validate URL format and accessibility"""
-        try:
-            if not validators.url(url):
-                return {'is_valid': False, 'message': 'Invalid URL format'}
-            response = self.session.head(url, timeout=self.timeout)
-            response.raise_for_status()
-            return {'is_valid': True, 'message': 'URL is valid and accessible'}
-        except Exception as e:
-            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
-    def fetch_content(self, url: str) -> Optional[Dict]:
-        """Universal content fetcher with special case handling"""
-        try:
-            # Google Drive document handling
-            if 'drive.google.com' in url:
-                return self._handle_google_drive(url)
-            # Google Calendar ICS handling
-            if 'calendar.google.com' in url and 'ical' in url:
-                return self._handle_google_calendar(url)
-            # Standard HTML processing
-            return self._fetch_html_content(url)
-        except Exception as e:
-            logger.error(f"Content fetch failed: {e}")
-            return None
-    def _handle_google_drive(self, url: str) -> Optional[Dict]:
-        """Process Google Drive file links"""
         try:
-            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if not file_id:
-                logger.error(f"Invalid Google Drive URL: {url}")
-                return None
-            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-            response = self.session.get(direct_url, timeout=self.timeout)
-            response.raise_for_status()
-            return {
-                'content': response.text,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
         except Exception as e:
-            logger.error(f"Google Drive processing failed: {e}")
-            return None
-    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
-        """Process Google Calendar ICS feeds"""
         try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-            return {
-                'content': response.text,
-                'content_type': 'text/calendar',
-                'timestamp': datetime.now().isoformat()
-            }
         except Exception as e:
-            logger.error(f"Calendar fetch failed: {e}")
-            return None
-    def _fetch_html_content(self, url: str) -> Optional[Dict]:
-        """Standard HTML content processing"""
         try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-            soup = BeautifulSoup(response.text, 'html.parser')
-            # Remove unwanted elements
-            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
-                element.decompose()
-            # Extract main content
-            main_content = soup.find('main') or soup.find('article') or soup.body
-            if main_content is None:
-                logger.warning(f"No main content found for URL: {url}")
-                return {
-                    'content': '',
-                    'content_type': response.headers.get('Content-Type', ''),
-                    'timestamp': datetime.now().isoformat()
-                }
-            # Clean and structure content
-            text_content = main_content.get_text(separator='\n', strip=True)
-            cleaned_content = self.advanced_text_cleaning(text_content)
-            return {
-                'content': cleaned_content,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
         except Exception as e:
-            logger.error(f"HTML processing failed: {e}")
-            return None
-class FileProcessor:
-    """Class to handle file processing"""
-    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
-        self.max_file_size = max_file_size
-        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
-    def is_text_file(self, filepath: str) -> bool:
-        """Check if file is a text file"""
-        try:
-            mime_type, _ = mimetypes.guess_type(filepath)
-            return (mime_type and mime_type.startswith('text/')) or \
-                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
-        except Exception:
-            return False
-    def process_file(self, file) -> List[Dict]:
-        """Process uploaded file with enhanced error handling"""
-        if not file:
-            return []
-        dataset = []
-        try:
-            file_size = os.path.getsize(file.name)
-            if file_size > self.max_file_size:
-                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                return []
-            with tempfile.TemporaryDirectory() as temp_dir:
-                if zipfile.is_zipfile(file.name):
-                    dataset.extend(self._process_zip_file(file.name, temp_dir))
-                else:
-                    dataset.extend(self._process_single_file(file))
-        except Exception as e:
-            logger.error(f"Error processing file: {str(e)}")
-            return []
-        return dataset
-    def chunk_data(self, data, max_size=2953):  # 2953 is the max size for version 1 QR code
-        """Chunk data into smaller pieces if it exceeds max_size."""
-        json_str = json.dumps(data, ensure_ascii=False)
-        if len(json_str) <= max_size:
-            return [json_str]
-        # Split into chunks
-        chunks = []
-        while json_str:
-            chunk = json_str[:max_size]
-            chunks.append(chunk)
-            json_str = json_str[max_size:]
-        return chunks
-def _process_single_file(self, file) -> List[Dict]:
-    """Process a single file"""
-    try:
-        file_stat = os.stat(file.name)
-        # For very large files, read in chunks and summarize
-        if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
-            logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
-            # Read first and last 1MB for extremely large files
-            content = ""
-            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
-                content = f.read(1 * 1024 * 1024)  # First 1MB
-                content += "\n...[Content truncated due to large file size]...\n"
-                # Seek to the last 1MB
-                f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
-                content += f.read()  # Last 1MB
-        else:
-            # Regular file processing
-            with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
-                content = f.read()
-        return [{
-            'source': 'filename',  # Assuming 'source' should be a string value
-            'filename': os.path.basename(file.name),
-            'file_size': file_stat.st_size,
-            'mime_type': mimetypes.guess_type(file.name)[0],
-            'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
-            'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
-            'content': content,
-            'timestamp': datetime.now().isoformat()
-        }]
-    except Exception as e:
-        logger.error(f"File processing error: {e}")
-        return []
-def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
-    """Clean and validate JSON data"""
-    try:
-        # If it's a string, try to parse it
-        if isinstance(data, str):
-            # Remove any existing content and extra whitespace
-            data = data.strip()
-            data = json.loads(data)
-        # Convert to string and back to ensure proper JSON format
-        cleaned = json.loads(json.dumps(data))
-        return cleaned
-    except json.JSONDecodeError as e:
-        logger.error(f"JSON cleaning error: {e}")
-        return None
-    except Exception as e:
-        logger.error(f"Unexpected error while cleaning JSON: {e}")
-        return None
-def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]:
-    """Generate QR code(s) from data"""
-    try:
-        output_dir = Path('output/qr_codes')
-        output_dir.mkdir(parents=True, exist_ok=True)
-        if combined:
-            # Generate single QR code for all data
-            cleaned_data = clean_json(data)
-            if cleaned_data is None:  # Check if cleaning failed
-                logger.error("Failed to clean data for QR code generation.")
-                return []
-            qr = qrcode.QRCode(
-                version=None,
-                error_correction=qrcode.constants.ERROR_CORRECT_L,
-                box_size=10,
-                border=4,
-            )
-            json_str = json.dumps(cleaned_data, ensure_ascii=False)
-            qr.add_data(json_str)
-            qr.make(fit=True)
-            img = qr.make_image(fill_color="black", back_color="white")
-            output_path = output_dir / f'combined_qr_{int(time.time())}.png'
-            img.save(str(output_path))
-            return [str(output_path)]
-        else:
-            # Generate separate QR codes for each item
-            if isinstance(data, list):
-                paths = []
-                for idx, item in enumerate(data):
-                    cleaned_item = clean_json(item)
-                    if cleaned_item is None:  # Check if cleaning failed
-                        logger.error(f"Failed to clean item {idx} for QR code generation.")
-                        continue  # Skip this item
-                    qr = qrcode.QRCode(
-                        version=None,
-                        error_correction=qrcode.constants.ERROR_CORRECT_L,
-                        box_size=10,
-                        border=4,
-                    )
-                    json_str = json.dumps(cleaned_item, ensure_ascii=False)
-                    qr.add_data(json_str)
-                    qr.make(fit=True)
-                    img = qr.make_image(fill_color="black", back_color="white")
-                    output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
-                    img.save(str(output_path))
-                    paths.append(str(output_path))
-                return paths
             else:
-                # Single item, not combined
-                cleaned_item = clean_json(data)
-                if cleaned_item is None:  # Check if cleaning failed
-                    logger.error("Failed to clean single item for QR code generation.")
-                    return []
-                qr = qrcode.QRCode(
-                    version=None,
-                    error_correction=qrcode.constants.ERROR_CORRECT_L,
-                    box_size=10,
-                    border=4,
-                )
-                json_str = json.dumps(cleaned_item, ensure_ascii=False)
-                qr.add_data(json_str)
-                qr.make(fit=True)
-                img = qr.make_image(fill_color="black", back_color="white")
-                output_path = output_dir / f'single_qr_{int(time.time())}.png'
-                img.save(str(output_path))
-                return [str(output_path)]
-        return []
-    except Exception as e:
-        logger.error(f"QR generation error: {e}")
-        return []
-def create_interface():
-    """Create a comprehensive Gradio interface with advanced features"""
-    css = """
-    .container { max-width: 1200px; margin: auto; }
-    .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 4px; }
-    .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 4px; }
-    .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
-    """
-    with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
-        gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
-        with gr.Tab("URL Processing"):
-            url_input = gr.Textbox(
-                label="Enter URLs (comma or newline separated)",
-                lines=5,
-                placeholder="https://example1.com\nhttps://example2.com",
-                value=""
-            )
-        with gr.Tab("File Input"):
-            file_input = gr.File(
-                label="Upload text file or ZIP archive",
-                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
-            )
-        with gr.Tab("Notepad"):
-            text_input = gr.TextArea(
-                label="JSON Data Input",
-                lines=15,
-                placeholder="Paste your JSON data here...",
-                value=""
-            )
-            with gr.Row():
-                example_btn = gr.Button("📝 Load Example JSON", variant="secondary")
-                clear_btn = gr.Button("🗑️ Clear Input", variant="secondary")
-        with gr.Row():
-            combine_data = gr.Checkbox(
-                label="Combine all data into single QR code",
-                value=True,
-                info="Generate one QR code for all data, or separate QR codes for each item"
-            )
-            process_btn = gr.Button("🔄 Process & Generate QR", variant="primary", scale=2)
-        output_json = gr.JSON(label="Processed JSON Data")
-        output_gallery = gr.Gallery(label="Generated QR Codes", columns=2, height=400)
-        output_text = gr.Textbox(label="Processing Status", interactive=False)
-        def load_example():
-            example_json = {
-                "type": "product_catalog",
-                "items": [
-                    {
-                        "id": "123",
-                        "name": "Test Product",
-                        "description": "This is a test product description",
-                        "price": 29.99,
-                        "category": "electronics",
-                        "tags": ["test", "sample", "demo"]
-                    },
-                    {
-                        "id": "456",
-                        "name": "Another Product",
-                        "description": "Another test product description",
-                        "price": 49.99,
-                        "category": "accessories",
-                        "tags": ["sample", "test"]
-                    }
-                ],
-                "metadata": {
-                    "timestamp": datetime.now().isoformat(),
-                    "version": "1.0",
-                    "source": "example"
-                }
-            }
-            return json.dumps(example_json, indent=2)
-        def clear_input():
-            return ""
-        def process_all_inputs(urls, file, text, combine):
-            """Process all input types and generate QR codes"""
-            try:
-                results = []
-                # Process text input first (since it's direct JSON)
-                if text and text.strip():
-                    try:
-                        # Try to parse as JSON
-                        json_data = json.loads(text)
-                        if isinstance(json_data, list):
-                            results.extend(json_data)
-                        else:
-                            results.append(json_data)
-                    except json.JSONDecodeError as e:
-                        return None, [], f"❌ Invalid JSON format: {str(e)}"
-                # Process URLs if provided
-                if urls and urls.strip():
-                    processor = URLProcessor()
-                    url_list = re.split(r'[,\n]', urls)
-                    url_list = [url.strip() for url in url_list if url.strip()]
-                    for url in url_list:
-                        validation = processor.validate_url(url)
-                        if validation.get('is_valid'):
-                            content = processor.fetch_content(url)
-                            if content:
-                                results.append({
-                                    'source': 'url',
-                                    'url': url,
-                                    'content': content,
-                                    'timestamp': datetime.now().isoformat()
-                                })
-                # Process files if provided
-                if file:
-                    file_processor = FileProcessor()
-                    file_results = file_processor.process_file(file)
-                    if file_results:
-                        results.extend(file_results)
-                # Generate QR codes
-                if results:
-                    if combine:
-                        # Chunk the data if necessary
-                        combined_data = []
-                        for item in results:
-                            combined_data.extend(file_processor.chunk_data(item))
-                        qr_paths = generate_qr_code(combined_data, combined=False)
-                    else:
-                        qr_paths = generate_qr_code(results, combined=combine)
-                    if qr_paths:
-                        return (
-                            results,
-                            [str(path) for path in qr_paths],
-                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR code(s)!"
-                        )
-                    else:
-                        return None, [], "❌ Failed to generate QR codes. Please check the input data."
-                else:
-                    return None, [], "⚠️ No valid content to process. Please provide some input data."
-            except Exception as e:
-                logger.error(f"Processing error: {e}")
-                return None, [], f"❌ Error: {str(e)}"
-        # Set up event handlers
-        example_btn.click(load_example, outputs=[text_input])
-        clear_btn.click(clear_input, outputs=[text_input])
-        process_btn.click(
-            process_all_inputs,
-            inputs=[url_input, file_input, text_input, combine_data],
-            outputs=[output_json, output_gallery, output_text]
-        )
-        gr.Markdown("""
-        ### Features
-        - **URL Processing**: Extract content from websites
-        - **File Processing**: Handle text files and archives
-        - **Notepad**: Direct JSON data input/manipulation
-        - **JSON Cleaning**: Automatic JSON validation and formatting
-        - **QR Generation**: Generate QR codes with embedded JSON data
-        - **Flexible Output**: Choose between combined or separate QR codes
-        ### Usage Tips
-        1. Use the **Notepad** tab for direct JSON input
-        2. Click "Load Example JSON" to see a sample format
-        3. Choose whether to combine all data into a single QR code
-        4. The generated QR codes will contain the complete JSON data
-        """)
-    return interface
-def main():
-    # Configure system settings
-    mimetypes.init()
-    # Create output directories
-    Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
-    # Create and launch interface
-    interface = create_interface()
-    # Launch with proper configuration for Hugging Face
-    interface.launch(
-        share=False,
-        debug=False  # Set to False for production
-    )
 if __name__ == "__main__":
-    main()

+import gradio as gr
 import os
 import logging
 import zipfile
+import io
+from pypdf import PdfReader
 import tempfile
+import traceback
+logging.basicConfig(level=logging.INFO)
+class FileProcessor:
+    """Extract plain text from uploaded .pdf, .txt and .zip files.
+
+    Every processing method returns a string: either the extracted text or a
+    human-readable "Error: ..." / "Warning: ..." message. Technical details
+    (including tracebacks) go to the log instead of the return value.
+    """
     def __init__(self):
+        pass
+    def process_file(self, file_obj):
+        """Route an uploaded file to the handler matching its extension.
+
+        Args:
+            file_obj: Upload handle whose ``name`` attribute is the path of
+                the file on disk, or ``None`` when nothing was uploaded.
+
+        Returns:
+            The extracted text, or an error/warning message string.
+
+        Note:
+            The file at ``file_obj.name`` is removed when processing ends,
+            whether it succeeded or failed.
+        """
+        if file_obj is None:
+            return "Error: No file uploaded."
+        file_path = file_obj.name
+        logging.info(f"Processing file: {file_path}")
+        file_extension = os.path.splitext(file_path)[1].lower()
         try:
+            if file_extension == '.pdf':
+                return self._process_pdf_file(file_path)
+            elif file_extension == '.zip':
+                return self._process_zip_file(file_path)
+            elif file_extension == '.txt':
+                return self._process_txt_file(file_path)
+            else:
+                error_message = f"Error: Unsupported file type: {file_extension}. Please upload .pdf, .txt, or .zip files."
+                logging.warning(error_message)
+                return error_message
         except Exception as e:
+            # Last-resort boundary: keep the user-facing message generic and
+            # put the technical details in the log.
+            error_message = f"Fatal error processing file: {os.path.basename(file_path)}. Please try again or contact support.  Technical details logged."
+            logging.error(f"Unhandled exception processing file: {file_path} - {e}")
+            logging.error(traceback.format_exc())
+            return error_message
+        finally:
+            # Runs after the return value has been computed, so the text is
+            # already in memory when the file is deleted.
+            try:
+                if os.path.exists(file_path):
+                    os.remove(file_path)
+                    logging.info(f"Temporary file removed: {file_path}")
+            except OSError as e:
+                logging.error(f"Error removing temporary file {file_path}: {e}")
+    @staticmethod
+    def _extract_pdf_text(reader):
+        """Return the concatenated text of every page of ``reader``."""
+        # ``or ""`` keeps the join safe should a page yield None instead of a string.
+        return "".join(page.extract_text() or "" for page in reader.pages)
+    def _process_pdf_file(self, file_path):
+        """Extract text from the PDF stored at ``file_path``.
+
+        Returns:
+            The extracted text, a warning when no text was found, or an
+            error message for encrypted, missing or unreadable PDFs.
+        """
+        # PdfReader has no ``errors`` attribute; the exception class lives in
+        # the pypdf.errors module. Imported here so the except clause resolves.
+        from pypdf.errors import PdfStreamError
+        try:
+            with open(file_path, 'rb') as f:  # PdfReader needs a binary stream
+                reader = PdfReader(f)
+                if reader.is_encrypted:
+                    error_message = f"Error: Encrypted PDF file: {os.path.basename(file_path)}. Processing of encrypted PDFs is not supported."
+                    logging.warning(error_message)
+                    return error_message
+                # Pages are read from the open stream, so extract before closing it.
+                text = self._extract_pdf_text(reader)
+            logging.info(f"Successfully processed PDF file: {file_path}")
+            if not text.strip():
+                return "Warning: PDF processed, but no text content found. The PDF might contain images or scanned content."
+            return text
+        except FileNotFoundError:
+            error_message = f"Error: PDF file not found: {os.path.basename(file_path)}. Please ensure the file was uploaded correctly."
+            logging.error(f"File not found: {file_path}")
+            return error_message
+        except PdfStreamError as e:
+            error_message = f"Error: Corrupted PDF file: {os.path.basename(file_path)}. The PDF file appears to be damaged or invalid. Error details: {e}"
+            logging.error(f"Corrupted PDF stream error: {file_path} - {e}")
+            return error_message
+        except Exception as e:
+            error_message = f"Error processing PDF file: {os.path.basename(file_path)}.  It might be corrupted or use unsupported features. Error details logged."
+            logging.error(f"General PDF processing error: {file_path} - {e}")
+            logging.error(traceback.format_exc())
+            return error_message
+    def _process_zip_file(self, file_path):
+        """Extract text from every .pdf and .txt member of a ZIP archive.
+
+        Members with any other extension are skipped with a warning in the
+        log. A failure on one member is noted in the output and does not stop
+        the remaining members from being processed.
+
+        Returns:
+            One "File: <name>" section per processed member joined together,
+            a warning for an empty archive, or an error message when the
+            archive itself cannot be read.
+        """
+        sections = []
+        error_occurred = False
+        try:
+            with zipfile.ZipFile(file_path, 'r') as zf:
+                member_names = zf.namelist()
+                if not member_names:
+                    return "Warning: ZIP file is empty and contains no files to process."
+                for filename in member_names:
+                    lowered = filename.lower()
+                    try:
+                        if lowered.endswith('.pdf'):
+                            with zf.open(filename) as pdf_file:
+                                pdf_content = pdf_file.read()
+                            # Wrap the bytes in BytesIO to hand PdfReader an in-memory stream.
+                            text = self._process_pdf_content(io.BytesIO(pdf_content), filename=filename)
+                            sections.append(f"File: {filename}\nContent:\n{text}\n\n")
+                            logging.info(f"Successfully processed PDF within ZIP: {filename}")
+                        elif lowered.endswith('.txt'):
+                            with zf.open(filename) as txt_file:
+                                # Undecodable bytes are dropped rather than aborting the member.
+                                text = txt_file.read().decode('utf-8', errors='ignore')
+                            sections.append(f"File: {filename}\nContent:\n{text}\n\n")
+                            logging.info(f"Successfully processed TXT within ZIP: {filename}")
+                        else:
+                            logging.warning(f"Skipping unsupported file type within ZIP: {filename}")
+                    except Exception as e:
+                        # Per-member boundary: record the failure and carry on.
+                        error_message = f"Error processing file '{filename}' within ZIP: {os.path.basename(file_path)}. Error: {e}"
+                        logging.error(error_message)
+                        logging.error(traceback.format_exc())
+                        sections.append(f"File: {filename}\nError processing file. See logs for details.\n\n")
+                        error_occurred = True
+            if not error_occurred:
+                logging.info(f"Successfully processed ZIP file: {file_path}")
+            else:
+                logging.warning(f"ZIP file processed with some errors: {file_path}. Check output for details.")
+            return "".join(sections)
+        except zipfile.BadZipFile:
+            error_message = f"Error: Invalid or corrupted ZIP file: {os.path.basename(file_path)}. Please ensure it is a valid ZIP archive."
+            logging.error(f"Bad ZIP file error: {file_path}")
+            return error_message
+        except Exception as e:
+            error_message = f"Error processing ZIP file: {os.path.basename(file_path)}.  It might be corrupted or have an unexpected structure. Error details logged."
+            logging.error(f"General ZIP processing error: {file_path} - {e}")
+            logging.error(traceback.format_exc())
+            return error_message
+    def _process_pdf_content(self, pdf_content_stream, filename=""):
+        """Extract text from a PDF supplied as a binary stream.
+
+        Args:
+            pdf_content_stream: Binary stream handed to ``PdfReader``.
+            filename: Name used only to give log and error messages context.
+
+        Returns:
+            The extracted text, a warning when no text was found, or an
+            error message for encrypted or unreadable content.
+        """
+        # PdfReader has no ``errors`` attribute; the exception class lives in
+        # the pypdf.errors module. Imported here so the except clause resolves.
+        from pypdf.errors import PdfStreamError
         try:
+            reader = PdfReader(pdf_content_stream)
+            if reader.is_encrypted:
+                error_message = f"Error: Encrypted PDF content found in '{filename}'. Processing encrypted PDFs is not supported."
+                logging.warning(error_message)
+                return error_message
+            text = self._extract_pdf_text(reader)
+            if not text.strip():
+                logging.warning(f"PDF content processed from '{filename}', but no text found.")
+                return "Warning: PDF content processed, but no text content found."
+            return text
+        except PdfStreamError as e:
+            error_message = f"Error: Corrupted PDF content in '{filename}'. PDF stream error: {e}"
+            logging.error(error_message)
+            return error_message
         except Exception as e:
+            error_message = f"Error processing PDF content from '{filename}'. Error details logged."
+            logging.error(f"Error processing PDF content from stream (file: {filename}) - {e}")
+            logging.error(traceback.format_exc())
+            return error_message
+    def _process_txt_file(self, file_path):
+        """Read the text file at ``file_path`` as UTF-8.
+
+        Returns:
+            The file's text, a warning when it is empty or whitespace-only,
+            or an error message when it is missing or cannot be read.
+        """
         try:
+            # errors='ignore' drops undecodable bytes instead of raising.
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
+                text = file.read()
+            logging.info(f"Successfully processed TXT file: {file_path}")
+            if not text.strip():
+                return "Warning: TXT file processed, but it is empty."
+            return text
+        except FileNotFoundError:
+            error_message = f"Error: TXT file not found: {os.path.basename(file_path)}. Please ensure the file was uploaded correctly."
+            logging.error(f"File not found: {file_path}")
+            return error_message
         except Exception as e:
+            error_message = f"Error processing TXT file: {os.path.basename(file_path)}. Error details logged."
+            logging.error(f"Error processing TXT file: {file_path} - {e}")
+            logging.error(traceback.format_exc())
+            return error_message
+# Module-level FileProcessor instance used by process_file_and_respond
+file_processor = FileProcessor()
+def process_file_and_respond(file_obj):
+    """Run the module-level ``file_processor`` on ``file_obj`` and return its result."""
+    outcome = file_processor.process_file(file_obj)
+    return outcome
+def test_functionality_enhanced():
+    temp_dir = tempfile.TemporaryDirectory()
+    test_dir = temp_dir.name
+    # --- Create test files in temporary directory ---
+    def create_test_file(filepath, content, mode='w'): # Helper function for file creation
+        with open(filepath, mode, encoding='utf-8') as f: # Default text mode
+            f.write(content)
+    def create_binary_test_file(filepath, content_binary, mode='wb'): # Helper for binary file creation
+        with open(filepath, mode) as f:
+            f.write(content_binary)
+    pdf_content = "This is a test PDF file.\nWith multiple lines."
+    txt_content = "This is a test TXT file.\nAnother line of text."
+    zip_content_pdf = "PDF content inside ZIP."
+    zip_content_txt = "TXT content inside ZIP."
+    empty_txt_content = ""
+    encrypted_pdf_content = "%PDF-1.5\n%����\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [ 3 0 R ] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /MediaBox [ 0 0 612 792 ] /Contents 4 0 R /Parent 2 0 R >>\nendobj\n4 0 obj\n<< /Length 5 >>\nstream\nBT\n/F1 12 Tf\n72 712 Td\n(This is an encrypted PDF - fake content) Tj\nET\nendstream\nendobj\n5 0 obj\n<< /Length 44 >>\nstream\n/Filter /FlateDecode\n/Length 44\nstream\nxœ+��\x0e@E\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nendstream\nendstream\nendobj\nxref\n0 6\n0000000000 65535 f\n0000000015 00000 n\n0000000062 00000 n\n0000000112 00000 n\n0000000179 00000 n\n0000000259 00000 n\ntrailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n369\n%%EOF\n" # Minimal PDF structure - not actually encrypted, but enough to test encryption check
+    pdf_file_path = os.path.join(test_dir, "test.pdf")
+    txt_file_path = os.path.join(test_dir, "test.txt")
+    zip_file_path = os.path.join(test_dir, "test.zip")
+    unsupported_file_path = os.path.join(test_dir, "test.csv")
+    corrupted_pdf_path = os.path.join(test_dir, "corrupted.pdf")
+    empty_txt_path = os.path.join(test_dir, "empty.txt")
+    empty_zip_path = os.path.join(test_dir, "empty.zip")
+    encrypted_pdf_path = os.path.join(test_dir, "encrypted.pdf")
+    create_test_file(pdf_file_path, pdf_content)
+    create_test_file(txt_file_path, txt_content)
+    create_test_file(unsupported_file_path, "test csv content")
+    create_test_file(empty_txt_path, empty_txt_content)
+    create_binary_test_file(encrypted_pdf_path, encrypted_pdf_content.encode('latin-1')) # Encrypted PDF test - use latin-1 to avoid encoding issues with PDF structure
+    # Create a "corrupted" PDF by just writing plain text to a .pdf file.
+    create_test_file(corrupted_pdf_path, "This is NOT a valid PDF file.")
+    with zipfile.ZipFile(zip_file_path, 'w') as zf:
+        zf.writestr("zip_test.pdf", zip_content_pdf)
+        zf.writestr("zip_test.txt", zip_content_txt)
+    with zipfile.ZipFile(empty_zip_path, 'w') as zf: # Create empty zip
+        pass
+    # --- Test cases ---
+    test_cases = [
+        {"name": "PDF Processing", "file_path": pdf_file_path, "expected_content": pdf_content, "expect_error": False},
+        {"name": "TXT Processing", "file_path": txt_file_path, "expected_content": txt_content, "expect_error": False},
+        {"name": "ZIP Processing (PDF & TXT)", "file_path": zip_file_path, "expected_content_in": [zip_content_pdf, zip_content_txt], "expect_error": False},
+        {"name": "Unsupported File Type", "file_path": unsupported_file_path, "expected_content": "Unsupported file type", "expect_error": True},
+        {"name": "Corrupted PDF Processing", "file_path": corrupted_pdf_path, "expected_content": "Error processing PDF file", "expect_error": True},
+        {"name": "Empty TXT File", "file_path": empty_txt_path, "expected_content": "Warning: TXT file processed, but it is empty.", "expect_error": False},
+        {"name": "Empty ZIP File", "file_path": empty_zip_path, "expected_content": "Warning: ZIP file is empty", "expect_error": False},
+        {"name": "Encrypted PDF File", "file_path": encrypted_pdf_path, "expected_content": "Error: Encrypted PDF file", "expect_error": True},
+    ]
+    all_tests_passed = True
+    for case in test_cases:
+        print(f"\n--- Test Case: {case['name']} ---")
+        result = file_processor.process_file(SimpleFileObject(case['file_path'])) # Use SimpleFileObject to simulate file upload
+        print(f"Result: {result[:100]}...") # Print first 100 chars of result
+        if case.get("expect_error"):
+            if case["expected_content"] not in result:
+                print(f"  ❌ FAIL: Expected error message containing '{case['expected_content']}', but got: {result}")
+                all_tests_passed = False
             else:
+                print(f"  ✅ PASS: Expected error message found.")
+        elif case.get("expected_content_in"): # For cases expecting multiple contents (like ZIP)
+             all_contents_found = True
+             for expected_content in case["expected_content_in"]:
+                 if expected_content not in result:
+                     print(f"  ❌ FAIL: Expected content '{expected_content}' not found in result for {case['name']}. Got: {result[:100]}...")
+                     all_contents_found = False
+                     all_tests_passed = False
+                     break
+             if all_contents_found:
+                 print(f"  ✅ PASS: All expected contents found.")
+        elif case.get("expected_content"):
+            if case["expected_content"] not in result:
+                print(f"  ❌ FAIL: Expected content '{case['expected_content']}', but got: {result[:100]}...")
+                all_tests_passed = False
+            else:
+                print(f"  ✅ PASS: Expected content found.")
+    if all_tests_passed:
+        print("\n🎉 All enhanced tests completed successfully! 🎉")
+    else:
+        print("\n⚠️ Some enhanced tests FAILED. See details above. ⚠️")
+    temp_dir.cleanup() # Clean up temporary directory and files
+class SimpleFileObject: # Mock file object for testing
+    """Minimal stand-in for an uploaded file: carries only a ``name`` attribute holding the path."""
+    def __init__(self, file_path):
+        # The self-test passes instances of this class to file_processor.process_file.
+        self.name = file_path
+# Gradio UI: one file input restricted to .pdf/.txt/.zip, routed through
+# process_file_and_respond, with the result shown as plain text.
+iface = gr.Interface(
+    fn=process_file_and_respond,
+    inputs=gr.File(file_types=[".pdf", ".txt", ".zip"]),
+    outputs="text",
+    title="Robust File Processing Agent",
+    description="Upload a PDF, TXT, or ZIP file to process its content. Enhanced for error handling and robustness."
+)
 if __name__ == "__main__":
+    # The self-test runs first; an uncaught exception there stops the script
+    # before the interface is launched.
+    test_functionality_enhanced() # Run enhanced tests
+    iface.launch(debug=True)