Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Mar 19

Commit

6a91fa4

verified ·

1 Parent(s): 172a6d7

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -65

app.py CHANGED Viewed

@@ -15,9 +15,6 @@ import gradio as gr
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from cleantext import clean
-from starlette.applications import Starlette
-from starlette.responses import JSONResponse
-from starlette.routing import Route
 # Setup logging with detailed configuration
 logging.basicConfig(
@@ -154,12 +151,19 @@ class URLProcessor:
 class FileProcessor:
     """Class to handle file processing"""
     def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
         self.max_file_size = max_file_size
         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
-        self.processed_zip_count = 0
-        self.max_zip_files = 5
     def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
         """Process multiple uploaded files and return a single JSON extraction"""
@@ -167,76 +171,70 @@ class FileProcessor:
             return []
         combined_data = []
-        self.processed_zip_count = 0
         try:
             for file in files:
                 # Check if the file is a Gradio File object or a string path
-                file_path = file.name if isinstance(file, gr.File) else file
-                # Log the file path being processed
-                logger.info(f"Processing file: {file_path}")
-                # Skip if it's a directory
-                if os.path.isdir(file_path):
-                    logger.warning(f"Skipping directory: {file_path}")
-                    continue
-                # Skip if file doesn't exist
-                if not os.path.exists(file_path):
-                    logger.warning(f"File does not exist: {file_path}")
-                    continue
-                # Check file size
-                file_size = os.path.getsize(file_path)
-                if file_size > self.max_file_size:
-                    logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                    continue  # Skip this file
-                # Process based on file type
-                if zipfile.is_zipfile(file_path):
-                    if self.processed_zip_count >= self.max_zip_files:
-                        logger.warning(f"Maximum number of ZIP files ({self.max_zip_files}) reached, skipping {file_path}")
-                        continue
-                    self.processed_zip_count += 1
-                    zip_results = self._process_zip_file(file_path)
-                    combined_data.extend(zip_results)
                 else:
-                    file_results = self._process_single_file(file_path)
-                    combined_data.extend(file_results)
         except Exception as e:
             logger.error(f"Error processing files: {str(e)}")
-        return combined_data
     def _process_zip_file(self, zip_path: str) -> List[Dict]:
-        """Process ZIP file contents more efficiently"""
         results = []
         try:
-            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-                # Get list of files in the zip
-                file_list = [file for file in zip_ref.namelist()
-                            if not file.endswith('/') and not file.startswith('__MACOSX')]
-                # Process each file directly from the zip without extracting all files
-                for filename in file_list:
-                    if any(filename.lower().endswith(ext) for ext in self.supported_text_extensions):
-                        try:
-                            with zip_ref.open(filename) as file:
-                                content = file.read().decode('utf-8', errors='ignore')
-                            if content.strip():
-                                results.append({
-                                    "source": "zip_file",
-                                    "filename": filename,
-                                    "content": content,
-                                    "timestamp": datetime.now().isoformat()
-                                })
-                        except Exception as e:
-                            logger.error(f"Error reading file {filename} from zip: {str(e)}")
         except Exception as e:
-            logger.error(f"Error processing ZIP file {zip_path}: {str(e)}")
-        return results
 class Chatbot:
     """Simple chatbot that uses provided JSON data for responses."""
@@ -418,10 +416,10 @@ def create_interface():
 def main():
     # Configure system settings
     mimetypes.init()
     # Create and launch interface
     interface = create_interface()
     # Launch with proper configuration
     interface.launch(
         server_name="0.0.0.0",

 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from cleantext import clean
 # Setup logging with detailed configuration
 logging.basicConfig(
 class FileProcessor:
     """Class to handle file processing"""
     def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
         self.max_file_size = max_file_size
         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+    def is_text_file(self, filepath: str) -> bool:
+        """Check if file is a text file"""
+        try:
+            mime_type, _ = mimetypes.guess_type(filepath)
+            return (mime_type and mime_type.startswith('text/')) or \
+                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
+        except Exception:
+            return False
     def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
         """Process multiple uploaded files and return a single JSON extraction"""
             return []
         combined_data = []
         try:
             for file in files:
                 # Check if the file is a Gradio File object or a string path
+                file_name = file.name if isinstance(file, gr.File) else file
+                if os.path.isfile(file_name):
+                    file_size = os.path.getsize(file_name)
+                    if file_size > self.max_file_size:
+                        logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                        continue  # Skip this file
+                    if zipfile.is_zipfile(file_name):
+                        combined_data.extend(self._process_zip_file(file_name))
+                    else:
+                        combined_data.extend(self._process_single_file(file_name))
                 else:
+                    logger.warning(f"Skipping directory: {file_name}")
         except Exception as e:
             logger.error(f"Error processing files: {str(e)}")
+            return []
     def _process_zip_file(self, zip_path: str) -> List[Dict]:
+        """Process ZIP file contents"""
         results = []
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                zip_ref.extractall(temp_dir)
+                for root, _, files in os.walk(temp_dir):
+                    for filename in files:
+                        filepath = os.path.join(root, filename)
+                        if self.is_text_file(filepath):
+                            try:
+                                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                    content = f.read()
+                                if content.strip():
+                                    results.append({
+                                        "source": "file",
+                                        "filename": filename,
+                                        "content": content,
+                                        "timestamp": datetime.now().isoformat()
+                                    })
+                            except Exception as e:
+                                logger.error(f"Error reading file {filename}: {str(e)}")
+        return results
+    def _process_single_file(self, file_path: str) -> List[Dict]:
         try:
+            file_stat = os.stat(file_path)
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file_path),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file_path)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
         except Exception as e:
+            logger.error(f"File processing error: {e}")
+            return []
 class Chatbot:
     """Simple chatbot that uses provided JSON data for responses."""
 def main():
     # Configure system settings
     mimetypes.init()
     # Create and launch interface
     interface = create_interface()
     # Launch with proper configuration
     interface.launch(
         server_name="0.0.0.0",