Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Mar 19

Commit

dad6950

verified ·

1 Parent(s): 4dd743a

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -25

app.py CHANGED Viewed

@@ -134,17 +134,23 @@ class URLProcessor:
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
-            main_content = soup.find('main') or soup.find('article') or soup.body
-            text_content = main_content.get_text(separator='\n', strip=True)
-            cleaned_content = self.advanced_text_cleaning(text_content)
-            return {
-                'content': cleaned_content,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
@@ -203,7 +209,7 @@ class FileProcessor:
                     zip_results = self._process_zip_file(file_path)
                     combined_data.extend(zip_results)
                 elif self.is_text_file(file_path):
-                    file_results = self.process_single_file(file_path)
                     combined_data.extend(file_results)
                 else:
                     logger.warning(f"Unsupported file type: {file_path}")
@@ -212,22 +218,27 @@ class FileProcessor:
             logger.error(f"Error processing files: {str(e)}")
         return combined_data
-    def process_single_file(self, file_path: str) -> List[Dict]:
-        """Process a single file and extract its content."""
         results = []
-        try:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
-                content = file.read()
-                if content.strip():
-                    results.append({
-                        "source": "file",
-                        "filename": os.path.basename(file_path),
-                        "content": content,
-                        "timestamp": datetime.now().isoformat()
-                    })
-        except Exception as e:
-            logger.error(f"Error reading file {file_path}: {str(e)}")
         return results
     def _process_single_file(self, file) -> List[Dict]:

             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
+            # Try to find the main content in a more robust way
+            main_content = soup.find('main') or soup.find('article') or soup.body
+            if main_content:
+                text_content = main_content.get_text(separator='\n', strip=True)
+                cleaned_content = self.advanced_text_cleaning(text_content)
+                return {
+                    'content': cleaned_content,
+                    'content_type': response.headers.get('Content-Type', ''),
+                    'timestamp': datetime.now().isoformat()
+                }
+            else:
+                logger.warning(f"No main content found for URL: {url}")
+                return None
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
                     zip_results = self._process_zip_file(file_path)
                     combined_data.extend(zip_results)
                 elif self.is_text_file(file_path):
+                    file_results = self._process_single_file(file_path)
                     combined_data.extend(file_results)
                 else:
                     logger.warning(f"Unsupported file type: {file_path}")
             logger.error(f"Error processing files: {str(e)}")
         return combined_data
+    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+        """Process ZIP file contents"""
         results = []
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    if self.is_text_file(filepath):
+                        try:
+                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                                content = f.read()
+                            if content.strip():
+                                results.append({
+                                    "source": "file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": datetime.now().isoformat()
+                                })
+                        except Exception as e:
+                            logger.error(f"Error reading file {filename}: {str(e)}")
         return results
     def _process_single_file(self, file) -> List[Dict]: