Spaces:

acecalisto3
/

urld

Running

App Files Community

acecalisto3 committed on Mar 19

Commit

345d19b

verified ·

1 Parent(s): 815015e

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -35

app.py CHANGED Viewed

@@ -32,7 +32,7 @@ class URLProcessor:
         self.session = requests.Session()
         self.timeout = 10  # seconds
         self.session.headers.update({
-            'User -Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
@@ -97,7 +97,7 @@ class URLProcessor:
                 logger.error(f"Invalid Google Drive URL: {url}")
                 return None
-            direct_url = f"https://drive.google.com/uc? export=download&id={file_id.group(1)}"
             response = self.session.get(direct_url, timeout=self.timeout)
             response.raise_for_status()
@@ -149,7 +149,7 @@ class URLProcessor:
             else:
                 logger.warning(f"No main content found for URL: {url}")
                 return None
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
@@ -202,7 +202,7 @@ class FileProcessor:
                     zip_results = self._process_zip_file(file_path)
                     combined_data.extend(zip_results)
                 elif self.is_text_file(file_path):
-                    file_results = self._process_single_file(file_path)
                     combined_data.extend(file_results)
                 else:
                     logger.warning(f"Unsupported file type: {file_path}")
@@ -212,29 +212,7 @@ class FileProcessor:
         return combined_data
-    def _process_zip_file(self, zip_path: str) -> List[Dict]:
-        """Process ZIP file contents"""
-        results = []
-        temp_dir = tempfile.mkdtemp()
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(temp_dir)
-            for root, _, files in os.walk(temp_dir):
-                for filename in files:
-                    filepath = os.path.join(root, filename)
-                    if self.is_text_file(filepath):
-                        try:
-                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
-                                content = f.read()
-                                if content.strip():
-                                    results.append({
-                                        "source": "file",
-                                        "filename": filename,
-                                        "content": content,
-                                        "timestamp": datetime.now().isoformat()
-                                    })
-                                except Exception as e:
-                                    logger.error(f"Error reading file {filename}: {str(e)}")
-                                    return results
     def _process_single_file(self, file) -> List[Dict]:
         try:
@@ -255,6 +233,38 @@ class FileProcessor:
         except Exception as e:
             logger.error(f"File processing error: {e}")
             return []
 class Chatbot:
     """Simple chatbot that uses provided JSON data for responses."""
@@ -295,7 +305,7 @@ def create_interface():
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
-                label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
@@ -308,7 +318,7 @@ def create_interface():
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
-                label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
@@ -358,13 +368,13 @@ def create_interface():
                 if file:
                     results.extend(file_processor.process_files(file))
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                     results.append({
                         'source': 'direct_input',
                         'content': cleaned_text,
-                        'timestamp': datetime.now(). isoformat()
                     })
                 if results:
@@ -393,8 +403,8 @@ def create_interface():
             return chatbot.chat(user_input)
         process_btn.click(
-            process_all_inputs,
-            inputs=[url_input, file_input, text_input],
             outputs=[output_file, output_text]
         )
@@ -434,8 +444,8 @@ def main():
         server_port=7860,
         share=False,
         inbrowser=False,  # Disable browser opening in container
-        debug=False       # Disable debug mode for production
     )
 if __name__ == "__main__":
-    main()

         self.session = requests.Session()
         self.timeout = 10  # seconds
         self.session.headers.update({
+            'User-Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
                 logger.error(f"Invalid Google Drive URL: {url}")
                 return None
+            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
             response = self.session.get(direct_url, timeout=self.timeout)
             response.raise_for_status()
             else:
                 logger.warning(f"No main content found for URL: {url}")
                 return None
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
                     zip_results = self._process_zip_file(file_path)
                     combined_data.extend(zip_results)
                 elif self.is_text_file(file_path):
+                    file_results = self._process_single_file(file) # Changed file_path to file
                     combined_data.extend(file_results)
                 else:
                     logger.warning(f"Unsupported file type: {file_path}")
         return combined_data
     def _process_single_file(self, file) -> List[Dict]:
         try:
         except Exception as e:
             logger.error(f"File processing error: {e}")
             return []
+    def _process_zip_file(self, zip_file_path: str) -> List[Dict]:
+        """Process a ZIP file and extract data from text files within."""
+        extracted_data = []
+        try:
+            with zipfile.ZipFile(zip_file_path, 'r') as zf:
+                for name in zf.namelist():
+                    if self.is_text_file(name):
+                        try:
+                            file_info = zf.getinfo(name)
+                            with zf.open(name) as f:
+                                content = f.read().decode('utf-8', errors='ignore')
+                            # Use file_info for file size and date/time
+                            extracted_data.append({
+                                'source': 'zip',
+                                'filename': name,
+                                'file_size': file_info.file_size,  # Get file size from ZipInfo
+                                'mime_type': mimetypes.guess_type(name)[0],
+                                'created': datetime(*file_info.date_time).isoformat(), # Get date from ZipInfo
+                                'modified': datetime(*file_info.date_time).isoformat(),
+                                'content': content,
+                                'timestamp': datetime.now().isoformat()
+                            })
+                        except Exception as e:
+                            logger.error(f"Error processing file {name} from ZIP: {e}")
+        except zipfile.BadZipFile:
+            logger.error(f"Error: {zip_file_path} is not a valid ZIP file.")
+        except Exception as e:
+            logger.error(f"Error processing ZIP file {zip_file_path}: {e}")
+        return extracted_data
 class Chatbot:
     """Simple chatbot that uses provided JSON data for responses."""
         with gr.Tab("URL Processing"):
             url_input = gr.Textbox(
+                label="Enter URLs (comma or newline separated)",
                 lines=5,
                 placeholder="https://example1.com\nhttps://example2.com"
             )
         with gr.Tab("Text Input"):
             text_input = gr.Textbox(
+                label="Raw Text Input",
                 lines=5,
                 placeholder="Paste your text here..."
             )
                 if file:
                     results.extend(file_processor.process_files(file))
                 if text:
                     cleaned_text = processor.advanced_text_cleaning(text)
                     results.append({
                         'source': 'direct_input',
                         'content': cleaned_text,
+                        'timestamp': datetime.now().isoformat()
                     })
                 if results:
             return chatbot.chat(user_input)
         process_btn.click(
+            process_all_inputs,
+            inputs=[url_input, file_input, text_input],
             outputs=[output_file, output_text]
         )
         server_port=7860,
         share=False,
         inbrowser=False,  # Disable browser opening in container
+        debug=False      # Disable debug mode for production
     )
 if __name__ == "__main__":
+    main()