HF_Agents_Final_Project

Sleeping

App Files Community

Yago Bolivar committed on May 12

Commit

c5a6e89

1 Parent(s): c511b4a

refactor: enhance file identification logic with improved type mapping and error handling

Browse files

Files changed (1) hide show

src/file_processing_tool.py +71 -73

src/file_processing_tool.py CHANGED Viewed

@@ -1,95 +1,98 @@
 import os
-import mimetypes # For a more robust way to guess types
 class FileIdentifier:
     def __init__(self):
-        # Initialize mimetypes database
         mimetypes.init()
-        self.file_type_actions = {
-            "audio": "speech-to-text",
-            "spreadsheet": "spreadsheet_parser",
-            "image": "ocr_vision_reasoning",
-            "python_code": "safe_code_interpreter",
-            "pdf": "pdf_text_extractor",
-            "text": "text_file_reader",
-            "csv": "csv_parser",
-            # Add more mappings as needed
         }
     def identify_file(self, filepath):
         """
         Identifies the file type and suggests a processing action.
-        Returns a dictionary with 'file_type', 'mime_type', and 'suggested_action'.
         """
         if not os.path.exists(filepath):
             return {
-                "error": "File not found",
-                "filepath": filepath
-            }
-        if not os.path.isfile(filepath):
-            return {
-                "error": "Path is not a file",
-                "filepath": filepath
             }
-        _, extension = os.path.splitext(filepath)
-        extension = extension.lower()
-        # Primary detection by extension (as per downloaded_files.md)
-        file_type = "unknown"
-        if extension == ".mp3":
-            file_type = "audio"
-        elif extension == ".xlsx":
-            file_type = "spreadsheet"
-        elif extension == ".png": # Assuming .png for images as per downloaded_files.md
-            file_type = "image"
-        elif extension == ".py":
-            file_type = "python_code"
-        elif extension == ".pdf":
-            file_type = "pdf"
-        elif extension == ".txt":
-            file_type = "text"
-        elif extension == ".csv":
-            file_type = "csv"
-        # Fallback or complementary check using MIME types
-        mime_type, _ = mimetypes.guess_type(filepath)
-        # If extension-based detection was unknown, try to infer from MIME type
-        if file_type == "unknown" and mime_type:
             if mime_type.startswith("audio/"):
-                file_type = "audio"
-            elif mime_type in ["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
-                file_type = "spreadsheet"
             elif mime_type.startswith("image/"):
-                file_type = "image"
-            elif mime_type in ["text/x-python", "application/x-python-code"]:
-                file_type = "python_code"
             elif mime_type == "application/pdf":
-                file_type = "pdf"
-            elif mime_type.startswith("text/plain"):
-                 file_type = "text"
             elif mime_type == "text/csv":
-                file_type = "csv"
-        suggested_action = self.file_type_actions.get(file_type, "manual_inspection_required")
         return {
             "filepath": filepath,
-            "extension": extension,
             "mime_type": mime_type,
-            "determined_type": file_type,
             "suggested_action": suggested_action
         }
-# Example Usage:
 if __name__ == "__main__":
     identifier = FileIdentifier()
-    # Create dummy files for testing
     dummy_files_dir = "dummy_files_for_test"
     os.makedirs(dummy_files_dir, exist_ok=True)
     test_files_info = {
         "audio_sample.mp3": "audio content",
         "report_data.xlsx": "excel content",
@@ -98,27 +101,22 @@ if __name__ == "__main__":
         "document.pdf": "pdf content",
         "notes.txt": "text content",
         "data.csv": "col1,col2\n1,2",
-        "unknown_file.zip": "zip content"
     }
     for filename, content in test_files_info.items():
         with open(os.path.join(dummy_files_dir, filename), "w") as f:
-            # For binary files, this is not ideal, but for testing identification it's okay
-            # Real .xlsx, .png, .mp3, .pdf would be binary
-            if filename.endswith(('.xlsx', '.png', '.mp3', '.pdf', '.zip')):
-                 with open(os.path.join(dummy_files_dir, filename), "wb") as fb: # Ensure binary files are writable
-                    fb.write(b"dummy binary content") # placeholder
-            else:
-                f.write(content)
     test_filepaths = [os.path.join(dummy_files_dir, f) for f in test_files_info.keys()]
-    test_filepaths.append("non_existent_file.doc") # Test non-existent file
-    for filepath in test_filepaths:
-        result = identifier.identify_file(filepath)
         print(result)
-    # Clean up dummy files
     # import shutil
     # shutil.rmtree(dummy_files_dir)
-    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")

 import os
+import mimetypes
 class FileIdentifier:
     def __init__(self):
         mimetypes.init()
+        # Mapping from simple type to action and common extensions
+        self.file_type_map = {
+            "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
+            "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
+            "image": {"action": "ocr_vision_reasoning", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
+            "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
+            "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
+            "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
+            "csv": {"action": "csv_parser", "extensions": [".csv"]},
+            # Add more specific types if needed
         }
+        # For quick lookup from extension to simple type
+        self.extension_to_type = {}
+        for simple_type, details in self.file_type_map.items():
+            for ext in details["extensions"]:
+                self.extension_to_type[ext] = simple_type
     def identify_file(self, filepath):
         """
         Identifies the file type and suggests a processing action.
+        Returns a dictionary with 'filepath', 'determined_type', 'mime_type',
+        'suggested_action', or an 'error'.
         """
         if not os.path.exists(filepath):
             return {
+                "filepath": filepath,
+                "error": "File not found"
             }
+        mime_type, encoding = mimetypes.guess_type(filepath)
+        file_extension = os.path.splitext(filepath)[1].lower()
+        determined_type = "unknown"
+        suggested_action = "unknown_handler"
+        # Prioritize extension-based mapping for specific known types
+        if file_extension in self.extension_to_type:
+            determined_type = self.extension_to_type[file_extension]
+            suggested_action = self.file_type_map[determined_type]["action"]
+        elif mime_type:
+            # Fallback to MIME type if extension is not specifically mapped
+            # This part might need more sophisticated mapping from MIME to your simple types
             if mime_type.startswith("audio/"):
+                determined_type = "audio"
+                suggested_action = self.file_type_map["audio"]["action"]
             elif mime_type.startswith("image/"):
+                determined_type = "image"
+                suggested_action = self.file_type_map["image"]["action"]
             elif mime_type == "application/pdf":
+                determined_type = "pdf"
+                suggested_action = self.file_type_map["pdf"]["action"]
             elif mime_type == "text/csv":
+                determined_type = "csv"
+                suggested_action = self.file_type_map["csv"]["action"]
+            elif mime_type.startswith("text/"): # General text
+                # Check if it's python by extension, as text/x-python might not always be guessed
+                if file_extension == ".py":
+                    determined_type = "python_code"
+                    suggested_action = self.file_type_map["python_code"]["action"]
+                else:
+                    determined_type = "text"
+                    suggested_action = self.file_type_map["text"]["action"]
+            elif file_extension == ".xlsx" or mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+                determined_type = "spreadsheet"
+                suggested_action = self.file_type_map["spreadsheet"]["action"]
+            # Add more MIME-based rules if necessary
+        # If still unknown, but has a common extension not yet caught
+        if determined_type == "unknown" and file_extension:
+             # A final check for common types if MIME was unhelpful or generic
+            if file_extension in self.extension_to_type: # Redundant if first check comprehensive
+                determined_type = self.extension_to_type[file_extension]
+                suggested_action = self.file_type_map[determined_type]["action"]
         return {
             "filepath": filepath,
+            "determined_type": determined_type,
+            "file_extension": file_extension,
             "mime_type": mime_type,
             "suggested_action": suggested_action
         }
+# Example Usage (optional, can be kept for testing this module directly):
 if __name__ == "__main__":
     identifier = FileIdentifier()
     dummy_files_dir = "dummy_files_for_test"
     os.makedirs(dummy_files_dir, exist_ok=True)
     test_files_info = {
         "audio_sample.mp3": "audio content",
         "report_data.xlsx": "excel content",
         "document.pdf": "pdf content",
         "notes.txt": "text content",
         "data.csv": "col1,col2\n1,2",
+        "archive.zip": "zip content", # Example of an unmapped type by default
+        "unknown_file.dat": "binary data"
     }
     for filename, content in test_files_info.items():
         with open(os.path.join(dummy_files_dir, filename), "w") as f:
+            f.write(content) # Simple write for testing existence and extension
     test_filepaths = [os.path.join(dummy_files_dir, f) for f in test_files_info.keys()]
+    test_filepaths.append("non_existent_file.doc")
+    for filepath_to_test in test_filepaths:
+        result = identifier.identify_file(filepath_to_test)
         print(result)
+    # Consider cleaning up dummy files if you run this main block frequently
     # import shutil
     # shutil.rmtree(dummy_files_dir)
+    print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory after testing.")