Yago Bolivar committed on
Commit
c5a6e89
·
1 Parent(s): c511b4a

refactor: enhance file identification logic with improved type mapping and error handling

Browse files
Files changed (1) hide show
  1. src/file_processing_tool.py +71 -73
src/file_processing_tool.py CHANGED
@@ -1,95 +1,98 @@
1
  import os
2
- import mimetypes # For a more robust way to guess types
3
 
4
  class FileIdentifier:
5
  def __init__(self):
6
- # Initialize mimetypes database
7
  mimetypes.init()
8
- self.file_type_actions = {
9
- "audio": "speech-to-text",
10
- "spreadsheet": "spreadsheet_parser",
11
- "image": "ocr_vision_reasoning",
12
- "python_code": "safe_code_interpreter",
13
- "pdf": "pdf_text_extractor",
14
- "text": "text_file_reader",
15
- "csv": "csv_parser",
16
- # Add more mappings as needed
 
17
  }
 
 
 
 
 
18
 
19
  def identify_file(self, filepath):
20
  """
21
  Identifies the file type and suggests a processing action.
22
- Returns a dictionary with 'file_type', 'mime_type', and 'suggested_action'.
 
23
  """
24
  if not os.path.exists(filepath):
25
  return {
26
- "error": "File not found",
27
- "filepath": filepath
28
- }
29
- if not os.path.isfile(filepath):
30
- return {
31
- "error": "Path is not a file",
32
- "filepath": filepath
33
  }
34
 
35
- _, extension = os.path.splitext(filepath)
36
- extension = extension.lower()
37
 
38
- # Primary detection by extension (as per downloaded_files.md)
39
- file_type = "unknown"
40
- if extension == ".mp3":
41
- file_type = "audio"
42
- elif extension == ".xlsx":
43
- file_type = "spreadsheet"
44
- elif extension == ".png": # Assuming .png for images as per downloaded_files.md
45
- file_type = "image"
46
- elif extension == ".py":
47
- file_type = "python_code"
48
- elif extension == ".pdf":
49
- file_type = "pdf"
50
- elif extension == ".txt":
51
- file_type = "text"
52
- elif extension == ".csv":
53
- file_type = "csv"
54
 
55
- # Fallback or complementary check using MIME types
56
- mime_type, _ = mimetypes.guess_type(filepath)
57
-
58
- # If extension-based detection was unknown, try to infer from MIME type
59
- if file_type == "unknown" and mime_type:
 
 
60
  if mime_type.startswith("audio/"):
61
- file_type = "audio"
62
- elif mime_type in ["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]:
63
- file_type = "spreadsheet"
64
  elif mime_type.startswith("image/"):
65
- file_type = "image"
66
- elif mime_type in ["text/x-python", "application/x-python-code"]:
67
- file_type = "python_code"
68
  elif mime_type == "application/pdf":
69
- file_type = "pdf"
70
- elif mime_type.startswith("text/plain"):
71
- file_type = "text"
72
  elif mime_type == "text/csv":
73
- file_type = "csv"
74
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- suggested_action = self.file_type_actions.get(file_type, "manual_inspection_required")
77
 
78
  return {
79
  "filepath": filepath,
80
- "extension": extension,
 
81
  "mime_type": mime_type,
82
- "determined_type": file_type,
83
  "suggested_action": suggested_action
84
  }
85
 
86
- # Example Usage:
87
  if __name__ == "__main__":
88
  identifier = FileIdentifier()
89
-
90
- # Create dummy files for testing
91
  dummy_files_dir = "dummy_files_for_test"
92
  os.makedirs(dummy_files_dir, exist_ok=True)
 
93
  test_files_info = {
94
  "audio_sample.mp3": "audio content",
95
  "report_data.xlsx": "excel content",
@@ -98,27 +101,22 @@ if __name__ == "__main__":
98
  "document.pdf": "pdf content",
99
  "notes.txt": "text content",
100
  "data.csv": "col1,col2\n1,2",
101
- "unknown_file.zip": "zip content"
 
102
  }
 
103
  for filename, content in test_files_info.items():
104
  with open(os.path.join(dummy_files_dir, filename), "w") as f:
105
- # For binary files, this is not ideal, but for testing identification it's okay
106
- # Real .xlsx, .png, .mp3, .pdf would be binary
107
- if filename.endswith(('.xlsx', '.png', '.mp3', '.pdf', '.zip')):
108
- with open(os.path.join(dummy_files_dir, filename), "wb") as fb: # Ensure binary files are writable
109
- fb.write(b"dummy binary content") # placeholder
110
- else:
111
- f.write(content)
112
-
113
 
114
  test_filepaths = [os.path.join(dummy_files_dir, f) for f in test_files_info.keys()]
115
- test_filepaths.append("non_existent_file.doc") # Test non-existent file
116
 
117
- for filepath in test_filepaths:
118
- result = identifier.identify_file(filepath)
119
  print(result)
120
 
121
- # Clean up dummy files
122
  # import shutil
123
  # shutil.rmtree(dummy_files_dir)
124
- print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory.")
 
1
  import os
2
+ import mimetypes
3
 
4
  class FileIdentifier:
5
  def __init__(self):
 
6
  mimetypes.init()
7
+ # Mapping from simple type to action and common extensions
8
+ self.file_type_map = {
9
+ "audio": {"action": "speech-to-text", "extensions": [".mp3", ".wav", ".flac", ".aac", ".ogg"]},
10
+ "spreadsheet": {"action": "spreadsheet_parser", "extensions": [".xlsx", ".xls", ".ods"]},
11
+ "image": {"action": "ocr_vision_reasoning", "extensions": [".png", ".jpg", ".jpeg", ".gif", ".bmp"]},
12
+ "python_code": {"action": "safe_code_interpreter", "extensions": [".py"]},
13
+ "pdf": {"action": "pdf_text_extractor", "extensions": [".pdf"]},
14
+ "text": {"action": "text_file_reader", "extensions": [".txt", ".md", ".rtf"]},
15
+ "csv": {"action": "csv_parser", "extensions": [".csv"]},
16
+ # Add more specific types if needed
17
  }
18
+ # For quick lookup from extension to simple type
19
+ self.extension_to_type = {}
20
+ for simple_type, details in self.file_type_map.items():
21
+ for ext in details["extensions"]:
22
+ self.extension_to_type[ext] = simple_type
23
 
24
  def identify_file(self, filepath):
25
  """
26
  Identifies the file type and suggests a processing action.
27
+ Returns a dictionary with 'filepath', 'determined_type', 'mime_type',
28
+ 'suggested_action', or an 'error'.
29
  """
30
  if not os.path.exists(filepath):
31
  return {
32
+ "filepath": filepath,
33
+ "error": "File not found"
 
 
 
 
 
34
  }
35
 
36
+ mime_type, encoding = mimetypes.guess_type(filepath)
37
+ file_extension = os.path.splitext(filepath)[1].lower()
38
 
39
+ determined_type = "unknown"
40
+ suggested_action = "unknown_handler"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ # Prioritize extension-based mapping for specific known types
43
+ if file_extension in self.extension_to_type:
44
+ determined_type = self.extension_to_type[file_extension]
45
+ suggested_action = self.file_type_map[determined_type]["action"]
46
+ elif mime_type:
47
+ # Fallback to MIME type if extension is not specifically mapped
48
+ # This part might need more sophisticated mapping from MIME to your simple types
49
  if mime_type.startswith("audio/"):
50
+ determined_type = "audio"
51
+ suggested_action = self.file_type_map["audio"]["action"]
 
52
  elif mime_type.startswith("image/"):
53
+ determined_type = "image"
54
+ suggested_action = self.file_type_map["image"]["action"]
 
55
  elif mime_type == "application/pdf":
56
+ determined_type = "pdf"
57
+ suggested_action = self.file_type_map["pdf"]["action"]
 
58
  elif mime_type == "text/csv":
59
+ determined_type = "csv"
60
+ suggested_action = self.file_type_map["csv"]["action"]
61
+ elif mime_type.startswith("text/"): # General text
62
+ # Check if it's python by extension, as text/x-python might not always be guessed
63
+ if file_extension == ".py":
64
+ determined_type = "python_code"
65
+ suggested_action = self.file_type_map["python_code"]["action"]
66
+ else:
67
+ determined_type = "text"
68
+ suggested_action = self.file_type_map["text"]["action"]
69
+ elif file_extension == ".xlsx" or mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
70
+ determined_type = "spreadsheet"
71
+ suggested_action = self.file_type_map["spreadsheet"]["action"]
72
+ # Add more MIME-based rules if necessary
73
+
74
+ # If still unknown, but has a common extension not yet caught
75
+ if determined_type == "unknown" and file_extension:
76
+ # A final check for common types if MIME was unhelpful or generic
77
+ if file_extension in self.extension_to_type: # Redundant if first check comprehensive
78
+ determined_type = self.extension_to_type[file_extension]
79
+ suggested_action = self.file_type_map[determined_type]["action"]
80
 
 
81
 
82
  return {
83
  "filepath": filepath,
84
+ "determined_type": determined_type,
85
+ "file_extension": file_extension,
86
  "mime_type": mime_type,
 
87
  "suggested_action": suggested_action
88
  }
89
 
90
+ # Example Usage (optional, can be kept for testing this module directly):
91
  if __name__ == "__main__":
92
  identifier = FileIdentifier()
 
 
93
  dummy_files_dir = "dummy_files_for_test"
94
  os.makedirs(dummy_files_dir, exist_ok=True)
95
+
96
  test_files_info = {
97
  "audio_sample.mp3": "audio content",
98
  "report_data.xlsx": "excel content",
 
101
  "document.pdf": "pdf content",
102
  "notes.txt": "text content",
103
  "data.csv": "col1,col2\n1,2",
104
+ "archive.zip": "zip content", # Example of an unmapped type by default
105
+ "unknown_file.dat": "binary data"
106
  }
107
+
108
  for filename, content in test_files_info.items():
109
  with open(os.path.join(dummy_files_dir, filename), "w") as f:
110
+ f.write(content) # Simple write for testing existence and extension
 
 
 
 
 
 
 
111
 
112
  test_filepaths = [os.path.join(dummy_files_dir, f) for f in test_files_info.keys()]
113
+ test_filepaths.append("non_existent_file.doc")
114
 
115
+ for filepath_to_test in test_filepaths:
116
+ result = identifier.identify_file(filepath_to_test)
117
  print(result)
118
 
119
+ # Consider cleaning up dummy files if you run this main block frequently
120
  # import shutil
121
  # shutil.rmtree(dummy_files_dir)
122
+ print(f"\nNote: Dummy files created in '{dummy_files_dir}'. You may want to remove this directory after testing.")