Alvaro committed on
Commit
787f9a6
·
1 Parent(s): d794d20

Added preprocessing

Browse files
Files changed (2) hide show
  1. output/ufc_fighters_data.csv +0 -0
  2. src/preprocess.py +74 -0
output/ufc_fighters_data.csv CHANGED
The diff for this file is too large to render. See raw diff
 
src/preprocess.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import os
3
+
4
def convert_height_to_cm(height_str):
    """
    Convert a height string of the form 'X ft Y' to whole centimeters.

    Args:
        height_str: Height text such as '5 ft 11' or '6 ft' (the inches
            part is optional and defaults to 0). May be empty or None.

    Returns:
        The height in centimeters as an int, rounded to the nearest
        centimeter with exact halves rounded up (so '6 ft 3', which is
        190.5 cm, yields 191). If the value is empty, does not contain
        'ft', or cannot be parsed as integers, the original value is
        returned unchanged.
    """
    if not height_str or 'ft' not in height_str:
        return height_str

    try:
        parts = height_str.split(' ft')
        feet = int(parts[0].strip())
        inches_str = parts[1].strip()
        # Inches may be missing entirely (e.g. '6 ft').
        inches = int(inches_str) if inches_str else 0
    except (ValueError, IndexError):
        # Unexpected format: hand the original value back untouched.
        return height_str

    total_inches = feet * 12 + inches
    # One inch is exactly 2.54 cm, so work in hundredths of a centimeter with
    # integers. The built-in round() rounds exact halves to the nearest even
    # number, which turned 190.5 cm into 190 but 63.5 cm into 64; adding 50
    # before the floor division rounds every half up consistently.
    return (total_inches * 254 + 50) // 100
25
+
26
def preprocess_fighters_csv(file_path='output/ufc_fighters_data.csv'):
    """
    Convert the 'height' column of the fighters CSV to centimeters in place.

    Reads the whole CSV into memory, replaces each row's 'height' value with
    the result of convert_height_to_cm() stored under 'height_cm', renames
    the header at the same column position, and writes the result back to
    file_path.

    The output is first written to a sibling '<file_path>.tmp' file and then
    moved over the original with os.replace(). A failure while writing (for
    example a row with more fields than the header, which csv.DictWriter
    rejects) therefore leaves the original file untouched instead of
    truncated.

    Args:
        file_path: Path of the CSV file to rewrite.

    Returns:
        None. Progress and errors are reported with print(); exceptions are
        caught and printed rather than propagated.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    tmp_path = f"{file_path}.tmp"
    try:
        # Read all data from the CSV into memory first.
        with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
            reader = csv.DictReader(csv_file)
            # Return if there's no header or the file is empty.
            if not reader.fieldnames:
                print(f"Warning: {file_path} is empty or has no headers.")
                return
            # Copy so the rename below does not mutate the reader's own list.
            headers = list(reader.fieldnames)
            rows = list(reader)

        # Check if there's a 'height' column to process.
        if 'height' not in headers:
            print("No 'height' column found. Nothing to do.")
            return

        # Replace each row's 'height' entry with the converted 'height_cm'.
        for row in rows:
            row['height_cm'] = convert_height_to_cm(row.pop('height', ''))

        # Rename the header in place so the column keeps its position.
        headers[headers.index('height')] = 'height_cm'

        # Write to a temporary sibling and swap it in only once the write
        # has fully succeeded; opening file_path with 'w' directly would
        # truncate the original before any row had been written.
        try:
            with open(tmp_path, 'w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=headers)
                writer.writeheader()
                writer.writerows(rows)
            os.replace(tmp_path, file_path)
        finally:
            # After a successful replace the temp file no longer exists;
            # after a failure this removes the partial output.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        print(f"Successfully processed file: {file_path}")
        print("Converted 'height' column to centimeters and renamed it to 'height_cm'.")

    except Exception as e:
        print(f"An error occurred: {e}")
72
+
73
# Script entry point: preprocess the default fighters CSV when this module
# is executed directly rather than imported.
if __name__ == "__main__":
    preprocess_fighters_csv()