Spaces:
Running
Running
Alvaro
committed on
Commit
·
787f9a6
1
Parent(s):
d794d20
Added preprocessing
Browse files- output/ufc_fighters_data.csv +0 -0
- src/preprocess.py +74 -0
output/ufc_fighters_data.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
src/preprocess.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import os
|
3 |
+
|
4 |
+
def convert_height_to_cm(height_str):
    """
    Turn a height written as 'X ft Y' (feet, then optional inches) into
    whole centimeters.

    The input is handed back unchanged when it is empty/falsy, does not
    contain 'ft', or cannot be parsed as integer feet and inches.
    """
    if height_str and 'ft' in height_str:
        pieces = height_str.split(' ft')
        try:
            feet_text = pieces[0].strip()
            inches_text = pieces[1].strip()
            whole_feet = int(feet_text)
            # A bare '6 ft' has nothing after the unit: treat it as 0 inches.
            extra_inches = int(inches_text) if inches_text else 0
        except (ValueError, IndexError):
            # Unparseable value: leave it exactly as it was.
            return height_str
        # 12 inches per foot, 2.54 cm per inch, rounded to the nearest cm.
        return round((whole_feet * 12 + extra_inches) * 2.54)

    return height_str
|
25 |
+
|
26 |
+
def preprocess_fighters_csv(file_path='output/ufc_fighters_data.csv'):
    """
    Rewrite the fighters CSV in place with heights expressed in centimeters.

    Every row of ``file_path`` is read into memory, the 'height' column is
    replaced (at the same position) by a 'height_cm' column whose values come
    from ``convert_height_to_cm``, and the result is saved back to
    ``file_path``.

    The new contents are first written to a sibling '<file_path>.tmp' file
    and only moved over the original with ``os.replace`` once the write has
    fully succeeded. A failure while writing (for example a row that has more
    fields than the header, which ``csv.DictWriter`` rejects) therefore
    leaves the original file intact instead of truncated.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        None. Status and error messages are printed to stdout; errors are
        reported, not raised.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    temp_path = f"{file_path}.tmp"

    try:
        # Read all data from the CSV into memory first
        with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
            reader = csv.DictReader(csv_file)
            # Return if there's no header or the file is empty
            if not reader.fieldnames:
                print(f"Warning: {file_path} is empty or has no headers.")
                return
            # Copy so the rename below does not mutate the reader's own list
            headers = list(reader.fieldnames)
            rows = list(reader)

        # Check if there's a 'height' column to process
        if 'height' not in headers:
            print("No 'height' column found. Nothing to do.")
            return

        # Swap the old key for the converted one in every row
        for row in rows:
            row['height_cm'] = convert_height_to_cm(row.pop('height', ''))

        # Rename the header in place so the column keeps its position
        headers[headers.index('height')] = 'height_cm'

        try:
            # Write to a temporary sibling file; the original stays untouched
            # until the complete output is safely on disk.
            with open(temp_path, 'w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=headers)
                writer.writeheader()
                writer.writerows(rows)
            # Swap the finished file over the original in one step
            os.replace(temp_path, file_path)
        except Exception:
            # Drop the partial temp file, then let the outer handler report
            if os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError:
                    pass
            raise

        print(f"Successfully processed file: {file_path}")
        print("Converted 'height' column to centimeters and renamed it to 'height_cm'.")

    except Exception as e:
        # Script-level boundary: report the problem instead of crashing
        print(f"An error occurred: {e}")
|
72 |
+
|
73 |
+
# Run the preprocessing on the default CSV path when executed as a script.
if __name__ == "__main__":
    preprocess_fighters_csv()
|