Spaces:
Paused
Paused
| import pandas as pd | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.preprocessing import LabelEncoder | |
def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame and return it."""
    frame = pd.read_csv(file_path)
    return frame
def handle_missing_values(df):
    """Fill missing values in ``df`` in place and return it.

    Columns of dtype ``float64``/``int64`` have their missing entries
    replaced by the column median and come back as ``float64``. Columns of
    dtype ``object`` have their missing entries replaced by the most
    frequent value; ties go to the first value in sorted order.

    A column with no observed values has nothing to impute from and is left
    untouched, and a frame lacking either kind of column is handled without
    error.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, with missing values filled.
    """
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_cols:
        # Cast first so every selected numeric column ends up float64,
        # whether or not it actually needed filling.
        values = df[col].astype('float64')
        median = values.median()  # skips NaN; NaN only if nothing observed
        if not pd.isna(median):
            values = values.fillna(median)
        df[col] = values

    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        # mode() ignores missing entries and returns its results sorted, so
        # the first entry is a deterministic choice among ties.
        modes = df[col].mode()
        if not modes.empty:
            # astype(object) keeps the column's dtype stable even when the
            # filled values could be inferred as something narrower.
            df[col] = df[col].fillna(modes.iloc[0]).astype(object)
    return df
def encode_categorical_variables(df):
    """Label-encode every object-dtype column of ``df``.

    Each selected column is overwritten in place with the output of
    ``LabelEncoder.fit_transform``; the encoder is re-fitted for every
    column, so codes are independent from one column to the next.

    Args:
        df: DataFrame whose object-dtype columns should be encoded.

    Returns:
        The same DataFrame object, with those columns replaced.
    """
    encoder = LabelEncoder()
    for name in df.select_dtypes(include=['object']).columns:
        encoded = encoder.fit_transform(df[name])
        df[name] = encoded
    return df
def preprocess_data(file_path):
    """Run the whole preprocessing pipeline on the CSV at ``file_path``.

    The file is loaded, its missing values are handled, and its categorical
    columns are encoded, in that order.

    Args:
        file_path: Location of the CSV file to process.

    Returns:
        The processed DataFrame.
    """
    raw = load_data(file_path)
    imputed = handle_missing_values(raw)
    return encode_categorical_variables(imputed)
if __name__ == "__main__":
    # Local import: only the command-line entry point needs argparse.
    import argparse

    parser = argparse.ArgumentParser(
        description="Impute missing values and label-encode a CSV dataset."
    )
    # Both paths are optional positionals whose defaults are the placeholder
    # file names, so the script can still be run with no arguments.
    parser.add_argument(
        "input_csv",
        nargs="?",
        default='path_to_your_data.csv',
        help="CSV file to preprocess (default: %(default)s)",
    )
    parser.add_argument(
        "output_csv",
        nargs="?",
        default='processed_data.csv',
        help="where to write the processed CSV (default: %(default)s)",
    )
    args = parser.parse_args()

    file_path = args.input_csv
    processed_data = preprocess_data(file_path)
    # index=False: the row index is not written to the output file.
    processed_data.to_csv(args.output_csv, index=False)