# Listing metadata: file size 4,246 bytes, revision f75de9a.
# --- Imports and global plotting configuration ------------------------------
from ast import increment_lineno  # NOTE(review): unused in the visible script
import warnings

import matplotlib
matplotlib.use('Agg')  # pick the Agg backend before pyplot is imported
import matplotlib.pyplot as plt
plt.switch_backend('Agg')
import numpy as np
import pandas as pd
import seaborn as sns

# The `statistics` module has no `LinearRegression` name, so the previous
# `from statistics import LinearRegression` raised ImportError before any
# analysis ran. The estimator instantiated further down is scikit-learn's.
from sklearn.linear_model import LinearRegression

# Silence library warnings so the console output stays readable.
warnings.filterwarnings('ignore')
sns.set(style='whitegrid')
print('Import and setup completed successfully.')
# --- Load the sleep dataset and print a first overview ----------------------
# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact file exists.
file_path = r'C:\Users\Donte Patton\Downloads\dataset_2191_sleep.csv'
df = pd.read_csv(file_path, encoding='ascii', delimiter=',')
print('Dataset loaded successfully. Showing first few rows:')
print(df.head())

print('Dataset Info:')
df.info()

print('\nMissing values in each column:')
print(df.isnull().sum())

# Keep only rows without missing values for the rest of the analysis.
df = df.dropna()
print('\nDataframe shape after dropping missing values:', df.shape)

# No dtype conversion happens in this section, so the label no longer
# claims one.
print('\nData types:')
print(df.dtypes)
# --- Correlation heatmap of the numeric columns -----------------------------
numeric_df = df.select_dtypes(include=[np.number])
# Only draw the heatmap when there are at least four numeric columns.
if numeric_df.shape[1] >= 4:
    plt.figure(figsize=(12, 10))
    corr = numeric_df.corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap of Numeric Variables')
    plt.show()
else:
    print('Not enough numeric columns for a correlation heatmap.')
# --- Pair plot of the float64/int64 columns ---------------------------------
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
# A pair plot needs at least two columns to compare against each other.
if len(numeric_cols) > 1:
    sns.pairplot(df[numeric_cols])
    # y > 1 places the title above the grid of subplots.
    plt.suptitle('Pair Plot of Numeric Features', y=1.02)
    plt.show()
else:
    print('Not enough numeric columns for pair plot.')
# --- Histogram (with KDE overlay) of the body_weight column -----------------
_, hist_ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['body_weight'], kde=True, bins=30, ax=hist_ax)
hist_ax.set_title('Distribution of Body Weight')
hist_ax.set_xlabel('Body Weight (kg)')
hist_ax.set_ylabel('Frequency')
plt.show()
# --- Bar chart: mean body_weight for each predation_index value -------------
_, bar_ax = plt.subplots(figsize=(10, 6))
body_weight_by_predation = (
    df.groupby('predation_index', as_index=False)['body_weight'].mean()
)
# NOTE(review): recent seaborn releases reportedly deprecate `palette`
# without `hue`; confirm against the installed version.
sns.barplot(
    data=body_weight_by_predation,
    x='predation_index',
    y='body_weight',
    palette='viridis',
    ax=bar_ax,
)
bar_ax.set_title('Average Body Weight by Predation Index')
bar_ax.set_xlabel('Predation Index')
bar_ax.set_ylabel('Average Body Weight (kg)')
plt.show()
# --- Count plot: number of rows for each predation_index value --------------
_, count_ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='predation_index', palette='Set2', ax=count_ax)
count_ax.set_title('Count of Records by Predation Index')
count_ax.set_xlabel('Predation Index')
count_ax.set_ylabel('Count')
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# --- Linear-regression baseline: predict total_sleep ------------------------
# Single source of truth for the predictor columns; the same list is used to
# clean the data and to build X, so the two cannot drift apart.
features = ['body_weight', 'brain_weight', 'predation_index', 'sleep_exposure_index', 'danger_index']
target = 'total_sleep'
model_columns = features + [target]

# Work on a copy so the exploratory dataframe `df` is left untouched.
model_df = df.copy()
# Force every modelling column to a numeric dtype; entries that cannot be
# parsed become NaN instead of reaching the estimator as strings.
model_df[model_columns] = model_df[model_columns].apply(pd.to_numeric, errors='coerce')
# Only rows missing a modelling value are unusable; other columns are not
# consulted by the model and should not cost us samples.
model_df = model_df.dropna(subset=model_columns)

X = model_df[features]
y = model_df[target]

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Training set shape:', X_train.shape)
print('Testing set shape:', X_test.shape)

# Fit ordinary least squares on the training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Score the held-out split.
y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'R^2 score for the predictor: {r2:.3f}')
print(f'RMSE for the predictor: {rmse:.3f}')
# --- Scatter of actual vs predicted total_sleep on the test split -----------
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5, color='teal')
# Dashed y = x reference line: a point on it is a perfect prediction.
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, 'r--')
# The model predicts total_sleep; the labels previously said "CO2", left
# over from a different analysis.
plt.xlabel('Actual Total Sleep')
plt.ylabel('Predicted Total Sleep')
plt.title('Actual vs Predicted Total Sleep')
plt.show()