|
|
"""1957_249_949

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1q6DU2jTXfNY0uMxaBV2w2niCrYcsW86S
"""
|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
import os |
|
# Walk the Kaggle input directory tree and print the full path of every file
# found there (prints nothing when the directory does not exist).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)
|
|
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.ensemble import RandomForestRegressor |
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
# Load the internet-usage table from the Colab working directory.
data = pd.read_csv('/content/internet_usage.csv')

# Notebook-style previews: first rows, last rows and summary statistics.
# These are bare expressions, so their results are only displayed when each
# one is the last statement of a notebook cell; in a plain script run they
# are computed and discarded.
data.head()
data.tail()
data.describe()
|
|
|
# --- Cleaning ---------------------------------------------------------------
# Every column after the first two is treated as a per-year usage column.
# NOTE(review): assumes the CSV starts with two identifier columns and that
# the remaining headers are year numbers -- confirm against the data file.
numeric_cols = data.columns[2:]

# Non-numeric placeholders in the year columns become NaN.
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows that have no yearly value at all, then fill the remaining gaps
# with the mean of the corresponding column.
data = data.dropna(subset=numeric_cols, how='all')
data = data.fillna(data.mean(numeric_only=True))

# --- Target -----------------------------------------------------------------
target_year = 2023
target = str(target_year)

# --- Engineered features ----------------------------------------------------
# All year labels as integers (kept for any later code that relies on it).
years = [int(col) for col in numeric_cols]

# The target column must not contribute to the engineered features: averaging
# over it, or using it as the end point of the change, would leak the value
# being predicted into X and make the evaluation metrics meaningless.
history_cols = [col for col in numeric_cols if col != target]
if len(history_cols) < 2:
    raise ValueError(
        "Need at least two year columns besides the target "
        f"{target!r} to build the change features; got {len(history_cols)}."
    )
history_years = [int(col) for col in history_cols]
history = data[history_cols]

# Mean usage over the non-target years.
data['avg_usage'] = history.mean(axis=1)
# Absolute change between the first and last non-target year columns.
data['usage_change'] = history.iloc[:, -1] - history.iloc[:, 0]
# Average change per year over that same span.
data['rate_change'] = data['usage_change'] / (history_years[-1] - history_years[0])

features = ['avg_usage', 'usage_change', 'rate_change']

# Design matrix and regression target.
X = data[features]
y = data[target]
|
|
|
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training split and predict the
# held-out rows.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Regression metrics on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for metric_label, metric_value in (
    ("Mean Squared Error", mse),
    ("Mean Absolute Error", mae),
    ("R-squared", r2),
):
    print(f"{metric_label}: {metric_value}")
|
|
|
# Figure 1: predicted vs. actual values on the held-out rows.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs. Predicted Values")
# Red diagonal from the smallest to the largest actual value: points on this
# line are predicted exactly.
diagonal = [min(y_test), max(y_test)]
plt.plot(diagonal, diagonal, color='red')
plt.show()

# Figure 2: importance the fitted forest assigns to each input feature.
feature_importance = model.feature_importances_
feature_names = X.columns

plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance, y=feature_names)
plt.title("Feature Importance")
plt.show()
|
|
|
def predict_future_usage(model, data, features, future_years):
    """Predict one column of values per requested year.

    Args:
        model: Object exposing ``predict(frame)``; it is called once per
            year on ``data[features]``.
        data: DataFrame holding the feature columns. It is modified in
            place: a column named ``str(year)`` is added (or overwritten)
            for every requested year.
        features: Column labels passed to the model.
        future_years: Iterable of years to produce predictions for.

    Returns:
        dict mapping each year (as given) to a Series of predictions
        aligned with ``data``'s index.

    Note:
        The feature columns are not recomputed between iterations, so a
        deterministic model receives the same input for every year.
    """
    forecasts = {}
    for year in future_years:
        column = str(year)
        # Predict on a snapshot so the model sees the frame as it stood at
        # the start of this iteration.
        snapshot = data.copy()
        snapshot[column] = model.predict(snapshot[features])
        year_forecast = snapshot[column]
        forecasts[year] = year_forecast
        # Publish the new column on the caller's frame as well.
        data[column] = year_forecast
    return forecasts
|
|
|
# Years to forecast with the fitted model.
future_years = [2024, 2025]
future_predictions = predict_future_usage(model, data, features, future_years)

# Show the first few predicted rows for each forecast year.
print("\nFuture Predictions:")
for year, predictions in future_predictions.items():
    print(f"Predictions for {year}:", predictions.head(), sep="\n")