Spaces:

cpv2280
/

population-prediction

Sleeping

File size: 5,960 Bytes

import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler


# Deserialize the fitted estimators and preprocessing objects once, at import
# time, so every request reuses the same in-memory instances.
# NOTE(review): joblib.load unpickles its input; these artifacts are presumably
# shipped with the app — confirm they never come from an untrusted source.
(
    linear_model,
    poly_model,
    poly_features,
    scaler,  # per the original note, this is the saved StandardScaler
    knn_model,
    randforests_model,
) = [
    joblib.load(artifact_path)
    for artifact_path in (
        "linear_model.pkl",
        "poly_model.pkl",
        "poly_features.pkl",
        "scaler.pkl",
        "KNN.pkl",
        "random_forests.pkl",
    )
]

def load_data(file):
    """Read a CSV and return its first five rows as a preview.

    Args:
        file: Passed straight to ``pandas.read_csv`` (a path or a file-like
            object).

    Returns:
        A DataFrame with at most the first five rows of the CSV.

    The preview rows and the column dtypes are also printed to stdout as
    debugging output.
    """
    frame = pd.read_csv(file)
    preview = frame.head()  # head() defaults to the first 5 rows
    print("DEBUG: CSV Data in Gradio:\n", preview)
    print("DEBUG: Data Types in Gradio:\n", frame.dtypes)
    return preview

def plot_population_trend(file, model_choice):
    """Plot actual population against the selected model's predictions.

    The chart is written to ``population_trend.png`` (relative to the current
    working directory), overwriting any previous file of that name.

    Args:
        file: Passed straight to ``pandas.read_csv`` (a path or a file-like
            object).
        model_choice: ``"Linear Regression"``, ``"Polynomial Regression"`` or
            ``"KNN"``; any other value selects the random-forest model.

    Returns:
        The path of the saved image, ``"population_trend.png"``.

    Raises:
        ValueError: If the CSV has fewer than two columns.
    """
    df = pd.read_csv(file)
    if df.shape[1] < 2:
        raise ValueError("CSV must contain two columns (Year, Population).")

    # Prefer the named columns; otherwise fall back to the first two columns
    # by position, which is how predict_population interprets the same file.
    if {"Year", "Population"}.issubset(df.columns):
        years, population = df["Year"], df["Population"]
    else:
        years, population = df.iloc[:, 0], df.iloc[:, 1]

    X = years.values.reshape(-1, 1)  # estimators expect a 2-D feature array

    # Linear and polynomial models are fed scaler-transformed years; KNN and
    # random forests receive the raw years.
    if model_choice == "Linear Regression":
        predictions = linear_model.predict(scaler.transform(X))
        line_kwargs = {
            "label": "Linear Regression",
            "color": "red",
            "linestyle": "dashed",
        }
    elif model_choice == "Polynomial Regression":
        X_poly = poly_features.transform(scaler.transform(X))
        predictions = poly_model.predict(X_poly)
        line_kwargs = {"label": "Polynomial Regression", "color": "green"}
    elif model_choice == "KNN":
        predictions = knn_model.predict(X)
        line_kwargs = {"label": "KNN", "color": "blue"}
    else:  # Random Forests
        predictions = randforests_model.predict(X)
        line_kwargs = {"label": "Random Forests", "color": "yellow"}

    # Draw on an explicit figure and always close it: pyplot keeps every
    # figure alive until closed, which leaks memory in a long-running server.
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        ax.scatter(
            years,
            population,
            label="Actual Population",
            color="blue",
            alpha=0.6,
        )
        ax.plot(years, predictions, **line_kwargs)
        ax.set_xlabel("Year")
        ax.set_ylabel("Population")
        ax.set_title(f"Population Growth Prediction ({model_choice})")
        ax.legend()
        ax.grid(True)
        fig.savefig("population_trend.png")
    finally:
        plt.close(fig)

    return "population_trend.png"

def predict_population(file, model_choice, test_years=(2016, 2020)):
    """Predict population for every year in the CSV and score the test window.

    The first two CSV columns are interpreted as Year and Population,
    regardless of their header names; any further columns are ignored.

    Args:
        file: Passed straight to ``pandas.read_csv`` (a path or a file-like
            object).
        model_choice: ``"Linear Regression"``, ``"Polynomial Regression"`` or
            ``"KNN"``; any other value selects the random-forest model.
        test_years: Inclusive ``(first, last)`` year range used to compute the
            metrics. Defaults to ``(2016, 2020)``.

    Returns:
        A 3-tuple ``(dataframe, image_path, message)``:

        * on success, the DataFrame with an added ``"Predicted Population"``
          column, the fixed path ``"population_trend.png"``, and a summary of
          MSE and R² over the test window (or a note that the window holds no
          rows);
        * on invalid input, ``(None, None, "ERROR: ...")``. The message sits in
          the same slot as the success summary so callers that read the third
          element always receive the text to display.
    """
    df = pd.read_csv(file)

    # Ensure correct column format
    if df.shape[1] < 2:
        return None, None, "ERROR: CSV must contain two columns (Year, Population)."

    # Keep only the first two columns so extra columns cannot break the rename.
    df = df.iloc[:, :2].copy()
    df.columns = ["Year", "Population"]
    try:
        df = df.astype({"Year": int, "Population": float})
    except (ValueError, TypeError) as exc:
        return None, None, f"ERROR: Year and Population must be numeric ({exc})."

    X = df["Year"].values.reshape(-1, 1)  # estimators expect a 2-D feature array

    if model_choice == "Linear Regression":
        # The linear model is fed scaler-transformed years, exactly as
        # plot_population_trend does.
        # NOTE(review): an earlier comment here said not to scale for linear
        # regression — confirm against how the model was trained.
        predictions = linear_model.predict(scaler.transform(X))
    elif model_choice == "Polynomial Regression":
        X_poly = poly_features.transform(scaler.transform(X))
        predictions = poly_model.predict(X_poly)
    elif model_choice == "KNN":
        predictions = knn_model.predict(X)
    else:  # Random Forests
        predictions = randforests_model.predict(X)

    df["Predicted Population"] = predictions

    print("DEBUG: Model Choice =", model_choice)
    print("DEBUG: X Values for Prediction:\n", X[:5])  # Print first 5 inputs
    print("DEBUG: Predictions:\n", predictions[:5])  # Print first 5 predictions

    # Score only the rows that fall inside the held-out year window.
    first_year, last_year = test_years
    test_rows = df[df["Year"].between(first_year, last_year)]
    if test_rows.empty:
        # The metric functions reject empty input, so report instead of crashing.
        return (
            df,
            "population_trend.png",
            f"{model_choice} Results: no rows between {first_year} and "
            f"{last_year} to evaluate.",
        )

    y_test = test_rows["Population"].values
    y_pred_test = test_rows["Predicted Population"].values
    mse = mean_squared_error(y_test, y_pred_test)
    r2 = r2_score(y_test, y_pred_test)

    return df, "population_trend.png", f"{model_choice} Results: MSE = {mse:.2f}, R² Score = {r2:.2f}"



def gradio_interface(file, model_choice):
    """Run the preview, plotting and prediction steps for one Gradio request.

    Args:
        file: The uploaded CSV, forwarded unchanged to each step.
        model_choice: The model name picked in the UI, forwarded unchanged.

    Returns:
        A 4-tuple ``(preview, trend_image, predictions, performance)`` in the
        order of the interface's output components. The second element of
        ``predict_population``'s result is discarded; the image path comes from
        ``plot_population_trend``.
    """
    head_rows = load_data(file)
    chart_path = plot_population_trend(file, model_choice)
    prediction_result = predict_population(file, model_choice)
    return head_rows, chart_path, prediction_result[0], prediction_result[2]


# Build the single-page Gradio UI: one CSV upload plus a model selector feed
# gradio_interface, whose four return values fill the four output components
# in the order listed below.
interface = gr.Interface(
    title="Population Prediction Tool",
    description="Upload a CSV file with Year and Population data. Choose a model (Linear or Polynomial Regression, KNN or Random Forests) to predict future population trends.",
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload CSV File"),
        gr.Radio(
            ["Linear Regression", "Polynomial Regression","KNN", "Random Forests"],
            label="Choose Model",
        ),
    ],
    outputs=[
        gr.Dataframe(label="Preview Data"),
        gr.Image(label="Population Trend"),
        gr.Dataframe(label="Predictions"),
        gr.Textbox(label="Model Performance"),
    ],
)

# Start serving the app.
interface.launch()