Spaces:

Mihkelmj
/

utrecht-pollution-prediction

Sleeping

App Files Files Community

Aksel Joonas Reedi committed on Oct 31, 2024

Commit

8db7b4c

1 Parent(s): 35377fa

typehints

Browse files

Files changed (6) hide show

app.py +28 -19
src/data_api_calls.py +23 -6
src/features_pipeline.py +38 -6
src/helper_functions.py +53 -9
src/past_data_api_calls.py +20 -4
src/predict.py +57 -10

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import altair as alt
 import pandas as pd
 import plotly.graph_objects as go
 import streamlit as st
 from src.helper_functions import custom_metric_box, pollution_box
 from src.predict import get_data_and_predictions, update_data_and_predictions
@@ -45,8 +44,7 @@ col1, col2 = st.columns((1, 3))
 # Create a 3-column layout
 with col1:
     st.subheader("Current Weather")
     custom_metric_box(
         label="🥵 Temperature",
         value=f"{round(today['mean_temp'] * 0.1)} °C",
@@ -76,38 +74,38 @@ with col1:
 with col2:
     st.subheader("Current Pollution Levels")
     sub1, sub2 = st.columns((1, 1))
     # Ozone (O₃) Pollution Box
     with sub1:
         pollution_box(
             label="O<sub>3</sub>",
             value=f"{round(today['O3'])} µg/m³",
             delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³",
-            threshold=120
         )
         with st.expander("Learn more about O3", expanded=False):
             st.markdown(
                 """
                 *Ozone (O<sub>3</sub>)*: A harmful gas at ground level that can irritate the respiratory system and aggravate asthma.<br>
                 **Good/Bad**: "Good" means safe levels for most people, while "Bad" suggests harmful levels, especially for sensitive groups.
-                """,
                 unsafe_allow_html=True,
             )
     # Nitrogen Dioxide (NO₂) Pollution Box
     with sub2:
         pollution_box(
             label="NO<sub>2</sub>",
             value=f"{round(today['NO2'])} µg/m³",
             delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³",
-            threshold=40
         )
         with st.expander("Learn more about NO2", expanded=False):
             st.markdown(
                 """
                 *Nitrogen Dioxide (NO<sub>2</sub>)*: A toxic gas that contributes to lung irritation and worsens asthma and other respiratory issues.<br>
                 **Good/Bad**: "Good" means safe air quality, while "Bad" indicates levels that could cause respiratory problems, especially for vulnerable individuals.
-                """,
                 unsafe_allow_html=True,
             )
@@ -118,9 +116,12 @@ with col2:
     def get_simple_color_scale(values, threshold):
         """Returns green for values below the threshold, orange for values between the threshold and 2x the threshold, and red for values above 2x the threshold."""
         return [
-            "#77C124" if v < threshold else
-            "#E68B0A" if v < 2 * threshold else
-            "#E63946" for v in values
         ]
     # O3 Bar Plot (threshold: 40)
@@ -142,13 +143,17 @@ with col2:
     )
     # Add predicted values with reduced opacity
-    predicted_o3_colors = get_simple_color_scale(o3_future_values, 40)  # Color for future values
     fig_o3.add_trace(
         go.Bar(
             x=df["Date"][-3:],  # Dates for predicted values
             y=o3_future_values,
             name="O3 Predicted",
-            marker=dict(color=predicted_o3_colors, opacity=0.5),  # Set opacity to 0.5 for predictions
             hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
         )
     )
@@ -179,7 +184,7 @@ with col2:
             tickangle=-45,
             tickcolor="gray",
         ),
-        showlegend=False  # Disable legend
     )
     st.plotly_chart(fig_o3, key="fig_o3")
@@ -204,13 +209,17 @@ with col2:
     )
     # Add predicted values with reduced opacity
-    predicted_no2_colors = get_simple_color_scale(no2_future_values, 120)  # Color for future values
     fig_no2.add_trace(
         go.Bar(
             x=df["Date"][-3:],  # Dates for predicted values
             y=no2_future_values,
             name="NO2 Predicted",
-            marker=dict(color=predicted_no2_colors, opacity=0.5),  # Set opacity to 0.5 for predictions
             hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
         )
     )
@@ -241,7 +250,7 @@ with col2:
             tickangle=-45,
             tickcolor="gray",
         ),
-        showlegend=False  # Disable legend
     )
-    st.plotly_chart(fig_no2, key="fig_no2")

 import pandas as pd
 import plotly.graph_objects as go
 import streamlit as st
 from src.helper_functions import custom_metric_box, pollution_box
 from src.predict import get_data_and_predictions, update_data_and_predictions
 # Create a 3-column layout
 with col1:
     st.subheader("Current Weather")
     custom_metric_box(
         label="🥵 Temperature",
         value=f"{round(today['mean_temp'] * 0.1)} °C",
 with col2:
     st.subheader("Current Pollution Levels")
     sub1, sub2 = st.columns((1, 1))
     # Ozone (O₃) Pollution Box
     with sub1:
         pollution_box(
             label="O<sub>3</sub>",
             value=f"{round(today['O3'])} µg/m³",
             delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³",
+            threshold=120,
         )
         with st.expander("Learn more about O3", expanded=False):
             st.markdown(
                 """
                 *Ozone (O<sub>3</sub>)*: A harmful gas at ground level that can irritate the respiratory system and aggravate asthma.<br>
                 **Good/Bad**: "Good" means safe levels for most people, while "Bad" suggests harmful levels, especially for sensitive groups.
+                """,
                 unsafe_allow_html=True,
             )
     # Nitrogen Dioxide (NO₂) Pollution Box
     with sub2:
         pollution_box(
             label="NO<sub>2</sub>",
             value=f"{round(today['NO2'])} µg/m³",
             delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³",
+            threshold=40,
         )
         with st.expander("Learn more about NO2", expanded=False):
             st.markdown(
                 """
                 *Nitrogen Dioxide (NO<sub>2</sub>)*: A toxic gas that contributes to lung irritation and worsens asthma and other respiratory issues.<br>
                 **Good/Bad**: "Good" means safe air quality, while "Bad" indicates levels that could cause respiratory problems, especially for vulnerable individuals.
+                """,
                 unsafe_allow_html=True,
             )
     def get_simple_color_scale(values, threshold):
         """Returns green for values below the threshold, orange for values between the threshold and 2x the threshold, and red for values above 2x the threshold."""
         return [
+            "#77C124"
+            if v < threshold
+            else "#E68B0A"
+            if v < 2 * threshold
+            else "#E63946"
+            for v in values
         ]
     # O3 Bar Plot (threshold: 40)
     )
     # Add predicted values with reduced opacity
+    predicted_o3_colors = get_simple_color_scale(
+        o3_future_values, 40
+    )  # Color for future values
     fig_o3.add_trace(
         go.Bar(
             x=df["Date"][-3:],  # Dates for predicted values
             y=o3_future_values,
             name="O3 Predicted",
+            marker=dict(
+                color=predicted_o3_colors, opacity=0.5
+            ),  # Set opacity to 0.5 for predictions
             hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
         )
     )
             tickangle=-45,
             tickcolor="gray",
         ),
+        showlegend=False,  # Disable legend
     )
     st.plotly_chart(fig_o3, key="fig_o3")
     )
     # Add predicted values with reduced opacity
+    predicted_no2_colors = get_simple_color_scale(
+        no2_future_values, 120
+    )  # Color for future values
     fig_no2.add_trace(
         go.Bar(
             x=df["Date"][-3:],  # Dates for predicted values
             y=no2_future_values,
             name="NO2 Predicted",
+            marker=dict(
+                color=predicted_no2_colors, opacity=0.5
+            ),  # Set opacity to 0.5 for predictions
             hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
         )
     )
             tickangle=-45,
             tickcolor="gray",
         ),
+        showlegend=False,  # Disable legend
     )
+    st.plotly_chart(fig_no2, key="fig_no2")

src/data_api_calls.py CHANGED Viewed

@@ -14,7 +14,11 @@ WEATHER_DATA_FILE = "weather_data.csv"
 POLLUTION_DATA_FILE = "pollution_data.csv"
-def update_weather_data():
     today = date.today().isoformat()
     if os.path.exists(WEATHER_DATA_FILE):
@@ -50,7 +54,11 @@ def update_weather_data():
         sys.exit()
-def update_pollution_data():
     O3 = []
     NO2 = []
     particles = ["NO2", "O3"]
@@ -113,14 +121,21 @@ def update_pollution_data():
     updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
-def get_combined_data():
     weather_df = pd.read_csv(WEATHER_DATA_FILE)
     today = pd.Timestamp.now().normalize()
     seven_days_ago = today - pd.Timedelta(days=7)
     weather_df["date"] = pd.to_datetime(weather_df["date"])
-    weather_df = weather_df[(weather_df["date"] >= seven_days_ago) & (weather_df["date"] <= today)]
     weather_df.insert(1, "NO2", None)
     weather_df.insert(2, "O3", None)
@@ -168,7 +183,9 @@ def get_combined_data():
     pollution_df = pd.read_csv(POLLUTION_DATA_FILE)
     pollution_df["date"] = pd.to_datetime(pollution_df["date"])
-    pollution_df = pollution_df[(pollution_df["date"] >= seven_days_ago) & (pollution_df["date"] <= today)]
     combined_df["NO2"] = pollution_df["NO2"]
     combined_df["O3"] = pollution_df["O3"]

 POLLUTION_DATA_FILE = "pollution_data.csv"
+def update_weather_data() -> None:
+    """
+    Updates weather data by fetching data.
+    If the data file exists, it appends new data. If not, it creates a new file.
+    """
     today = date.today().isoformat()
     if os.path.exists(WEATHER_DATA_FILE):
         sys.exit()
+def update_pollution_data() -> None:
+    """
+    Updates pollution data for NO2 and O3.
+    The new data is appended to the existing pollution data file.
+    """
     O3 = []
     NO2 = []
     particles = ["NO2", "O3"]
     updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
+def get_combined_data() -> pd.DataFrame:
+    """
+    Combines weather and pollution data for the last 7 days.
+    Returns:
+        pd.DataFrame: A DataFrame containing the combined weather and pollution data.
+    """
     weather_df = pd.read_csv(WEATHER_DATA_FILE)
     today = pd.Timestamp.now().normalize()
     seven_days_ago = today - pd.Timedelta(days=7)
     weather_df["date"] = pd.to_datetime(weather_df["date"])
+    weather_df = weather_df[
+        (weather_df["date"] >= seven_days_ago) & (weather_df["date"] <= today)
+    ]
     weather_df.insert(1, "NO2", None)
     weather_df.insert(2, "O3", None)
     pollution_df = pd.read_csv(POLLUTION_DATA_FILE)
     pollution_df["date"] = pd.to_datetime(pollution_df["date"])
+    pollution_df = pollution_df[
+        (pollution_df["date"] >= seven_days_ago) & (pollution_df["date"] <= today)
+    ]
     combined_df["NO2"] = pollution_df["NO2"]
     combined_df["O3"] = pollution_df["O3"]

src/features_pipeline.py CHANGED Viewed

@@ -6,7 +6,6 @@ import numpy as np
 import pandas as pd
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download, login
 from src.past_data_api_calls import get_past_combined_data
 warnings.filterwarnings("ignore")
@@ -16,11 +15,44 @@ login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
 def create_features(
-    data,
-    target_particle,  # Added this parameter
-    lag_days=7,
-    sma_days=7,
-):
     lag_features = [
         "NO2",
         "O3",

 import pandas as pd
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download, login
 from src.past_data_api_calls import get_past_combined_data
 warnings.filterwarnings("ignore")
 def create_features(
+    data: pd.DataFrame,
+    target_particle: str,  # Added this parameter
+    lag_days: int = 7,
+    sma_days: int = 7,
+) -> pd.DataFrame:
+    """
+    Create features for predicting air quality particles (NO2 or O3) based on historical weather data.
+    This function performs several feature engineering tasks, including:
+    - Creating lagged features for specified pollutants.
+    - Calculating rolling mean (SMA) features.
+    - Adding sine and cosine transformations of the weekday and month.
+    - Incorporating historical data for the same date in the previous year.
+    Parameters:
+    ----------
+    data : pd.DataFrame
+        A DataFrame containing historical weather and air quality data with a 'date' column.
+    target_particle : str
+        The target particle for prediction, must be either 'O3' or 'NO2'.
+    lag_days : int, optional
+        The number of days for which lagged features will be created. Default is 7.
+    sma_days : int, optional
+        The window size for calculating the simple moving average (SMA). Default is 7.
+    Returns:
+    -------
+    pd.DataFrame
+        A DataFrame containing the transformed features, ready for modeling.
+    Raises:
+    ------
+    ValueError
+        If target_particle is not 'O3' or 'NO2'.
+    """
     lag_features = [
         "NO2",
         "O3",

src/helper_functions.py CHANGED Viewed

@@ -1,9 +1,26 @@
 import streamlit as st
-# Custom function to create styled metric boxes with compact layout
-def custom_metric_box(label, value):
-    st.markdown(f"""
         <div style="
             padding: 5px;
             margin-bottom: 5px;
@@ -19,17 +36,42 @@ def custom_metric_box(label, value):
                 <p style="font-size: 18px; font-weight: bold; margin: 0;">{value}</p>  <!-- Smaller metric -->
             </div>
         </div>
-    """, unsafe_allow_html=True)
-# Custom function to create pollution metric boxes with side-by-side layout for label and value
-# Custom function to create pollution metric boxes with side-by-side layout and fixed width
-def pollution_box(label, value, delta, threshold):
     # Determine if the pollution level is "Good" or "Bad"
     status = "Good" if float(value.split()[0]) < threshold else "Bad"
     status_color = "#77C124" if status == "Good" else "#E68B0A"
     # Render the pollution box
-    st.markdown(f"""
         <div style="
             background: rgba(255, 255, 255, 0.05);
             border-radius: 16px;
@@ -44,4 +86,6 @@ def pollution_box(label, value, delta, threshold):
             <p style="font-size: 36px; font-weight: bold; color: {status_color}; margin: 0;">{status}</p>  <!-- Good/Bad with color -->
             <p style="font-size: 18px; margin: 0;">{value}</p>  <!-- Smaller value where delta used to be -->
         </div>
-    """, unsafe_allow_html=True)

 import streamlit as st
+def custom_metric_box(label: str, value: str) -> None:
+    """
+    Create a styled metric box with a compact layout.
+    This function generates a styled markdown box displaying a label and its corresponding value.
+    Parameters:
+    ----------
+    label : str
+        The text label to display in the metric box.
+    value : str
+        The value to be displayed in the metric box, typically representing a metric.
+    Returns:
+    -------
+    None
+    """
+    st.markdown(
+        f"""
         <div style="
             padding: 5px;
             margin-bottom: 5px;
                 <p style="font-size: 18px; font-weight: bold; margin: 0;">{value}</p>  <!-- Smaller metric -->
             </div>
         </div>
+    """,
+        unsafe_allow_html=True,
+    )
+def pollution_box(label: str, value: str, delta: str, threshold: float) -> None:
+    """
+    Create a pollution metric box with a side-by-side layout and fixed width.
+    This function generates a styled markdown box displaying pollution level status, value, and other related information.
+    Parameters:
+    ----------
+    label : str
+        The text label representing the type of pollution or metric.
+    value : str
+        The value of the pollution metric, typically a string that can be converted to a float.
+    delta : str
+        A string representing the change in pollution level, though not currently used in the rendering.
+    threshold : float
+        The threshold value to determine if the pollution level is "Good" or "Bad".
+    Returns:
+    -------
+    None
+    """
     # Determine if the pollution level is "Good" or "Bad"
     status = "Good" if float(value.split()[0]) < threshold else "Bad"
     status_color = "#77C124" if status == "Good" else "#E68B0A"
     # Render the pollution box
+    st.markdown(
+        f"""
         <div style="
             background: rgba(255, 255, 255, 0.05);
             border-radius: 16px;
             <p style="font-size: 36px; font-weight: bold; color: {status_color}; margin: 0;">{status}</p>  <!-- Good/Bad with color -->
             <p style="font-size: 18px; margin: 0;">{value}</p>  <!-- Smaller value where delta used to be -->
         </div>
+    """,
+        unsafe_allow_html=True,
+    )

src/past_data_api_calls.py CHANGED Viewed

@@ -14,7 +14,11 @@ PAST_WEATHER_DATA_FILE = "past_weather_data.csv"
 PAST_POLLUTION_DATA_FILE = "past_pollution_data.csv"
-def update_past_weather_data():
     last_year_date = date.today() - timedelta(days=365)
     if os.path.exists(PAST_WEATHER_DATA_FILE):
@@ -51,7 +55,13 @@ def update_past_weather_data():
         sys.exit()
-def update_past_pollution_data():
     O3 = []
     NO2 = []
     particles = ["NO2", "O3"]
@@ -65,7 +75,7 @@ def update_past_pollution_data():
         last_date = pd.to_datetime(existing_data["date"]).max()
         if last_date >= pd.to_datetime(last_year_date):
             print("Data is already up to date.")
-            return
         else:
             start_date = last_date.date()
             end_date = last_year_date + timedelta(days=3)
@@ -129,7 +139,13 @@ def update_past_pollution_data():
     return NO2, O3
-def get_past_combined_data():
     update_past_weather_data()
     update_past_pollution_data()

 PAST_POLLUTION_DATA_FILE = "past_pollution_data.csv"
+def update_past_weather_data() -> None:
+    """
+    Updates past weather data.
+    The data is saved to a CSV file. If the file already exists, new data is appended.
+    """
     last_year_date = date.today() - timedelta(days=365)
     if os.path.exists(PAST_WEATHER_DATA_FILE):
         sys.exit()
+def update_past_pollution_data() -> tuple[list[float], list[float]]:
+    """
+    Updates past pollution data for NO2 and O3.
+    Returns:
+        tuple: A tuple containing two lists with NO2 and O3 average values.
+    """
     O3 = []
     NO2 = []
     particles = ["NO2", "O3"]
         last_date = pd.to_datetime(existing_data["date"]).max()
         if last_date >= pd.to_datetime(last_year_date):
             print("Data is already up to date.")
+            return [], []
         else:
             start_date = last_date.date()
             end_date = last_year_date + timedelta(days=3)
     return NO2, O3
+def get_past_combined_data() -> pd.DataFrame:
+    """
+    Retrieves and combines past weather and pollution data.
+    Returns:
+        pd.DataFrame: A DataFrame containing the combined past weather and pollution data.
+    """
     update_past_weather_data()
     update_past_pollution_data()

src/predict.py CHANGED Viewed

@@ -6,7 +6,6 @@ import pandas as pd
 import torch
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download, login
 from src.data_api_calls import (
     get_combined_data,
     update_pollution_data,
@@ -18,12 +17,18 @@ load_dotenv()
 login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
-def load_nn():
     import torch.nn as nn
     from huggingface_hub import PyTorchModelHubMixin
     class AirPollutionNet(nn.Module, PyTorchModelHubMixin):
-        def __init__(self, input_size, layers, dropout_rate):
             super(AirPollutionNet, self).__init__()
             self.layers_list = nn.ModuleList()
             in_features = input_size
@@ -36,7 +41,16 @@ def load_nn():
             self.output = nn.Linear(in_features, 3)  # Output size is 3 for next 3 days
-        def forward(self, x):
             for layer in self.layers_list:
                 x = layer(x)
             x = self.output(x)
@@ -48,7 +62,16 @@ def load_nn():
     return model
-def load_model(particle):
     repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
     if particle == "O3":
         file_name = "O3_svr_model.pkl"
@@ -60,7 +83,17 @@ def load_model(particle):
     return model
-def run_model(particle, data):
     input_data = create_features(data=data, target_particle=particle)
     model = load_model(particle)
@@ -83,7 +116,11 @@ def run_model(particle, data):
     return prediction
-def update_data_and_predictions():
     update_weather_data()
     update_pollution_data()
@@ -129,7 +166,16 @@ def update_data_and_predictions():
     combined_data.to_csv(PREDICTIONS_FILE, index=False)
-def get_data_and_predictions():
     week_data = get_combined_data()
     PREDICTIONS_FILE = "predictions_history.csv"
@@ -148,5 +194,6 @@ def get_data_and_predictions():
     return week_data, [o3_predictions], [no2_predictions]
-if __name__=="__main__":
-    update_data_and_predictions()

 import torch
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download, login
 from src.data_api_calls import (
     get_combined_data,
     update_pollution_data,
 login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
+def load_nn() -> torch.nn.Module:
+    """
+    Loads the neural network model for air pollution forecasting.
+    Returns:
+        torch.nn.Module: The loaded neural network model.
+    """
     import torch.nn as nn
     from huggingface_hub import PyTorchModelHubMixin
     class AirPollutionNet(nn.Module, PyTorchModelHubMixin):
+        def __init__(self, input_size: int, layers: list[int], dropout_rate: float):
             super(AirPollutionNet, self).__init__()
             self.layers_list = nn.ModuleList()
             in_features = input_size
             self.output = nn.Linear(in_features, 3)  # Output size is 3 for next 3 days
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            """
+            Forward pass of the neural network.
+            Args:
+                x (torch.Tensor): Input tensor.
+            Returns:
+                torch.Tensor: Output tensor after passing through the network.
+            """
             for layer in self.layers_list:
                 x = layer(x)
             x = self.output(x)
     return model
+def load_model(particle: str) -> object:
+    """
+    Loads the forecasting model based on the specified particle.
+    Args:
+        particle (str): The type of particle ("O3" or "NO2").
+    Returns:
+        object: The loaded model (either a neural network or a support vector regression model).
+    """
     repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
     if particle == "O3":
         file_name = "O3_svr_model.pkl"
     return model
+def run_model(particle: str, data: pd.DataFrame) -> list:
+    """
+    Runs the model for the specified particle and makes predictions based on the input data.
+    Args:
+        particle (str): The type of particle ("O3" or "NO2").
+        data (pd.DataFrame): The input data for making predictions.
+    Returns:
+        list: The predictions for the specified particle.
+    """
     input_data = create_features(data=data, target_particle=particle)
     model = load_model(particle)
     return prediction
+def update_data_and_predictions() -> None:
+    """
+    Updates the weather and pollution data, makes predictions for O3 and NO2,
+    and stores them in a CSV file.
+    """
     update_weather_data()
     update_pollution_data()
     combined_data.to_csv(PREDICTIONS_FILE, index=False)
+def get_data_and_predictions() -> tuple[pd.DataFrame, list, list]:
+    """
+    Retrieves combined data and today's predictions for O3 and NO2.
+    Returns:
+        tuple: A tuple containing:
+            - week_data (pd.DataFrame): The combined data for the week.
+            - list: Predictions for O3.
+            - list: Predictions for NO2.
+    """
     week_data = get_combined_data()
     PREDICTIONS_FILE = "predictions_history.csv"
     return week_data, [o3_predictions], [no2_predictions]
+if __name__ == "__main__":
+    update_data_and_predictions()