Add data pre-processing code
pre_processing_resampling.py
ADDED
@@ -0,0 +1,105 @@
import pandas as pd
from sqlalchemy import create_engine


# Download the zip file and extract it beforehand
CSV_PATH = 'school_geolocation_measurements/measurements.csv'


def generate_df_stats(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Generate and return a new df containing the key stats of the given df.
    """
    stats = data_frame.describe(include='all')
    stats.loc['dtype'] = data_frame.dtypes
    stats.loc['size'] = len(data_frame)
    stats.loc['null count'] = data_frame.isnull().sum()

    return stats
def resample_daily_with_aggregation(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Resample the input DataFrame to daily frequency and aggregate values,
    correctly handling missing dates and school information.

    :param data_frame: The input data.
    :return: A DataFrame with resampled values.
    """
    # Ensure 'date' is datetime and set as index
    data_frame['date'] = pd.to_datetime(data_frame['date'])
    data_frame.set_index('date', inplace=True)

    resampled_data = []

    # Iterate only on unique (school ID, school name) pairs
    for school_id, school_name in data_frame[
        ['school_id_giga', 'school_name']
    ].drop_duplicates().itertuples(index=False):
        school_df = data_frame[
            (data_frame['school_id_giga'] == school_id) & (data_frame['school_name'] == school_name)
        ].copy()

        # Resample to daily frequency and aggregate
        resampled_school_df = school_df.resample('D').agg({
            'download_speed': 'mean',
            'upload_speed': 'mean',
            'latency': 'mean'
        })

        # Add back school_id_giga and school_name
        resampled_school_df['school_id_giga'] = school_id
        resampled_school_df['school_name'] = school_name

        # Take the first row where 'server_location', 'country', and 'iso3_format'
        # are all non-null; guard against schools where no row has all three set,
        # so that iloc[0] on an empty frame cannot raise
        meta = school_df[['server_location', 'country', 'iso3_format']].dropna()
        if not meta.empty:
            non_null_values = meta.iloc[0]
            resampled_school_df['server_location'] = non_null_values['server_location']
            resampled_school_df['country'] = non_null_values['country']
            resampled_school_df['iso3_format'] = non_null_values['iso3_format']

        resampled_data.append(resampled_school_df)

    resampled_df = pd.concat(resampled_data).reset_index()

    return resampled_df
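# Note: a vectorized alternative to the per-school loop in
# resample_daily_with_aggregation is groupby + resample. A sketch only, not
# used by this script, since it does not re-attach the metadata columns:
#
#   daily = (data_frame
#            .groupby(['school_id_giga', 'school_name'])
#            .resample('D')[['download_speed', 'upload_speed', 'latency']]
#            .mean()
#            .reset_index())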
# Load the data
df = pd.read_csv(CSV_PATH)
df = df.drop(
    columns=[
        'school_id_govt', 'detected_isp', 'timestamp', 'detected_isp_asn', 'app_version', 'source'
    ]
)
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['school_id_giga', 'date'], inplace=True)
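# Quick overview of the raw data using the helper defined above; the summary
# adds dtype, row count, and null count to the usual describe() output
print(generate_df_stats(df))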
# Get the value counts for each school ID
value_counts = df['school_id_giga'].value_counts()
print(value_counts)

# Remove rows with school IDs that are rare
min_count_threshold = 120
df_filtered = df[
    df['school_id_giga'].isin(value_counts[value_counts >= min_count_threshold].index)
]
print(f'{df_filtered.shape=}')
print(df_filtered['school_id_giga'].value_counts())
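# Sanity check: every school that survives the filter should have at least
# min_count_threshold raw measurements
assert df_filtered['school_id_giga'].value_counts().min() >= min_count_threshold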
# Resample data to daily frequency
df_daily = resample_daily_with_aggregation(df_filtered.copy())
print(df_daily.head())
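# The daily resample inserts rows for days without measurements, so report
# how many gap values the imputation below has to fill
print(df_daily[['download_speed', 'upload_speed', 'latency']].isnull().sum())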
# Imputation with the global median: simple, robust to outliers, and a
# reasonable baseline for filling the gap days
df_daily['download_speed'] = df_daily['download_speed'].fillna(df_daily['download_speed'].median())
df_daily['upload_speed'] = df_daily['upload_speed'].fillna(df_daily['upload_speed'].median())
df_daily['latency'] = df_daily['latency'].fillna(df_daily['latency'].median())
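# A per-school median would respect differences between schools; a possible
# variant (a sketch of an alternative, not applied here):
#
#   for col in ['download_speed', 'upload_speed', 'latency']:
#       df_daily[col] = df_daily[col].fillna(
#           df_daily.groupby('school_id_giga')[col].transform('median')
#       )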
# Export to CSV and SQLite db
engine = create_engine('sqlite:///resampled_daily_avg.sqlite', echo=False)
# index=False keeps the auto-generated index out of the table;
# if_exists='replace' lets the script be re-run without a table-exists error
df_daily.to_sql(name='school_measurements', con=engine, index=False, if_exists='replace')
df_daily.to_csv('resampled_daily_avg.csv', index=False)
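# Read-back check for the SQLite export (a minimal sketch)
print(pd.read_sql('SELECT COUNT(*) AS n FROM school_measurements', con=engine))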