qoscope / pre_processing_resampling.py
import pandas as pd
from sqlalchemy import create_engine

# Download the zip file and extract it beforehand
CSV_PATH = 'school_geolocation_measurements/measurements.csv'


def generate_df_stats(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Generate and return a new DataFrame containing the key stats of the given DataFrame.

    :param data_frame: The input data.
    :return: A DataFrame of descriptive statistics, dtypes, size, and null counts.
    """
    stats = data_frame.describe(include='all')
    stats.loc['dtype'] = data_frame.dtypes
    stats.loc['size'] = len(data_frame)
    stats.loc['null count'] = data_frame.isnull().sum()
    return stats
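
# The helper above is defined but never called in this script; a minimal usage
# sketch (commented out; assumes `df` has been loaded, as done further below):
# print(generate_df_stats(df))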


def resample_daily_with_aggregation(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Resample the input DataFrame to daily frequency and aggregate values,
    correctly handling missing dates and school information. Note: the input
    DataFrame is modified in place, so pass a copy to avoid side effects.

    :param data_frame: The input data.
    :return: A DataFrame with resampled values.
    """
    # Ensure 'date' is datetime and set as index
    data_frame['date'] = pd.to_datetime(data_frame['date'])
    data_frame.set_index('date', inplace=True)
resampled_data = []
    # Iterate only on unique (school ID, school name) pairs
    for school_id, school_name in data_frame[
        ['school_id_giga', 'school_name']
    ].drop_duplicates().itertuples(index=False):
        school_df = data_frame[
            (data_frame['school_id_giga'] == school_id)
            & (data_frame['school_name'] == school_name)
        ].copy()
        # Resample to daily frequency and aggregate; days without any
        # measurement appear as NaN rows
resampled_school_df = school_df.resample('D').agg({
'download_speed': 'mean',
'upload_speed': 'mean',
'latency': 'mean'
})
# Add back school_id_giga and school_name
resampled_school_df['school_id_giga'] = school_id
resampled_school_df['school_name'] = school_name
        # Get the first non-null value per column for 'server_location',
        # 'country', and 'iso3_format' (bfill avoids the IndexError that
        # dropna().iloc[0] raises when no row has all three fields populated)
        non_null_values = school_df[['server_location', 'country', 'iso3_format']].bfill().iloc[0]
        resampled_school_df['server_location'] = non_null_values['server_location']
        resampled_school_df['country'] = non_null_values['country']
        resampled_school_df['iso3_format'] = non_null_values['iso3_format']
resampled_data.append(resampled_school_df)
resampled_df = pd.concat(resampled_data).reset_index()
return resampled_df
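
# Sketch of an equivalent, more idiomatic groupby-based loop for the function
# above (assumes no NaN school IDs/names, which groupby would silently drop):
# for (school_id, school_name), school_df in data_frame.groupby(
#         ['school_id_giga', 'school_name']
# ):
#     ...  # resample and aggregate as above, without repeated boolean filtering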


# Load the data
df = pd.read_csv(CSV_PATH)
df = df.drop(
columns=[
'school_id_govt', 'detected_isp', 'timestamp', 'detected_isp_asn', 'app_version', 'source'
]
)
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['school_id_giga', 'date'], inplace=True)

# Get the value counts for each school ID
value_counts = df['school_id_giga'].value_counts()
print(value_counts)

# Remove rows whose school ID occurs fewer than `min_count_threshold` times
min_count_threshold = 120
df_filtered = df[
df['school_id_giga'].isin(value_counts[value_counts >= min_count_threshold].index)
]
print(f'{df_filtered.shape=}')
print(df_filtered['school_id_giga'].value_counts())

# Resample data to daily frequency (pass a copy, since the function mutates its input)
df_daily = resample_daily_with_aggregation(df_filtered.copy())
print(df_daily.head())

# Impute missing values with the global column median (simple, generally robust
# to outliers, often works)
df_daily['download_speed'] = df_daily['download_speed'].fillna(df_daily['download_speed'].median())
df_daily['upload_speed'] = df_daily['upload_speed'].fillna(df_daily['upload_speed'].median())
df_daily['latency'] = df_daily['latency'].fillna(df_daily['latency'].median())
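
# If the global medians prove too coarse, a per-school median is a possible
# refinement (illustrative sketch, not part of the original pipeline):
# for col in ('download_speed', 'upload_speed', 'latency'):
#     df_daily[col] = df_daily.groupby('school_id_giga')[col].transform(
#         lambda s: s.fillna(s.median())
#     )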

# Export to SQLite DB and CSV; if_exists='replace' keeps the script re-runnable,
# and index=False skips the redundant RangeIndex column
engine = create_engine('sqlite:///resampled_daily_avg.sqlite', echo=False)
df_daily.to_sql(name='school_measurements', con=engine, if_exists='replace', index=False)
df_daily.to_csv('resampled_daily_avg.csv', index=False)
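
# Optional sanity check (illustrative, not part of the original pipeline):
# read the exported table back from SQLite
# print(pd.read_sql('SELECT COUNT(*) AS n FROM school_measurements', con=engine))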