|
import pandas as pd |
|
from sqlalchemy import create_engine |
|
|
|
|
|
|
|
CSV_PATH = 'school_geolocation_measurements/measurements.csv' |
|
|
|
|
|
def generate_df_stats(data_frame: pd.DataFrame) -> pd.DataFrame: |
|
""" |
|
Generate and return a new df containing the keys stats of the given df. |
|
""" |
|
|
|
_ = data_frame.describe(include='all') |
|
_.loc['dtype'] = data_frame.dtypes |
|
_.loc['size'] = len(data_frame) |
|
_.loc['null count'] = data_frame.isnull().sum() |
|
|
|
return _ |
|
|
|
|
|
def resample_daily_with_aggregation(data_frame: pd.DataFrame) -> pd.DataFrame: |
|
""" |
|
Resample the input DataFrame to daily frequency and aggregate values, |
|
correctly handling missing dates and school information. |
|
|
|
:param data_frame: The input data. |
|
:return: A DataFrame with resampled values. |
|
""" |
|
|
|
|
|
data_frame['date'] = pd.to_datetime(data_frame['date']) |
|
data_frame.set_index('date', inplace=True) |
|
|
|
resampled_data = [] |
|
|
|
|
|
for school_id, school_name in data_frame[ |
|
['school_id_giga', 'school_name'] |
|
].drop_duplicates().itertuples(index=False): |
|
school_df = data_frame[ |
|
(data_frame['school_id_giga'] == school_id) & (data_frame['school_name'] == school_name) |
|
].copy() |
|
|
|
|
|
resampled_school_df = school_df.resample('D').agg({ |
|
'download_speed': 'mean', |
|
'upload_speed': 'mean', |
|
'latency': 'mean' |
|
}) |
|
|
|
|
|
resampled_school_df['school_id_giga'] = school_id |
|
resampled_school_df['school_name'] = school_name |
|
|
|
|
|
non_null_values = school_df[['server_location', 'country', 'iso3_format']].dropna().iloc[0] |
|
resampled_school_df['server_location'] = non_null_values['server_location'] |
|
resampled_school_df['country'] = non_null_values['country'] |
|
resampled_school_df['iso3_format'] = non_null_values['iso3_format'] |
|
|
|
resampled_data.append(resampled_school_df) |
|
|
|
resampled_df = pd.concat(resampled_data).reset_index() |
|
|
|
return resampled_df |
|
|
|
|
|
|
|
df = pd.read_csv(CSV_PATH) |
|
df = df.drop( |
|
columns=[ |
|
'school_id_govt', 'detected_isp', 'timestamp', 'detected_isp_asn', 'app_version', 'source' |
|
] |
|
) |
|
df['date'] = pd.to_datetime(df['date']) |
|
df.sort_values(by=['school_id_giga', 'date'], inplace=True) |
|
|
|
|
|
|
|
value_counts = df['school_id_giga'].value_counts() |
|
print(value_counts) |
|
|
|
|
|
min_count_threshold = 120 |
|
df_filtered = df[ |
|
df['school_id_giga'].isin(value_counts[value_counts >= min_count_threshold].index) |
|
] |
|
print(f'{df_filtered.shape=}') |
|
print(df_filtered['school_id_giga'].value_counts()) |
|
|
|
|
|
|
|
df_daily = resample_daily_with_aggregation(df_filtered.copy()) |
|
print(df_daily.head()) |
|
|
|
|
|
df_daily['download_speed'] = df_daily['download_speed'].fillna(df_daily['download_speed'].median()) |
|
df_daily['upload_speed'] = df_daily['upload_speed'].fillna(df_daily['upload_speed'].median()) |
|
df_daily['latency'] = df_daily['latency'].fillna(df_daily['latency'].median()) |
|
|
|
|
|
engine = create_engine('sqlite:///resampled_daily_avg.sqlite', echo=False) |
|
df_daily.to_sql(name='school_measurements', con=engine) |
|
df_daily.to_csv('resampled_daily_avg.csv', index=False) |
|
|