File size: 3,645 Bytes
17d3d6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pandas as pd
from sqlalchemy import create_engine


# Download the zip file and extract it beforehand.
# NOTE(review): relative path — the script must be run from the directory
# containing the extracted 'school_geolocation_measurements' folder.
CSV_PATH = 'school_geolocation_measurements/measurements.csv'


def generate_df_stats(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Generate and return a new DataFrame containing the key stats of the given
    DataFrame.

    :param data_frame: The input data.
    :return: The ``describe(include='all')`` summary augmented with three extra
        rows: 'dtype', 'size' (row count), and 'null count' per column.
    """

    # include='all' covers non-numeric columns too (count/unique/top/freq).
    # Named 'stats' rather than '_': the underscore conventionally marks a
    # throwaway value, and this one is built up and returned.
    stats = data_frame.describe(include='all')
    stats.loc['dtype'] = data_frame.dtypes
    stats.loc['size'] = len(data_frame)
    stats.loc['null count'] = data_frame.isnull().sum()

    return stats


def resample_daily_with_aggregation(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Resample the input DataFrame to daily frequency and aggregate values,
    correctly handling missing dates and school information.

    :param data_frame: The input data. Expected to contain the columns 'date',
        'school_id_giga', 'school_name', 'download_speed', 'upload_speed',
        'latency', 'server_location', 'country', and 'iso3_format'.
    :return: A DataFrame with daily-resampled values; days with no
        measurements for a school carry NaN speed/latency values.
    """

    # Work on a copy: the previous version set the index in place and so
    # mutated the caller's DataFrame as a side effect.
    data_frame = data_frame.copy()

    # Ensure 'date' is datetime and set as index
    data_frame['date'] = pd.to_datetime(data_frame['date'])
    data_frame.set_index('date', inplace=True)

    resampled_data = []

    # One pass per school via groupby; this replaces the original
    # O(rows x schools) boolean filtering inside the loop. sort=False keeps
    # groups in first-appearance order, matching the drop_duplicates() walk.
    for (school_id, school_name), school_df in data_frame.groupby(
        ['school_id_giga', 'school_name'], sort=False
    ):
        # Resample to daily frequency and aggregate
        resampled_school_df = school_df.resample('D').agg({
            'download_speed': 'mean',
            'upload_speed': 'mean',
            'latency': 'mean'
        })

        # Add back school_id_giga and school_name
        resampled_school_df['school_id_giga'] = school_id
        resampled_school_df['school_name'] = school_name

        # Propagate the first row where all three metadata columns are
        # non-null. Guard the all-null case, which previously raised
        # IndexError on .iloc[0].
        non_null = school_df[['server_location', 'country', 'iso3_format']].dropna()
        if non_null.empty:
            resampled_school_df[['server_location', 'country', 'iso3_format']] = None
        else:
            first = non_null.iloc[0]
            resampled_school_df['server_location'] = first['server_location']
            resampled_school_df['country'] = first['country']
            resampled_school_df['iso3_format'] = first['iso3_format']

        resampled_data.append(resampled_school_df)

    resampled_df = pd.concat(resampled_data).reset_index()

    return resampled_df


# Load the data
df = pd.read_csv(CSV_PATH)
# Drop identifier/metadata columns not needed for the daily time series.
df = df.drop(
    columns=[
        'school_id_govt', 'detected_isp', 'timestamp', 'detected_isp_asn', 'app_version', 'source'
    ]
)
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['school_id_giga', 'date'], inplace=True)


# Get the value counts for each school ID
value_counts = df['school_id_giga'].value_counts()
print(value_counts)

# Remove rows with school IDs that are rare (too few measurements to yield a
# meaningful daily series).
min_count_threshold = 120
df_filtered = df[
    df['school_id_giga'].isin(value_counts[value_counts >= min_count_threshold].index)
]
print(f'{df_filtered.shape=}')
print(df_filtered['school_id_giga'].value_counts())


# Resample data to daily frequency
df_daily = resample_daily_with_aggregation(df_filtered.copy())
print(df_daily.head())

# Imputation with median (simple, generally robust to outliers, often works)
df_daily['download_speed'] = df_daily['download_speed'].fillna(df_daily['download_speed'].median())
df_daily['upload_speed'] = df_daily['upload_speed'].fillna(df_daily['upload_speed'].median())
df_daily['latency'] = df_daily['latency'].fillna(df_daily['latency'].median())

# Export to CSV and SQLite db.
# index=False keeps the SQL table consistent with the CSV export (previously
# the pandas RangeIndex was written as an extra column); if_exists='replace'
# lets the script be re-run without to_sql raising ValueError on the existing
# table.
engine = create_engine('sqlite:///resampled_daily_avg.sqlite', echo=False)
df_daily.to_sql(name='school_measurements', con=engine, if_exists='replace', index=False)
df_daily.to_csv('resampled_daily_avg.csv', index=False)