Add data pre-processing code
pre_processing_resampling.py
ADDED
@@ -0,0 +1,105 @@
import pandas as pd
from sqlalchemy import create_engine


# Download the zip file and extract it beforehand
CSV_PATH = 'school_geolocation_measurements/measurements.csv'


def generate_df_stats(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Generate and return a new df containing the key stats of the given df.
    """
    stats = data_frame.describe(include='all')
    stats.loc['dtype'] = data_frame.dtypes
    stats.loc['size'] = len(data_frame)
    stats.loc['null count'] = data_frame.isnull().sum()

    return stats
def resample_daily_with_aggregation(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Resample the input DataFrame to daily frequency and aggregate values,
    correctly handling missing dates and school information.

    :param data_frame: The input data.
    :return: A DataFrame with resampled values.
    """
    # Ensure 'date' is datetime and set as index
    data_frame['date'] = pd.to_datetime(data_frame['date'])
    data_frame.set_index('date', inplace=True)

    resampled_data = []

    # Iterate only on unique (school ID, school name) pairs
    for school_id, school_name in data_frame[
        ['school_id_giga', 'school_name']
    ].drop_duplicates().itertuples(index=False):
        school_df = data_frame[
            (data_frame['school_id_giga'] == school_id) & (data_frame['school_name'] == school_name)
        ].copy()

        # Resample to daily frequency and aggregate
        resampled_school_df = school_df.resample('D').agg({
            'download_speed': 'mean',
            'upload_speed': 'mean',
            'latency': 'mean'
        })

        # Add back school_id_giga and school_name
        resampled_school_df['school_id_giga'] = school_id
        resampled_school_df['school_name'] = school_name

        # Take the first row where 'server_location', 'country', and 'iso3_format'
        # are all non-null; guard against schools where no row has all three set,
        # so that iloc[0] on an empty frame cannot raise
        meta = school_df[['server_location', 'country', 'iso3_format']].dropna()
        if not meta.empty:
            non_null_values = meta.iloc[0]
            resampled_school_df['server_location'] = non_null_values['server_location']
            resampled_school_df['country'] = non_null_values['country']
            resampled_school_df['iso3_format'] = non_null_values['iso3_format']

        resampled_data.append(resampled_school_df)

    resampled_df = pd.concat(resampled_data).reset_index()

    return resampled_df
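# Note: a vectorized alternative to the per-school loop in
# resample_daily_with_aggregation is groupby + resample. A sketch only, not
# used by this script, since it does not re-attach the metadata columns:
#
#   daily = (data_frame
#            .groupby(['school_id_giga', 'school_name'])
#            .resample('D')[['download_speed', 'upload_speed', 'latency']]
#            .mean()
#            .reset_index())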
# Load the data
df = pd.read_csv(CSV_PATH)
df = df.drop(
    columns=[
        'school_id_govt', 'detected_isp', 'timestamp', 'detected_isp_asn', 'app_version', 'source'
    ]
)
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['school_id_giga', 'date'], inplace=True)
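# Quick overview of the raw data using the helper defined above; the summary
# adds dtype, row count, and null count to the usual describe() output
print(generate_df_stats(df))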
# Get the value counts for each school ID
value_counts = df['school_id_giga'].value_counts()
print(value_counts)

# Remove rows with school IDs that are rare
min_count_threshold = 120
df_filtered = df[
    df['school_id_giga'].isin(value_counts[value_counts >= min_count_threshold].index)
]
print(f'{df_filtered.shape=}')
print(df_filtered['school_id_giga'].value_counts())
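# Sanity check: every school that survives the filter should have at least
# min_count_threshold raw measurements
assert df_filtered['school_id_giga'].value_counts().min() >= min_count_threshold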
# Resample data to daily frequency
df_daily = resample_daily_with_aggregation(df_filtered.copy())
print(df_daily.head())
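# The daily resample inserts rows for days without measurements, so report
# how many gap values the imputation below has to fill
print(df_daily[['download_speed', 'upload_speed', 'latency']].isnull().sum())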
# Imputation with the global median: simple, robust to outliers, and a
# reasonable baseline for filling the gap days
df_daily['download_speed'] = df_daily['download_speed'].fillna(df_daily['download_speed'].median())
df_daily['upload_speed'] = df_daily['upload_speed'].fillna(df_daily['upload_speed'].median())
df_daily['latency'] = df_daily['latency'].fillna(df_daily['latency'].median())
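# A per-school median would respect differences between schools; a possible
# variant (a sketch of an alternative, not applied here):
#
#   for col in ['download_speed', 'upload_speed', 'latency']:
#       df_daily[col] = df_daily[col].fillna(
#           df_daily.groupby('school_id_giga')[col].transform('median')
#       )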
# Export to CSV and SQLite db
engine = create_engine('sqlite:///resampled_daily_avg.sqlite', echo=False)
# index=False keeps the auto-generated index out of the table;
# if_exists='replace' lets the script be re-run without a table-exists error
df_daily.to_sql(name='school_measurements', con=engine, index=False, if_exists='replace')
df_daily.to_csv('resampled_daily_avg.csv', index=False)
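# Read-back check for the SQLite export (a minimal sketch)
print(pd.read_sql('SELECT COUNT(*) AS n FROM school_measurements', con=engine))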