barunsaha committed on
Commit
17d3d6a
·
1 Parent(s): 10581d3

Add data pre-processing code

Browse files
Files changed (1) hide show
  1. pre_processing_resampling.py +105 -0
pre_processing_resampling.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sqlalchemy import create_engine
3
+
4
+
# Path to the measurements CSV. Download the dataset zip file and extract it
# into the working directory beforehand.
CSV_PATH = 'school_geolocation_measurements/measurements.csv'
7
+
8
+
9
def generate_df_stats(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Generate and return a new df containing the key stats of the given df.

    The result is ``data_frame.describe(include='all')`` augmented with three
    extra rows: ``dtype`` (each column's dtype), ``size`` (row count of the
    input), and ``null count`` (number of nulls per column).

    :param data_frame: The input data.
    :return: A new DataFrame of summary statistics, one column per input column.
    """

    # Use a descriptive name: '_' conventionally marks a throwaway value,
    # but this frame is the function's actual result.
    stats = data_frame.describe(include='all')
    stats.loc['dtype'] = data_frame.dtypes
    stats.loc['size'] = len(data_frame)
    stats.loc['null count'] = data_frame.isnull().sum()

    return stats
20
+
21
+
22
def resample_daily_with_aggregation(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Resample the input DataFrame to daily frequency and aggregate values,
    correctly handling missing dates and school information.

    Days with no measurements for a school appear in the output with NaN
    speed/latency values. The input frame is NOT modified.

    :param data_frame: The input data; must contain the columns 'date',
        'school_id_giga', 'school_name', 'download_speed', 'upload_speed',
        'latency', 'server_location', 'country', and 'iso3_format'.
    :return: A DataFrame with daily-resampled values.
    """

    # Work on a copy so the caller's frame is not mutated by the in-place
    # date conversion and index assignment below (the original version
    # silently modified its argument).
    data_frame = data_frame.copy()

    # Ensure 'date' is datetime and set as index
    data_frame['date'] = pd.to_datetime(data_frame['date'])
    data_frame.set_index('date', inplace=True)

    resampled_data = []

    # Iterate only on unique (school ID, school name) pairs
    for school_id, school_name in data_frame[
        ['school_id_giga', 'school_name']
    ].drop_duplicates().itertuples(index=False):
        school_df = data_frame[
            (data_frame['school_id_giga'] == school_id)
            & (data_frame['school_name'] == school_name)
        ]

        # Resample to daily frequency and aggregate with per-day means
        resampled_school_df = school_df.resample('D').agg({
            'download_speed': 'mean',
            'upload_speed': 'mean',
            'latency': 'mean'
        })

        # Add back school_id_giga and school_name
        resampled_school_df['school_id_giga'] = school_id
        resampled_school_df['school_name'] = school_name

        # Use the first row where all three metadata columns are non-null.
        # If no such row exists (the original `.dropna().iloc[0]` raised
        # IndexError here), fall back to each column's first non-null value
        # independently, leaving None where a column is entirely null.
        meta = school_df[['server_location', 'country', 'iso3_format']]
        complete_rows = meta.dropna()
        if not complete_rows.empty:
            non_null_values = complete_rows.iloc[0]
        else:
            non_null_values = meta.apply(
                lambda col: col.dropna().iloc[0] if col.notna().any() else None
            )
        resampled_school_df['server_location'] = non_null_values['server_location']
        resampled_school_df['country'] = non_null_values['country']
        resampled_school_df['iso3_format'] = non_null_values['iso3_format']

        resampled_data.append(resampled_school_df)

    resampled_df = pd.concat(resampled_data).reset_index()

    return resampled_df
67
+
68
+
69
# ---------------------------------------------------------------------------
# Script body: load, clean, filter, resample, impute, and export the data.
# ---------------------------------------------------------------------------

# Load the data and drop columns not used in the analysis
df = pd.read_csv(CSV_PATH)
df = df.drop(
    columns=[
        'school_id_govt', 'detected_isp', 'timestamp', 'detected_isp_asn', 'app_version', 'source'
    ]
)
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['school_id_giga', 'date'], inplace=True)


# Get the value counts for each school ID
value_counts = df['school_id_giga'].value_counts()
print(value_counts)

# Remove rows with school IDs that are rare (fewer than 120 measurements),
# since they carry too little data to resample meaningfully
min_count_threshold = 120
df_filtered = df[
    df['school_id_giga'].isin(value_counts[value_counts >= min_count_threshold].index)
]
print(f'{df_filtered.shape=}')
print(df_filtered['school_id_giga'].value_counts())


# Resample data to daily frequency
df_daily = resample_daily_with_aggregation(df_filtered.copy())
print(df_daily.head())

# Imputation with median (simple, generally robust to outliers, often works)
df_daily['download_speed'] = df_daily['download_speed'].fillna(df_daily['download_speed'].median())
df_daily['upload_speed'] = df_daily['upload_speed'].fillna(df_daily['upload_speed'].median())
df_daily['latency'] = df_daily['latency'].fillna(df_daily['latency'].median())

# Export to CSV and SQLite db. Pass index=False to to_sql as well: after
# reset_index() the frame's index is a meaningless RangeIndex, and the
# original call persisted it as an extra column, inconsistently with to_csv.
engine = create_engine('sqlite:///resampled_daily_avg.sqlite', echo=False)
df_daily.to_sql(name='school_measurements', con=engine, index=False)
df_daily.to_csv('resampled_daily_avg.csv', index=False)