qoscope / pre_processing_resampling.py
import pandas as pd
from sqlalchemy import create_engine

# Download the zip file and extract it beforehand
CSV_PATH = 'school_geolocation_measurements/measurements.csv'


def generate_df_stats(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Generate and return a new DataFrame containing the key stats of the given DataFrame.

    :param data_frame: The input data.
    :return: A DataFrame of descriptive statistics, dtypes, size, and null counts.
    """
    stats = data_frame.describe(include='all')
    stats.loc['dtype'] = data_frame.dtypes
    stats.loc['size'] = len(data_frame)
    stats.loc['null count'] = data_frame.isnull().sum()
    return stats
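
# The helper above is defined but never called in this script; a minimal usage
# sketch (commented out; assumes `df` has been loaded, as done further below):
# print(generate_df_stats(df))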


def resample_daily_with_aggregation(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Resample the input DataFrame to daily frequency and aggregate values,
    correctly handling missing dates and school information. Note: the input
    DataFrame is modified in place, so pass a copy to avoid side effects.

    :param data_frame: The input data.
    :return: A DataFrame with resampled values.
    """
    # Ensure 'date' is datetime and set as index
    data_frame['date'] = pd.to_datetime(data_frame['date'])
    data_frame.set_index('date', inplace=True)
resampled_data = []
    # Iterate only on unique (school ID, school name) pairs
    for school_id, school_name in data_frame[
        ['school_id_giga', 'school_name']
    ].drop_duplicates().itertuples(index=False):
        school_df = data_frame[
            (data_frame['school_id_giga'] == school_id)
            & (data_frame['school_name'] == school_name)
        ].copy()
        # Resample to daily frequency and aggregate; days without any
        # measurement appear as NaN rows
resampled_school_df = school_df.resample('D').agg({
'download_speed': 'mean',
'upload_speed': 'mean',
'latency': 'mean'
})
# Add back school_id_giga and school_name
resampled_school_df['school_id_giga'] = school_id
resampled_school_df['school_name'] = school_name
        # Get the first non-null value per column for 'server_location',
        # 'country', and 'iso3_format' (bfill avoids the IndexError that
        # dropna().iloc[0] raises when no row has all three fields populated)
        non_null_values = school_df[['server_location', 'country', 'iso3_format']].bfill().iloc[0]
        resampled_school_df['server_location'] = non_null_values['server_location']
        resampled_school_df['country'] = non_null_values['country']
        resampled_school_df['iso3_format'] = non_null_values['iso3_format']
resampled_data.append(resampled_school_df)
resampled_df = pd.concat(resampled_data).reset_index()
return resampled_df
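
# Sketch of an equivalent, more idiomatic groupby-based loop for the function
# above (assumes no NaN school IDs/names, which groupby would silently drop):
# for (school_id, school_name), school_df in data_frame.groupby(
#         ['school_id_giga', 'school_name']
# ):
#     ...  # resample and aggregate as above, without repeated boolean filtering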


# Load the data
df = pd.read_csv(CSV_PATH)
df = df.drop(
columns=[
'school_id_govt', 'detected_isp', 'timestamp', 'detected_isp_asn', 'app_version', 'source'
]
)
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['school_id_giga', 'date'], inplace=True)

# Get the value counts for each school ID
value_counts = df['school_id_giga'].value_counts()
print(value_counts)

# Remove rows whose school ID occurs fewer than `min_count_threshold` times
min_count_threshold = 120
df_filtered = df[
df['school_id_giga'].isin(value_counts[value_counts >= min_count_threshold].index)
]
print(f'{df_filtered.shape=}')
print(df_filtered['school_id_giga'].value_counts())

# Resample data to daily frequency (pass a copy, since the function mutates its input)
df_daily = resample_daily_with_aggregation(df_filtered.copy())
print(df_daily.head())

# Impute missing values with the global column median (simple, generally robust
# to outliers, often works)
df_daily['download_speed'] = df_daily['download_speed'].fillna(df_daily['download_speed'].median())
df_daily['upload_speed'] = df_daily['upload_speed'].fillna(df_daily['upload_speed'].median())
df_daily['latency'] = df_daily['latency'].fillna(df_daily['latency'].median())
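
# If the global medians prove too coarse, a per-school median is a possible
# refinement (illustrative sketch, not part of the original pipeline):
# for col in ('download_speed', 'upload_speed', 'latency'):
#     df_daily[col] = df_daily.groupby('school_id_giga')[col].transform(
#         lambda s: s.fillna(s.median())
#     )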

# Export to SQLite DB and CSV; if_exists='replace' keeps the script re-runnable,
# and index=False skips the redundant RangeIndex column
engine = create_engine('sqlite:///resampled_daily_avg.sqlite', echo=False)
df_daily.to_sql(name='school_measurements', con=engine, if_exists='replace', index=False)
df_daily.to_csv('resampled_daily_avg.csv', index=False)
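
# Optional sanity check (illustrative, not part of the original pipeline):
# read the exported table back from SQLite
# print(pd.read_sql('SELECT COUNT(*) AS n FROM school_measurements', con=engine))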