from __future__ import print_function import os import json import time import sys import pandas as pd import numpy as np import seaborn as sn import gradio as gr from sklearn.compose import ColumnTransformer from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.pipeline import Pipeline from sklearn.model_selection import train_test_split from sklearn.neighbors import NearestNeighbors import spotipy from spotipy.oauth2 import SpotifyClientCredentials client_id = os.getenv("SPOTIPY_CLIENT_ID") client_secret = os.getenv("SPOTIPY_CLIENT_SECRET") sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)) df = pd.read_csv('spotify_data.csv') df = df.drop(columns=['Unnamed: 0', "Unnamed: 0.1", "pos", "artist_uri", "album_uri", "duration_ms_x", "album_name", "name", "type", "id", "track_href", "analysis_url", "duration_ms_y", "time_signature", "artist_pop", "track_pop"]) df.drop_duplicates(subset=['uri'], inplace=True) df.reset_index(drop=True, inplace=True) df_num = df.select_dtypes(include = ['float64', 'int64']) numeric_cols = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence'] categorical_cols = ['key', 'mode'] # Create the preprocessing pipeline preprocessing_pipeline = ColumnTransformer( transformers=[ ('num', StandardScaler(), numeric_cols), ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols) ]) # Apply the preprocessing pipeline to your DataFrame df_processed = preprocessing_pipeline.fit_transform(df_num) num_cols_transformed = numeric_cols cat_cols_transformed = preprocessing_pipeline.named_transformers_['cat'].get_feature_names_out(categorical_cols) # Combine the transformed column names all_cols_transformed = num_cols_transformed + cat_cols_transformed.tolist() # Convert the processed NumPy array back to a DataFrame df_processed = pd.DataFrame(df_processed, columns=all_cols_transformed) def transform_query(track_uri): audio_features = sp.audio_features(track_uri)[0] track_data = [] track_dict = { 'acousticness': audio_features['acousticness'], 'danceability': audio_features['danceability'], 'energy': audio_features['energy'], 'instrumentalness': audio_features['instrumentalness'], 'liveness': audio_features['liveness'], 'loudness': audio_features['loudness'], 'speechiness': audio_features['speechiness'], 'tempo': audio_features['tempo'], 'valence': audio_features['valence'], 'key': audio_features['key'], 'mode': audio_features['mode'] } track_data.append(track_dict) query_data = pd.DataFrame(track_data) return query_data knn_model = NearestNeighbors(n_neighbors=10, algorithm='auto', metric='euclidean') knn_model.fit(df_processed) # I'm using all the data for KNN # Function to find similar songs to the input URI def find_similar_songs(track_uri): query_data = transform_query(track_uri) # Scale the query data using the same scaler query_data_scaled = preprocessing_pipeline.transform(query_data) query_data_scaled_df = pd.DataFrame(query_data_scaled, columns=all_cols_transformed) # Find the most similar songs using the KNN model distances, indices = knn_model.kneighbors(query_data_scaled_df, n_neighbors=10) # Retrieve the Artist Name, Song Name, and Track URI of the most similar songs similar_songs = [] for index in indices[0]: artist_name = df.iloc[index]['artist_name'] song_name = df.iloc[index]['track_name'] similar_uri = df.iloc[index]['uri'] track_id = similar_uri.split(":")[-1] full_url = f"https://open.spotify.com/track/{track_id}" similar_songs.append((artist_name, song_name, full_url)) return similar_songs similar_songs = find_similar_songs('https://open.spotify.com/track/6rDaCGqcQB1urhpCrrD599?si=2ac7add2ea054ab2') def format_output(similar_songs): output = [] for song in similar_songs: output.append({"Artist Name": song[0], "Song Name": song[1], "Spotify Track URL": song[2]}) return pd.DataFrame(output) # Create the Gradio interface iface = gr.Interface( fn=find_similar_songs, # Your find_similar_songs function inputs=gr.Textbox(label="Enter Spotify Track URL"), outputs=gr.Dataframe(headers=["Artist Name", "Song Name", "Spotify Track URL"]), live=True ) iface.launch("share=True")