Spaces:

debisoft
/

junction

Runtime error

File size: 9,086 Bytes

import gradio as gr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid",
			{"grid.color": ".6",
			"grid.linestyle": ":"})
import category_encoders as ce
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# BiteBuddies AI/ML Stack
# Tinder-style Recommender System/Collaborative Filtering
# Testing: curl -X POST https://debisoft-junction.hf.space/api/predict -H 'Content-Type: application/json' -d '{"data": [<name>, <body_profile_type>]}'
#  body_profile_type => [0-4]
# Eg. curl -X POST https://debisoft-junction.hf.space/api/predict -H 'Content-Type: application/json' -d '{"data": ["David", 4]}'
# Adapted from https://www.geeksforgeeks.org/predict-tinder-matches-with-machine-learning/
# TODO: Implement MBTI features

def predict_match(name, body_profile_type):
    """Build a fixed sample profile and return the recommended username.

    Args:
        name: Accepted for the API signature; not used in the computation.
        body_profile_type: Value converted with ``float()`` and stored as the
            ``body_profile`` feature (the header comment documents 0-4).

    Returns:
        Whatever ``recommendOne`` returns for the assembled feature row.

    Raises:
        ValueError: If ``body_profile_type`` cannot be converted to float.
    """
    # TODO: Add more modifiable parameters
    sample_bio = 'I am a foodie and traveller. But sometimes like to sit alone in a corner and read a good fiction.'

    # Key order matters: it becomes the column order of the feature row.
    profile = {
        # Numerical columns
        'age': 22.0,
        'status': 1.0,
        'sex': 0.0,
        'height': 60.0,
        'smokes': 1.0,
        'new_languages': 2.0,
        'body_profile': float(body_profile_type),
        'education_level': 4.0,
        'dropped_out': 0.0,
        'location_preference': 2.0,
        'num_languages': 2.0,
        'drinks_encoded': 0.0,
        'drugs_encoded': 0.0,
        # One-hot / encoded categorical columns
        'location_new_york': 0.0,
        'location_northern_california': 1.0,
        'location_southern_california': 0.0,
        'job_encoded': 4.0,
        'pets_0': 1.0,
        'pets_1': 1.0,
        'pets_2': 1.0,
        'pets_3': 1.0,
    }

    # Vectorise the bio with the already-fitted TF-IDF model.
    bio_features = pd.DataFrame(
        tfidf.transform([sample_bio]).toarray(),
        columns=feature_names,
    )

    # Single-row frame of the non-text features, followed by the bio terms.
    numeric_features = pd.DataFrame(profile, index=[0])
    feature_row = pd.concat([numeric_features, bio_features], axis=1)

    return recommendOne(feature_row)

def greet_test(name, str2):
    """Return a greeting string that embeds ``name`` and ``str2``.

    Both arguments must be strings; anything else raises ``TypeError``.
    """
    pieces = ("Hello ", name, "!!", " str2=", str2)
    return "".join(pieces)

# Model bootstrap: load the raw profiles with pandas, then encode the
# categorical columns step by step.
tinder_df = pd.read_csv("tinder_data.csv")

# The language column is a comma-separated list, so the number of
# languages is the comma count plus one.
tinder_df['num_languages'] = tinder_df['language'].str.count(',') + 1
tinder_df = tinder_df.drop(columns=["language"])

# Numeric weight for each location preference.
place_type_strength = {
    'anywhere': 1.0,
    'same state': 2.0,
    'same city': 2.5,
}
tinder_df['location_preference'] = tinder_df['location_preference'].apply(
    lambda preference: place_type_strength[preference]
)

# Columns with exactly two values are mapped straight to 0/1.
two_unique_values_column = {
    'sex': {'f': 1, 'm': 0},
    'dropped_out': {'no': 0, 'yes': 1},
}
tinder_df = tinder_df.replace(two_unique_values_column)

# Numeric weight for each relationship status.
status_type_strength = {
    'single': 2.0,
    'available': 2.0,
    'seeing someone': 1.0,
    'married': 1.0,
}
tinder_df['status'] = tinder_df['status'].apply(
    lambda status: status_type_strength[status]
)

# Fit a label encoder on orientation and overwrite the column with the codes.
orientation_encoder = LabelEncoder().fit(tinder_df['orientation'])
tinder_df['orientation'] = orientation_encoder.transform(
    tinder_df['orientation']
)

# The orientation column is then removed, so it is not part of the features.
tinder_df = tinder_df.drop(columns="orientation")

# Collapse the detailed drinking answers into three coarse habits.
drinking_habit = {
    'socially': 'sometimes',
    'rarely': 'sometimes',
    'not at all': 'do not drink',
    'often': 'drinks often',
    'very often': 'drinks often',
    'desperately': 'drinks often',
}
tinder_df['drinks'] = tinder_df['drinks'].apply(
    lambda answer: drinking_habit[answer]
)

# One encoder shared by drinks and drugs: fit on the values of both
# columns flattened together, then encode each column with it.
habit_encoder = LabelEncoder().fit(
    tinder_df[['drinks', 'drugs']].to_numpy().reshape(-1)
)
tinder_df['drinks_encoded'] = habit_encoder.transform(tinder_df['drinks'])
tinder_df['drugs_encoded'] = habit_encoder.transform(tinder_df['drugs'])

# The raw drinks/drugs columns are no longer needed.
tinder_df = tinder_df.drop(columns=["drinks", "drugs"])

# City -> region lookup used by get_region(); cities not listed here fall
# back to "northern_california".
region_dict = {
    'southern_california': [
        'los angeles',
        'san diego',
        'hacienda heights',
        'north hollywood',
        'phoenix',
    ],
    'new_york': [
        'brooklyn',
        'new york',
    ],
}

def get_region(city):
    """Return the region whose city list contains ``city`` (case-insensitive).

    Regions are looked up in the module-level ``region_dict``; a city that
    appears in none of them maps to ``"northern_california"``.
    """
    for region_name, member_cities in region_dict.items():
        lowered_members = tuple(member.lower() for member in member_cities)
        if city.lower() in lowered_members:
            return region_name
    return "northern_california"


# Keep only the text before the first ", " of each location and bucket it
# into a region with get_region().
tinder_df['location'] = (
    tinder_df['location'].str.split(', ').str[0].apply(get_region)
)

# One-hot encode the region column.
location_encoder = OneHotEncoder()
location_encoded = location_encoder.fit_transform(tinder_df[['location']])

# Wrap the encoded matrix in a DataFrame. Reusing tinder_df's index keeps
# the axis=1 concat below row-aligned even if the index is not 0..n-1.
location_encoded_df = pd.DataFrame(
    location_encoded.toarray(),
    columns=location_encoder.get_feature_names_out(['location']),
    index=tinder_df.index,
)

# Append the one-hot columns and drop the original location column.
tinder_df = pd.concat([tinder_df, location_encoded_df], axis=1)
tinder_df.drop(["location"], axis=1, inplace=True)

# Label-encode the job column into job_encoded, then drop the original.
job_encoder = LabelEncoder()
job_encoder.fit(tinder_df['job'])
tinder_df['job_encoded'] = job_encoder.transform(tinder_df['job'])
tinder_df.drop('job', axis=1, inplace=True)

# Only 'no' maps to 1.0; every other smoking answer maps to 0.
smokes = {
    'no': 1.0,
    'sometimes': 0,
    'yes': 0,
    'when drinking': 0,
    'trying to quit': 0,
}
tinder_df['smokes'] = tinder_df['smokes'].apply(lambda x: smokes[x])

# Binary-encode the pets column, append the encoded columns and drop the
# original.
bin_enc = ce.BinaryEncoder(cols=['pets'])
pet_enc = bin_enc.fit_transform(tinder_df['pets'])
tinder_df = pd.concat([tinder_df, pet_enc], axis=1)
tinder_df.drop("pets", axis=1, inplace=True)

# Label-encode new_languages in place. This encoder has its own name so the
# fitted OneHotEncoder in location_encoder is not overwritten.
new_languages_encoder = LabelEncoder()
new_languages_encoder.fit(tinder_df['new_languages'])
tinder_df['new_languages'] = new_languages_encoder.transform(
    tinder_df['new_languages'])

# Label-encode body_profile in place.
le = LabelEncoder()
tinder_df["body_profile"] = le.fit_transform(tinder_df["body_profile"])

# TF-IDF over the bios, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(tinder_df['bio'])

# Term for each TF-IDF column, in column order. vocabulary_ is a
# term -> column-index dict whose iteration order is not column order, so
# using it as column labels would attach the wrong term to each column.
feature_names = tfidf.get_feature_names_out()

# Dense TF-IDF frame, row-aligned with tinder_df for the concat below.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=feature_names,
    index=tinder_df.index,
)

# Full feature matrix: every non-text, non-identifier column plus the
# TF-IDF columns.
tinder_dfs = tinder_df.drop(["bio", "user_id", "username"], axis=1)
tinder_dfs = pd.concat([tinder_dfs, tfidf_df], axis=1)

# Reduce the feature matrix to 100 latent components.
svd = TruncatedSVD(n_components=100)
#svd = TruncatedSVD()

svd_matrix = svd.fit_transform(tinder_dfs)



# Cosine similarity between all pairs of training users.
cosine_sim = cosine_similarity(svd_matrix)

def recommend(user_df, num_recommendations=5):
    """Return the usernames of the training users most similar to a profile.

    Args:
        user_df: Feature row(s) with the same columns the module-level
            ``svd`` was fitted on. Only the first row is scored.
        num_recommendations: How many usernames to return; values below
            zero are treated as zero.

    Returns:
        A pandas Series of usernames taken from ``tinder_df['username']``,
        ordered from most to least similar.
    """
    # Project the profile into the latent space of the fitted SVD.
    user_latent = svd.transform(user_df)

    # Similarity of the (first) profile row to every training user.
    similarities = cosine_similarity(user_latent, svd_matrix)[0]

    # Stable descending sort, so ties keep their original row order.
    ranked = np.argsort(-similarities, kind="stable")

    # The profile is not a row of svd_matrix, so the top entry is a real
    # candidate and must be kept; skipping it would drop the best match.
    top_positions = ranked[:max(num_recommendations, 0)]

    # Rows of svd_matrix line up positionally with tinder_df.
    return tinder_df['username'].iloc[top_positions]

def recommendOne(user_df, num_recommendations=1):
    """Return the username of the training user most similar to a profile.

    Args:
        user_df: Feature row(s) with the same columns the module-level
            ``svd`` was fitted on. Only the first row is scored.
        num_recommendations: Kept for backward compatibility; exactly one
            username is always returned regardless of its value.

    Returns:
        The single best-matching entry of ``tinder_df['username']``.
    """
    # Project the profile into the latent space of the fitted SVD.
    user_latent = svd.transform(user_df)

    # Similarity of the (first) profile row to every training user.
    similarities = cosine_similarity(user_latent, svd_matrix)[0]

    # The profile is not a row of svd_matrix, so the highest score is the
    # best match; argmax returns the first position on ties.
    best_position = int(np.argmax(similarities))

    # Positional lookup: rows of svd_matrix line up with tinder_df rows.
    return tinder_df['username'].iloc[best_position]

# Model setup is complete: expose predict_match through a Gradio interface
# with two text inputs and one text output, then start serving.
iface = gr.Interface(
    fn=predict_match,
    inputs=["text", "text"],
    outputs="text",
)
iface.launch()