|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
sns.set_style("darkgrid", |
|
{"grid.color": ".6", |
|
"grid.linestyle": ":"}) |
|
import category_encoders as ce |
|
from sklearn.decomposition import TruncatedSVD |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from sklearn.preprocessing import LabelEncoder |
|
from sklearn.preprocessing import OneHotEncoder |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def predict_match(name, body_profile_type):
    """Return the username recommended for a mostly fixed demo profile.

    Every attribute of the query profile is hard-coded below except
    ``body_profile``, which comes from ``body_profile_type``.

    Args:
        name: Accepted for the Gradio interface but not used in the
            computation.
        body_profile_type: Text (or number) convertible to a finite float.

    Returns:
        Whatever ``recommendOne`` returns for the assembled feature row.

    Raises:
        ValueError: If ``body_profile_type`` is not a finite number.
    """
    import math

    # Validate first: this value arrives as free text from the UI, and a
    # NaN/inf would otherwise only surface later inside the SVD transform.
    try:
        body_profile = float(body_profile_type)
    except ValueError as err:
        raise ValueError(
            "body_profile_type must be a number, got "
            f"{body_profile_type!r}"
        ) from err
    if not math.isfinite(body_profile):
        raise ValueError(
            "body_profile_type must be a finite number, got "
            f"{body_profile_type!r}"
        )

    bio = ('I am a foodie and traveller. But sometimes like to sit alone '
           'in a corner and read a good fiction.')

    # Key order is the column order of the resulting frame; it is kept
    # exactly as before (presumably it has to line up with the columns the
    # SVD was fitted on -- confirm against the training frame).
    profile = {
        'age': 22.0,
        'status': 1.0,
        'sex': 0.0,
        'height': 60.0,
        'smokes': 1.0,
        'new_languages': 2.0,
        'body_profile': body_profile,
        'education_level': 4.0,
        'dropped_out': 0.0,
        'location_preference': 2.0,
        'num_languages': 2.0,
        'drinks_encoded': 0.0,
        'drugs_encoded': 0.0,
        'location_new_york': 0.0,
        'location_northern_california': 1.0,
        'location_southern_california': 0.0,
        'job_encoded': 4.0,
        'pets_0': 1.0,
        'pets_1': 1.0,
        'pets_2': 1.0,
        'pets_3': 1.0,
    }

    # Vectorise the bio with the already-fitted module-level TF-IDF model.
    bio_features = pd.DataFrame(
        tfidf.transform([bio]).toarray(),
        columns=feature_names,
    )

    profile_row = pd.DataFrame(profile, index=[0])
    user_df = pd.concat([profile_row, bio_features], axis=1)

    return recommendOne(user_df)
|
|
|
def greet_test(name, str2):
    """Return a greeting for ``name`` with ``str2`` echoed after it."""
    pieces = ("Hello ", name, "!!", " str2=", str2)
    return "".join(pieces)
|
|
|
|
|
|
|
# Read the profile table that every later preprocessing step mutates.
tinder_df = pd.read_csv("tinder_data.csv")

# Language count = number of commas in the 'language' string, plus one.
language_counts = tinder_df['language'].str.count(',') + 1
tinder_df['num_languages'] = language_counts
tinder_df = tinder_df.drop(columns=["language"])

# Location preference -> numeric strength. Direct dict indexing is kept so
# an unexpected label still raises KeyError.
place_type_strength = {
    'anywhere': 1.0,
    'same state': 2.0,
    'same city': 2.5,
}
tinder_df['location_preference'] = tinder_df['location_preference'].map(
    place_type_strength.__getitem__
)

# Two-valued columns -> 0/1 through a per-column replacement table.
two_unique_values_column = {
    'sex': {'f': 1, 'm': 0},
    'dropped_out': {'no': 0, 'yes': 1},
}
tinder_df = tinder_df.replace(two_unique_values_column)

# Relationship status -> numeric strength (same strict lookup as above).
status_type_strength = {
    'single': 2.0,
    'available': 2.0,
    'seeing someone': 1.0,
    'married': 1.0,
}
tinder_df['status'] = tinder_df['status'].map(
    status_type_strength.__getitem__
)
|
|
|
|
|
# Fit a label encoder on 'orientation' so the fitted object stays available
# as a module-level name, then remove the column from the feature table.
# The column used to be overwritten with its encoded values immediately
# before being dropped; that transform had no observable effect and is gone.
orientation_encoder = LabelEncoder()
orientation_encoder.fit(tinder_df['orientation'])

tinder_df.drop("orientation", axis=1, inplace=True)
|
|
|
# Collapse the raw 'drinks' answers into three coarser labels. Direct dict
# indexing is kept so an unexpected answer still raises KeyError.
drinking_habit = {
    'socially': 'sometimes',
    'rarely': 'sometimes',
    'not at all': 'do not drink',
    'often': 'drinks often',
    'very often': 'drinks often',
    'desperately': 'drinks often',
}
tinder_df['drinks'] = tinder_df['drinks'].map(drinking_habit.__getitem__)

# One encoder is fitted on the values of both habit columns together, so a
# label shared by 'drinks' and 'drugs' gets the same integer code in each.
habit_encoder = LabelEncoder()
habit_encoder.fit(tinder_df[['drinks', 'drugs']].to_numpy().ravel())

tinder_df['drinks_encoded'] = habit_encoder.transform(tinder_df['drinks'])
tinder_df['drugs_encoded'] = habit_encoder.transform(tinder_df['drugs'])

# Only the encoded columns are kept.
tinder_df = tinder_df.drop(columns=["drinks", "drugs"])
|
|
|
# Region name -> cities assigned to it. Any city not listed here falls
# into "northern_california" (see get_region).
region_dict = {
    'southern_california': [
        'los angeles',
        'san diego',
        'hacienda heights',
        'north hollywood',
        'phoenix',
    ],
    'new_york': [
        'brooklyn',
        'new york',
    ],
}

# Lower-cased city -> region, built once from region_dict so get_region does
# not re-lowercase every city list on every call. setdefault keeps the first
# region that lists a city, matching a first-match scan of region_dict.
# Rebuild this table if region_dict is modified afterwards.
_CITY_TO_REGION = {}
for _region, _cities in region_dict.items():
    for _city in _cities:
        _CITY_TO_REGION.setdefault(_city.lower(), _region)


def get_region(city):
    """Return the region name for ``city`` (case-insensitive).

    Args:
        city: City name. Non-string values (for example the NaN pandas
            uses for a missing location) are treated as unknown cities.

    Returns:
        The key of ``region_dict`` whose list contains the city, or
        ``"northern_california"`` when the city is not listed.
    """
    if not isinstance(city, str):
        # A missing location is handled like any other unlisted city
        # instead of failing on ``.lower()``.
        return "northern_california"
    return _CITY_TO_REGION.get(city.lower(), "northern_california")
|
|
|
|
|
# Keep the text before the first ', ' of each location and map it to a
# region name.
city_part = tinder_df['location'].str.split(', ').str[0]
tinder_df['location'] = city_part.apply(get_region)

# One-hot encode the region; output columns are named by the encoder from
# the 'location' prefix.
location_encoder = OneHotEncoder()
location_encoded = location_encoder.fit_transform(tinder_df[['location']])
location_columns = location_encoder.get_feature_names_out(['location'])
location_encoded_df = pd.DataFrame(
    location_encoded.toarray(),
    columns=location_columns,
)

# Append the indicator columns, then remove the original text column.
tinder_df = pd.concat([tinder_df, location_encoded_df], axis=1)
tinder_df = tinder_df.drop(columns=["location"])
|
|
|
|
|
# Integer-encode 'job' into 'job_encoded' and remove the text column.
job_encoder = LabelEncoder()
tinder_df['job_encoded'] = job_encoder.fit_transform(tinder_df['job'])
tinder_df = tinder_df.drop(columns='job')
|
|
|
# Smoking answer -> indicator: only 'no' maps to 1.0, every other listed
# answer maps to 0. Direct dict indexing is kept so an unlisted answer
# still raises KeyError.
smokes = {
    'no': 1.0,
    'sometimes': 0,
    'yes': 0,
    'when drinking': 0,
    'trying to quit': 0,
}
tinder_df['smokes'] = tinder_df['smokes'].map(smokes.__getitem__)
|
|
|
# Binary-encode 'pets' with category_encoders, append the encoder's output
# columns to the table, then remove the original column.
bin_enc = ce.BinaryEncoder(cols=['pets'])
pet_enc = bin_enc.fit_transform(tinder_df['pets'])

tinder_df = pd.concat([tinder_df, pet_enc], axis=1)
tinder_df = tinder_df.drop(columns="pets")
|
|
|
|
|
# Integer-encode 'new_languages' in place. The encoder has its own name:
# it was previously bound to ``location_encoder``, which replaced the
# fitted one-hot location encoder with an unrelated object.
language_encoder = LabelEncoder()
tinder_df['new_languages'] = language_encoder.fit_transform(
    tinder_df['new_languages']
)

# Integer-encode 'body_profile' in place.
le = LabelEncoder()
tinder_df["body_profile"] = le.fit_transform(tinder_df["body_profile"])
|
|
|
|
|
# Fit TF-IDF on the bios with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(tinder_df['bio'])

# Column labels for the TF-IDF matrix, in column order.
# ``tfidf.vocabulary_`` is a term -> column-index dict, so iterating it does
# not give the terms in column order and labelled the columns wrongly;
# ``get_feature_names_out()`` returns the terms aligned with the columns.
feature_names = tfidf.get_feature_names_out()

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Model input: every remaining profile column except the raw bio and the
# two identifier columns, followed by the TF-IDF columns.
tinder_dfs = tinder_df.drop(["bio", "user_id", "username"], axis=1)
tinder_dfs = pd.concat([tinder_dfs, tfidf_df], axis=1)
|
|
|
|
|
# Reduce the feature table to 100 components. The seed is fixed because
# TruncatedSVD's default solver is randomized; without it the projection,
# and therefore the similarity ranking, can differ between restarts.
svd = TruncatedSVD(n_components=100, random_state=42)
svd_matrix = svd.fit_transform(tinder_dfs)

# Pairwise similarity between all corpus rows.
# NOTE(review): nothing in this file reads ``cosine_sim``; it is kept so the
# module-level name stays available -- confirm whether it can be removed.
cosine_sim = cosine_similarity(svd_matrix)
|
|
|
def recommend(user_df, num_recommendations=5):
    """Return the usernames most similar to the profile in ``user_df``.

    The first row of ``user_df`` is projected with the fitted module-level
    ``svd`` and compared to every row of ``svd_matrix`` by cosine
    similarity.

    Args:
        user_df: Feature row(s) accepted by ``svd.transform``; only the
            first row is ranked.
        num_recommendations: Maximum number of usernames to return.
            Negative values are treated as 0.

    Returns:
        A slice of ``tinder_df['username']`` ordered from most to least
        similar.
    """
    user_components = svd.transform(user_df)
    scores = cosine_similarity(user_components, svd_matrix)[0]

    # Descending order; the stable sort keeps equal scores in their
    # original row order.
    ranked = np.argsort(-scores, kind="stable")

    # The ranking starts at position 0. The earlier slice began at 1, which
    # only makes sense when the query is itself a row of the corpus; for a
    # profile built outside the corpus it threw away the best match.
    count = max(num_recommendations, 0)
    top_positions = ranked[:count]

    return tinder_df['username'].iloc[top_positions]
|
|
|
def recommendOne(user_df, num_recommendations=1):
    """Return the first username that ``recommend`` yields for ``user_df``.

    Args:
        user_df: Feature row(s) forwarded unchanged to ``recommend``.
        num_recommendations: Forwarded to ``recommend``; only the first
            result is returned whatever its value.

    Returns:
        A single username value (not a Series).

    Raises:
        IndexError: If ``recommend`` returns no rows, for example when
            ``num_recommendations`` is 0.
    """
    # Delegate the ranking instead of repeating it here, and take the first
    # entry by position: the previous ``ser[idx]`` was a label lookup that
    # was only correct while the frame kept its default integer index.
    matches = recommend(user_df, num_recommendations)
    return matches.iloc[0]
|
|
|
|
|
|
|
# Two free-text inputs are passed to predict_match as
# (name, body_profile_type); the returned value is shown as text.
iface = gr.Interface(fn=predict_match, inputs=["text", "text"], outputs="text")

# Start the web server only when the file is run as a script, so importing
# the module does not launch the app.
if __name__ == "__main__":
    iface.launch()
|
|