Tinder!
Browse files- app.py +256 -1
- tinder_data.csv +0 -0
app.py
CHANGED
@@ -15,8 +15,263 @@ from sklearn.preprocessing import OneHotEncoder
|
|
15 |
|
16 |
|
17 |
|
18 |
-
def
|
19 |
return "Hello " + name + "!!" + " str2=" + str2
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
iface = gr.Interface(fn=greet, inputs=["text", "text"], outputs="text")
|
22 |
iface.launch()
|
|
|
15 |
|
16 |
|
17 |
|
18 |
+
def greet_o(name, str2):
    """Return the original demo greeting built from the two text inputs.

    Kept as the pre-recommender version of the handler; it only formats
    its arguments and touches no module state.
    """
    return f"Hello {name}!! str2={str2}"
|
20 |
|
21 |
+
def greet(name, str2):
    """Gradio handler: recommend a match for a fixed sample profile.

    ``name`` is currently ignored; the profile below is hard-coded.
    ``str2`` is echoed back in the returned string.

    Returns a greeting that contains the first username returned by
    ``recommend`` for the sample profile.
    """
    # Key order matters: it becomes the column order of the feature row.
    # NOTE(review): this order presumably mirrors the training columns in
    # ``tinder_dfs`` -- confirm against the CSV.
    profile = {
        # Numerical / ordinal columns.
        'age': 22.0,
        'status': 1.0,
        'sex': 0.0,
        'height': 60.0,
        'smokes': 1.0,
        'new_languages': 2.0,
        'body_profile': 0.0,
        'education_level': 4.0,
        'dropped_out': 0.0,
        'bio': 'I am a foodie and traveller. But sometimes like to sit alone in a corner and read a good fiction.',
        'location_preference': 2.0,
        'num_languages': 2.0,
        'drinks_encoded': 0.0,
        'drugs_encoded': 0.0,
        # One-hot / binary encoded categorical columns.
        'location_new_york': 0.0,
        'location_northern_california': 1.0,
        'location_southern_california': 0.0,
        'job_encoded': 4.0,
        'pets_0': 1.0,
        'pets_1': 1.0,
        'pets_2': 1.0,
        'pets_3': 1.0,
    }

    # The free-text bio is vectorised separately with the fitted TF-IDF
    # model, so take it out of the tabular part of the profile.
    bio = profile.pop('bio')
    tfidf_df = pd.DataFrame(tfidf.transform([bio]).toarray(),
                            columns=feature_names)

    # Single-row frame: tabular features followed by the TF-IDF features.
    user_df = pd.DataFrame(profile, index=[0])
    user_df = pd.concat([user_df, tfidf_df], axis=1)

    suggested_arr = recommend(user_df)

    # ``recommend`` returns a Series that keeps the dataset's row labels,
    # so the first suggestion must be taken by position, not by label 0.
    return f"Hello {suggested_arr.iloc[0]}!! str2={str2}"
|
61 |
+
|
62 |
+
# Load the raw profile data with pandas.
tinder_df = pd.read_csv("tinder_data.csv")

# Count the languages in each row: the column is a comma-separated
# string, so the count is the number of commas plus one.
tinder_df['num_languages'] = tinder_df['language'].str.count(',') + 1
tinder_df.drop(["language"], axis=1, inplace=True)

# Numeric weight for each location-preference category.  A value that is
# not listed raises KeyError rather than being silently mapped to NaN.
place_type_strength = {
    'anywhere': 1.0,
    'same state': 2.0,
    'same city': 2.5,
}
tinder_df['location_preference'] = tinder_df['location_preference'].apply(
    lambda x: place_type_strength[x])

# Two-valued columns are replaced by 0/1 flags.
two_unique_values_column = {
    'sex': {'f': 1, 'm': 0},
    'dropped_out': {'no': 0, 'yes': 1},
}
tinder_df.replace(two_unique_values_column, inplace=True)

# Numeric weight for each relationship status.
status_type_strength = {
    'single': 2.0,
    'available': 2.0,
    'seeing someone': 1.0,
    'married': 1.0,
}
tinder_df['status'] = tinder_df['status'].apply(
    lambda x: status_type_strength[x])

# Fit a label encoder on the orientation column.  The column itself is
# not used as a feature: it is dropped right away, so there is no point
# in writing the encoded values back first.
orientation_encoder = LabelEncoder()
orientation_encoder.fit(tinder_df['orientation'])
tinder_df.drop("orientation", axis=1, inplace=True)

# Collapse the raw drinking answers into three coarser categories.
drinking_habit = {
    'socially': 'sometimes',
    'rarely': 'sometimes',
    'not at all': 'do not drink',
    'often': 'drinks often',
    'very often': 'drinks often',
    'desperately': 'drinks often',
}
tinder_df['drinks'] = tinder_df['drinks'].apply(
    lambda x: drinking_habit[x])

# One encoder is fitted on the values of both habit columns so that the
# same label gets the same code in 'drinks' and 'drugs'.
habit_encoder = LabelEncoder()
habit_encoder.fit(tinder_df[['drinks', 'drugs']].values.reshape(-1))

tinder_df['drinks_encoded'] = habit_encoder.transform(tinder_df['drinks'])
tinder_df['drugs_encoded'] = habit_encoder.transform(tinder_df['drugs'])

# Drop the original drinks and drugs columns.
tinder_df.drop(["drinks", "drugs"], axis=1, inplace=True)
|
139 |
+
|
140 |
+
# Cities that belong to each explicitly listed region.
region_dict = {
    'southern_california': ['los angeles', 'san diego', 'hacienda heights',
                            'north hollywood', 'phoenix'],
    'new_york': ['brooklyn', 'new york'],
}

# Reverse lookup (lower-cased city -> region), built once instead of
# re-lowering every city list on every call.  ``setdefault`` keeps the
# first region that lists a city, matching the original scan order.
_CITY_TO_REGION = {}
for _region, _cities in region_dict.items():
    for _city in _cities:
        _CITY_TO_REGION.setdefault(_city.lower(), _region)


def get_region(city):
    """Return the region name for ``city`` (case-insensitive).

    Any city that is not listed in ``region_dict`` falls back to
    ``"northern_california"``.  Non-string input (for example NaN coming
    from a missing location) takes the same fallback instead of raising
    ``AttributeError``.
    """
    if not isinstance(city, str):
        return "northern_california"
    return _CITY_TO_REGION.get(city.lower(), "northern_california")
|
151 |
+
|
152 |
+
|
153 |
+
# Keep only the part of the location before the first ', ' and map it to
# a coarse region.
tinder_df['location'] = tinder_df['location']\
    .str.split(', ')\
    .str[0].apply(get_region)

# One-hot encode the region.
location_encoder = OneHotEncoder()
location_encoded = location_encoder.fit_transform(tinder_df[['location']])

# New DataFrame holding the one-hot columns, named by the encoder.
location_encoded_df = pd.DataFrame(
    location_encoded.toarray(),
    columns=location_encoder.get_feature_names_out(['location']),
)

# Append the encoded columns and drop the original location column.
tinder_df = pd.concat([tinder_df, location_encoded_df], axis=1)
tinder_df.drop(["location"], axis=1, inplace=True)

# Label-encode the job column into 'job_encoded', then drop the original.
job_encoder = LabelEncoder()
job_encoder.fit(tinder_df['job'])
tinder_df['job_encoded'] = job_encoder.transform(tinder_df['job'])
tinder_df.drop('job', axis=1, inplace=True)

# Smoking is reduced to a single flag: 1.0 for 'no', 0 for everything else.
smokes = {
    'no': 1.0,
    'sometimes': 0,
    'yes': 0,
    'when drinking': 0,
    'trying to quit': 0,
}
tinder_df['smokes'] = tinder_df['smokes'].apply(lambda x: smokes[x])

# Binary-encode the pets column (``ce`` is presumably category_encoders,
# imported above this chunk) and replace the original column with the
# encoded ones.
bin_enc = ce.BinaryEncoder(cols=['pets'])
pet_enc = bin_enc.fit_transform(tinder_df['pets'])
tinder_df = pd.concat([tinder_df, pet_enc], axis=1)
tinder_df.drop("pets", axis=1, inplace=True)

# Label-encode new_languages in place.  This uses its own encoder name so
# that ``location_encoder`` keeps referring to the fitted OneHotEncoder.
language_encoder = LabelEncoder()
language_encoder.fit(tinder_df['new_languages'])
tinder_df['new_languages'] = language_encoder.transform(
    tinder_df['new_languages'])

# Label-encode the body_profile column in place.
le = LabelEncoder()
tinder_df["body_profile"] = le.fit_transform(tinder_df["body_profile"])

# TF-IDF features for the free-text bio.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(tinder_df['bio'])

# Column labels in matrix-column order.  ``vocabulary_`` is a
# term -> column-index mapping whose key order does not follow the column
# order, so using it as labels would mislabel the columns.
feature_names = tfidf.get_feature_names_out()

# Convert the tfidf matrix to a DataFrame.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Full feature matrix: non-text features followed by the TF-IDF features.
tinder_dfs = tinder_df.drop(["bio", "user_id", "username"], axis=1)
tinder_dfs = pd.concat([tinder_dfs, tfidf_df], axis=1)

# Reduce the feature matrix to 100 components with truncated SVD.
svd = TruncatedSVD(n_components=100)
svd_matrix = svd.fit_transform(tinder_dfs)

# Cosine similarity between all pairs of users.
# NOTE(review): nothing in the visible part of this file reads
# ``cosine_sim``; it is an N x N matrix, so consider removing it if it is
# confirmed unused.
cosine_sim = cosine_similarity(svd_matrix)
|
252 |
+
|
253 |
+
def recommend(user_df, num_recommendations=5):
    """Return the usernames of the users most similar to ``user_df``.

    ``user_df`` is a single-row feature frame with the same columns as
    the matrix the module-level ``svd`` was fitted on.  It is projected
    with that SVD and compared, by cosine similarity, against every row
    of ``svd_matrix``.

    Returns a pandas Series of at most ``num_recommendations`` usernames
    from ``tinder_df``, best match first.  The Series keeps the dataset's
    row labels as its index.
    """
    # Project the query profile into the fitted SVD space.
    user_components = svd.transform(user_df)

    # Similarity of the (single) query row against every training user.
    scores = cosine_similarity(user_components, svd_matrix)[0]

    # Indices by descending score; the stable sort keeps dataset order
    # among ties.  The query user is not a row of ``svd_matrix``, so the
    # top hit is a real candidate and must not be skipped.
    ranked = (-scores).argsort(kind="stable")
    sim_indices = ranked[:max(num_recommendations, 0)]

    # Return the usernames of the recommended users.
    return tinder_df['username'].iloc[sim_indices]
|
273 |
+
|
274 |
+
# Setup complete!
|
275 |
+
|
276 |
# Two free-text inputs are passed to ``greet``; its return value is shown
# as text.
iface = gr.Interface(fn=greet, inputs=["text", "text"], outputs="text")
iface.launch()
|
tinder_data.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|