MusIre committed on
Commit
d82e76f
·
verified ·
1 Parent(s): a8c0870

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +234 -0
app.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import plotly.express as px
7
+
8
# Upper bound on how many CSV rows are loaded.
num_rows = 20000

# Load the first `num_rows` records of the e-mail dump; malformed CSV lines
# are skipped (on_bad_lines='skip') instead of aborting the load.
df = pd.read_csv('/emails.csv', nrows=num_rows, on_bad_lines='skip')
10
+
11
def get_message(Series: pd.Series):
    """Return the body text of every raw e-mail in ``Series``.

    The body is taken to be everything after the first blank line, i.e. after
    the header section, instead of assuming the headers always occupy exactly
    15 lines (folded or missing headers shift that count). Line breaks inside
    the body are preserved so words on adjacent lines are not fused together.

    Args:
        Series: raw message strings (headers, blank line, body).

    Returns:
        An object-dtype Series with the same index holding the stripped body.
        Entries that are not strings, or that contain no blank line
        separating headers from body, yield an empty string.
    """
    def body_of(message):
        if not isinstance(message, str):
            # e.g. NaN produced by an empty CSV cell
            return ''
        # Normalise CRLF so the header/body separator is always '\n\n'.
        normalised = message.replace('\r\n', '\n')
        _headers, separator, body = normalised.partition('\n\n')
        return body.strip() if separator else ''

    return pd.Series(
        [body_of(message) for message in Series],
        index=Series.index,
        dtype=object,
    )
18
+
19
def get_date(Series: pd.Series):
    """Parse the ``Date`` header of every raw e-mail into a datetime.

    The header is looked up by name with the standard-library e-mail parser
    rather than by assuming it is always the second line of the message.

    Args:
        Series: raw message strings (headers, blank line, body).

    Returns:
        The result of ``pd.to_datetime`` over the extracted header values,
        indexed like ``Series``. Messages without a ``Date`` header (or
        non-string entries) become ``NaT``.
    """
    from email.parser import HeaderParser

    parser = HeaderParser()

    def date_of(message):
        if not isinstance(message, str):
            return None
        value = parser.parsestr(message)['Date']
        # Missing or empty header -> None so pandas turns it into NaT.
        return value.strip() if value and value.strip() else None

    result = pd.Series(
        [date_of(message) for message in Series],
        index=Series.index,
        dtype=object,
    )
    print('Done parsing, converting to datetime format..')
    return pd.to_datetime(result)
29
+
30
def get_sender_and_receiver(Series: pd.Series):
    """Extract sender and recipient headers from every raw e-mail.

    Headers are looked up by name (``From``, ``To``, ``X-cc``, ``X-bcc``)
    with the standard-library e-mail parser, so a recipient list folded over
    several lines no longer shifts the other fields. Results are built
    positionally, so the function also works when ``Series`` does not have a
    0..n-1 index.

    Args:
        Series: raw message strings (headers, blank line, body).

    Returns:
        A tuple ``(sender, recipient1, recipient2, recipient3)`` of
        object-dtype Series indexed like ``Series``, holding the ``From``,
        ``To``, ``X-cc`` and ``X-bcc`` values respectively. Missing headers
        and non-string entries yield an empty string.
    """
    from email.parser import HeaderParser

    parser = HeaderParser()
    header_names = ('From', 'To', 'X-cc', 'X-bcc')

    def unfold(value):
        # Join continuation lines of a folded header with single spaces.
        if not value:
            return ''
        return ' '.join(part.strip() for part in value.splitlines()).strip()

    def fields_of(message):
        if not isinstance(message, str):
            return ('',) * len(header_names)
        headers = parser.parsestr(message)
        return tuple(unfold(headers[name]) for name in header_names)

    parsed = [fields_of(message) for message in Series]
    sender, recipient1, recipient2, recipient3 = (
        pd.Series([row[column] for row in parsed], index=Series.index, dtype=object)
        for column in range(len(header_names))
    )
    return sender, recipient1, recipient2, recipient3
44
+
45
def get_subject(Series: pd.Series):
    """Extract the ``Subject`` header of every raw e-mail.

    The header is looked up by name with the standard-library e-mail parser
    instead of assuming it is always the fifth line, which breaks as soon as
    an earlier header (typically ``To``) is folded over several lines.

    Args:
        Series: raw message strings (headers, blank line, body).

    Returns:
        An object-dtype Series indexed like ``Series``. Missing headers and
        non-string entries yield an empty string.
    """
    from email.parser import HeaderParser

    parser = HeaderParser()

    def subject_of(message):
        if not isinstance(message, str):
            return ''
        value = parser.parsestr(message)['Subject']
        if not value:
            return ''
        # Join continuation lines of a folded header with single spaces.
        return ' '.join(part.strip() for part in value.splitlines()).strip()

    return pd.Series(
        [subject_of(message) for message in Series],
        index=Series.index,
        dtype=object,
    )
53
+
54
def get_folder(Series: pd.Series):
    """Extract the ``X-Folder`` header of every raw e-mail.

    The header is looked up by name with the standard-library e-mail parser
    instead of assuming it is always the thirteenth line, which breaks as
    soon as an earlier header is folded over several lines or is absent.

    Args:
        Series: raw message strings (headers, blank line, body).

    Returns:
        An object-dtype Series indexed like ``Series``. Missing headers and
        non-string entries yield an empty string.
    """
    from email.parser import HeaderParser

    parser = HeaderParser()

    def folder_of(message):
        if not isinstance(message, str):
            return ''
        value = parser.parsestr(message)['X-Folder']
        if not value:
            return ''
        # Join continuation lines of a folded header with single spaces.
        return ' '.join(part.strip() for part in value.splitlines()).strip()

    return pd.Series(
        [folder_of(message) for message in Series],
        index=Series.index,
        dtype=object,
    )
62
+
63
# Derive one column per parsed field from the raw `message` column.
df['text'] = get_message(df['message'])
(
    df['sender'],
    df['recipient1'],
    df['recipient2'],
    df['recipient3'],
) = get_sender_and_receiver(df['message'])
df['Subject'] = get_subject(df['message'])
df['folder'] = get_folder(df['message'])
df['date'] = get_date(df['message'])

# The raw source columns are no longer needed once the fields are extracted.
df = df.drop(columns=['message', 'file'])

# Bare expression kept from the original; its value is not used here.
df.head(100)
72
+
73
+
74
+ import chromadb
75
chroma_client = chromadb.Client()

# Collection using Chroma's default embedding function (none is passed here).
collection = chroma_client.create_collection(name="emails")

# Bare expression kept from the original; its value is not used here.
df.loc[4, 'text']

# Insert the e-mails in batches instead of issuing one `add` call per row:
# each call carries fixed overhead, and `add` accepts lists of documents,
# metadatas and ids of equal length.
batch_size = 500
for start in range(0, len(df), batch_size):
    chunk = df.iloc[start:start + batch_size]
    collection.add(
        documents=chunk['text'].tolist(),
        metadatas=[
            {
                "sender": row.sender,
                "recipient1": row.recipient1,
                "recipient2": row.recipient2,
                "recipient3": row.recipient3,
                "subject": row.Subject,
                "folder": row.folder,
                # Stored as text, exactly as the per-row version did.
                "date": str(row.date),
            }
            for row in chunk.itertuples(index=False)
        ],
        # Ids are the DataFrame index labels rendered as strings.
        ids=[str(label) for label in chunk.index],
    )

# Bare lookup kept from the original; the returned record is discarded.
collection.get(
    ids=["140"]
)

# Sample similarity query against the freshly built collection.
results = collection.query(
    query_texts=["this is a document"],
    n_results=2,
    include=['distances', 'metadatas', 'documents']
)
results
110
+
111
+
112
+ from chromadb.utils import embedding_functions
113
+
114
+
115
+
116
# Embedding function backed by the "paraphrase-MiniLM-L3-v2" sentence-transformer.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="paraphrase-MiniLM-L3-v2"
)

collection_minilm = chroma_client.create_collection(
    name="emails_minilm",
    embedding_function=sentence_transformer_ef,
)

# Insert the e-mails in batches instead of issuing one `add` call per row:
# each call carries fixed overhead, and `add` accepts lists of documents,
# metadatas and ids of equal length.
batch_size = 500
for start in range(0, len(df), batch_size):
    chunk = df.iloc[start:start + batch_size]
    # Progress indicator: position of the first row of the current batch.
    print(start)
    collection_minilm.add(
        documents=chunk['text'].tolist(),
        metadatas=[
            {
                "sender": row.sender,
                "recipient1": row.recipient1,
                "recipient2": row.recipient2,
                "recipient3": row.recipient3,
                "subject": row.Subject,
                "folder": row.folder,
                # Stored as text, exactly as the per-row version did.
                "date": str(row.date),
            }
            for row in chunk.itertuples(index=False)
        ],
        # Ids are the DataFrame index labels rendered as strings.
        ids=[str(label) for label in chunk.index],
    )

# Sample similarity query against the MiniLM-backed collection.
results = collection_minilm.query(
    query_texts=["this is a document"],
    n_results=2,
    include=['distances', 'metadatas', 'documents']
)
results
146
+
147
+
148
+
149
+
150
+
151
+ import gradio as gr
152
+
153
+
154
def query_chromadb(question, numberOfResults):
    """Return the documents most similar to ``question``.

    Args:
        question: free-text query typed by the user.
        numberOfResults: how many documents to return. It may arrive as a
            float or ``None`` from a UI number field, so it is coerced to an
            integer of at least 1 before being handed to the collection.

    Returns:
        The list of matching document texts for the single query.
    """
    wanted = max(1, int(numberOfResults or 1))
    results = collection_minilm.query(
        # The original call never passed the question, so there was nothing
        # to search for; the query text is now supplied explicitly.
        query_texts=[question],
        n_results=wanted,
    )

    # One query was issued, so the hits are the first (and only) entry.
    return results['documents'][0]
160
+
161
# Minimal UI: a text box and a number field feed `query_chromadb`, and the
# returned documents are rendered as text.
iface = gr.Interface(
    title="Email Dataset Interface",
    description="Insert the question or the key word to find the topic correlated in the dataset",
    fn=query_chromadb,
    inputs=["text", "number"],
    outputs="text",
)

# NOTE(review): when run as a plain script, `launch` may block until the
# server stops, which would delay everything below - confirm intended.
iface.launch(share=True)
170
+
171
+
172
+
173
+ import ast
174
+
175
def create_output(dictionary, number):
    """Format the top query hits as a human-readable text report.

    Each hit is rendered as a block delimited by dashed lines containing the
    e-mail subject and the message text. Blocks are separated by a newline.

    The documents and metadata already present in ``dictionary`` are used
    directly. Only when they are absent is each hit fetched by id from
    ``collection_minilm``. This replaces the previous string round-trip
    (``str`` / ``strip`` / ``ast.literal_eval``), which broke on ids or
    metadata containing quotes or brackets, printed the document as a list
    literal, and emitted a stray ``"`` after the subject.

    Args:
        dictionary: a Chroma result mapping with an ``'ids'`` entry and,
            optionally, ``'documents'`` and ``'metadatas'`` entries. Both the
            nested per-query layout and a flat layout are accepted; for the
            nested layout only the first query is reported.
        number: maximum number of hits to include. Floats are truncated, and
            the value is capped at the number of hits actually available.

    Returns:
        The formatted report, or an empty string when there are no hits.
    """
    def first_query(values):
        # Nested layout: [[...]] per query; flat layout: [...].
        if values and isinstance(values[0], (list, tuple)):
            return values[0]
        return values or []

    ids = first_query(dictionary['ids'])
    documents = first_query(dictionary.get('documents'))
    metadatas = first_query(dictionary.get('metadatas'))

    count = max(0, min(int(number), len(ids)))

    blocks = []
    for position in range(count):
        if position < len(documents) and position < len(metadatas):
            doc = documents[position]
            metadata = metadatas[position]
        else:
            # Result did not include the payload; look the hit up by id.
            fetched = collection_minilm.get(ids=[ids[position]])
            doc = fetched['documents'][0]
            metadata = fetched['metadatas'][0]

        subject = (metadata or {}).get('subject', '')
        blocks.append(
            "---------------\n"
            f"SUBJECT: {subject}\n"
            "MESSAGE: \n"
            f"{doc}\n"
            "---------------"
        )

    return "\n".join(blocks)
212
+
213
def query_chromadb_advanced(question, numberOfResults):
    """Query the MiniLM collection and return a formatted text report.

    Args:
        question: free-text query typed by the user.
        numberOfResults: how many hits to request. It may arrive as a float
            or ``None`` from a UI number field, so it is coerced to an
            integer of at least 1 before being handed to the collection.

    Returns:
        The report produced by ``create_output`` for the returned hits.
    """
    wanted = max(1, int(numberOfResults or 1))
    results = collection_minilm.query(
        query_texts=[question],
        n_results=wanted,
    )

    # The collection may hold fewer documents than requested; never ask the
    # formatter for more hits than were actually returned.
    available = len(results['ids'][0])
    return create_output(results, min(wanted, available))
220
+
221
+
222
# Smoke test of the formatted search before exposing it through the UI.
result_advance = query_chromadb_advanced("bank", 4)
print(result_advance)

# Rebinds `iface` to a second interface that serves the formatted report
# produced by `query_chromadb_advanced`.
iface = gr.Interface(
    title="Email Dataset Interface",
    description="Insert the question or the key word to find the topic correlated in the dataset",
    fn=query_chromadb_advanced,
    inputs=["text", "number"],
    outputs="text",
)

iface.launch(share=True)