MusIre committed on
Commit
57553ee
·
verified ·
1 Parent(s): 77d72f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -0
app.py CHANGED
@@ -4,6 +4,70 @@ import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import plotly.express as px
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import chromadb
8
  chroma_client = chromadb.Client()
9
 
 
4
  import seaborn as sns
5
  import plotly.express as px
6
 
7
# Upper bound on the number of CSV rows read into memory.
num_rows = 20000

# Read at most `num_rows` rows from emails.csv; malformed lines are skipped
# instead of raising.
df = pd.read_csv(
    'emails.csv',
    nrows=num_rows,
    on_bad_lines='skip',
)
9
+
10
def get_message(Series: pd.Series, header_lines: int = 15) -> pd.Series:
    """Extract the body text of each raw email.

    Each message is split on newlines, the first ``header_lines`` lines are
    discarded as headers, and the remaining lines are re-joined and stripped
    of surrounding whitespace.

    Args:
        Series: Raw email messages, one string per entry.
        header_lines: Number of leading lines to drop from every message.
            Defaults to 15, the value that used to be hard-coded here.

    Returns:
        A Series of body strings aligned with ``Series.index``.
    """
    bodies = [
        # Join with '\n' rather than '' so the last word of one line is not
        # fused with the first word of the next line.
        '\n'.join(message.split('\n')[header_lines:]).strip()
        for message in Series
    ]
    # Building from a list places values positionally and avoids one
    # element-wise write per row into a Series created without a dtype.
    return pd.Series(bodies, index=Series.index, dtype=object)
17
+
18
def get_date(Series: pd.Series):
    """Parse the timestamp found on the second line of each raw email.

    For every message the second line is taken, stripped of surrounding
    whitespace, and has the text 'Date: ' removed. A progress note is printed
    once all rows are collected, then the strings are handed to
    ``pd.to_datetime``.

    Args:
        Series: Raw email messages, one string per entry.

    Returns:
        The result of ``pd.to_datetime`` on the extracted strings, indexed
        like ``Series``.
    """
    date_strings = pd.Series(index=Series.index)
    for position, raw_email in enumerate(Series):
        # The [1:2] slice keeps only the second line, or nothing when the
        # message has a single line.
        second_line = ''.join(raw_email.split('\n')[1:2])
        date_strings.iloc[position] = second_line.strip().replace('Date: ', '')
    print('Done parsing, converting to datetime format..')
    return pd.to_datetime(date_strings)
28
+
29
def get_sender_and_receiver(Series: pd.Series):
    """Extract the sender and the three recipient fields of each raw email.

    Fields are read from fixed, zero-based line positions of every message:
    line 2 ('From: '), line 3 ('To: '), line 10 ('X-cc: ') and line 11
    ('X-bcc: '). The header label is removed from each value.

    Args:
        Series: Raw email messages, one string per entry.

    Returns:
        A tuple ``(sender, recipient1, recipient2, recipient3)`` of Series,
        each aligned with ``Series.index``.

    Raises:
        IndexError: If a message has fewer than 12 lines.
    """
    # (line position, header label) for each returned field, in output order.
    fields = ((2, 'From: '), (3, 'To: '), (10, 'X-cc: '), (11, 'X-bcc: '))
    columns = [[] for _ in fields]

    for message in Series:
        message_lines = message.split('\n')
        for values, (position, label) in zip(columns, fields):
            values.append(message_lines[position].replace(label, ''))

    # Build each Series from its list so values are placed positionally.
    # Writing through ``series[row]`` with an enumerate counter is label-based
    # and misplaces rows whenever the index is not 0..n-1.
    sender, recipient1, recipient2, recipient3 = (
        pd.Series(values, index=Series.index, dtype=object)
        for values in columns
    )
    return sender, recipient1, recipient2, recipient3
43
+
44
def get_subject(Series: pd.Series):
    """Extract the subject line of each raw email.

    The subject is read from the fifth line (zero-based position 4) of every
    message, with the text 'Subject: ' removed.

    Args:
        Series: Raw email messages, one string per entry.

    Returns:
        A Series of subject strings aligned with ``Series.index``.

    Raises:
        IndexError: If a message has fewer than 5 lines.
    """
    subjects = [
        message.split('\n')[4].replace('Subject: ', '') for message in Series
    ]
    # Construct from a list so values are placed positionally. Writing through
    # ``result[row]`` with an enumerate counter is label-based and misplaces
    # rows whenever the index is not 0..n-1.
    return pd.Series(subjects, index=Series.index, dtype=object)
52
+
53
def get_folder(Series: pd.Series):
    """Extract the folder field of each raw email.

    The folder is read from the thirteenth line (zero-based position 12) of
    every message, with the text 'X-Folder: ' removed.

    Args:
        Series: Raw email messages, one string per entry.

    Returns:
        A Series of folder strings aligned with ``Series.index``.

    Raises:
        IndexError: If a message has fewer than 13 lines.
    """
    folders = [
        message.split('\n')[12].replace('X-Folder: ', '') for message in Series
    ]
    # Construct from a list so values are placed positionally. Writing through
    # ``result[row]`` with an enumerate counter is label-based and misplaces
    # rows whenever the index is not 0..n-1.
    return pd.Series(folders, index=Series.index, dtype=object)
61
+
62
# Derive the parsed columns from the raw `message` column.
df['text'] = get_message(df['message'])
(
    df['sender'],
    df['recipient1'],
    df['recipient2'],
    df['recipient3'],
) = get_sender_and_receiver(df['message'])
df['Subject'] = get_subject(df['message'])
df['folder'] = get_folder(df['message'])
df['date'] = get_date(df['message'])

# Remove the `message` and `file` columns now that the parsed ones exist.
df = df.drop(columns=['message', 'file'])
69
+
70
+
71
  import chromadb
72
  # Module-level ChromaDB client, created by calling chromadb.Client() with no arguments.
  chroma_client = chromadb.Client()
73