awacke1 committed on
Commit
1c12389
Β·
1 Parent(s): 2bc1c14

Update backupapp.py

Browse files
Files changed (1) hide show
  1. backupapp.py +5 -31
backupapp.py CHANGED
@@ -1,89 +1,71 @@
1
-
2
  import streamlit as st
3
  import re
4
  import json
 
5
  import nltk
6
  from nltk.corpus import stopwords
7
  from nltk import FreqDist
8
- from graphviz import Digraph
9
- from collections import Counter
10
-
11
  nltk.download('punkt')
12
  nltk.download('stopwords')
13
 
 
 
 
14
  def remove_timestamps(text):
15
  return re.sub(r'\d{1,2}:\d{2}\n', '', text)
16
 
17
  def process_text(text):
18
  lines = text.split("\n")
19
  processed_lines = []
20
-
21
  for line in lines:
22
  if line:
23
  processed_lines.append(line)
24
-
25
  outline = ""
26
  for i, line in enumerate(processed_lines):
27
  if i % 2 == 0:
28
  outline += f"**{line}**\n"
29
  else:
30
  outline += f"- {line} πŸ˜„\n"
31
-
32
  return outline
33
 
34
  def create_jsonl_list(text):
35
  lines = text.split("\n")
36
  jsonl_list = []
37
-
38
  for line in lines:
39
  if line:
40
  jsonl_list.append({"text": line})
41
-
42
  return jsonl_list
43
 
44
  def unit_test(input_text):
45
  st.write("Test Text without Timestamps:")
46
  test_text_without_timestamps = remove_timestamps(input_text)
47
  st.write(test_text_without_timestamps)
48
-
49
  st.write("Test JSONL List:")
50
  test_jsonl_list = create_jsonl_list(test_text_without_timestamps)
51
  st.write(test_jsonl_list)
52
 
53
-
54
-
55
  def extract_high_information_words(text, top_n=10):
56
  words = nltk.word_tokenize(text)
57
  words = [word.lower() for word in words if word.isalpha()]
58
-
59
  stop_words = set(stopwords.words('english'))
60
  filtered_words = [word for word in words if word not in stop_words]
61
-
62
  freq_dist = FreqDist(filtered_words)
63
  high_information_words = [word for word, _ in freq_dist.most_common(top_n)]
64
-
65
  return high_information_words
66
 
67
-
68
  def create_relationship_graph(words):
69
  graph = Digraph()
70
-
71
  for index, word in enumerate(words):
72
  graph.node(str(index), word)
73
 
74
  if index > 0:
75
  graph.edge(str(index - 1), str(index), label=str(index))
76
-
77
  return graph
78
 
79
-
80
  def display_relationship_graph(words):
81
  graph = create_relationship_graph(words)
82
  st.graphviz_chart(graph)
83
 
84
-
85
-
86
-
87
  text_input = st.text_area("Enter text:", value="", height=300)
88
  text_without_timestamps = remove_timestamps(text_input)
89
 
@@ -186,20 +168,12 @@ that's it that's literally it this is the core idea now it turns out it's not
186
  difficult to formalize mathematically but this is really what's going on if in a neural network
187
 
188
  '''
189
-
190
  unit_test(unit_test_text_3)
191
 
192
-
193
-
194
-
195
-
196
  # Adding new functionality to the existing code
197
  text_without_timestamps = remove_timestamps(unit_test_text_2)
198
  top_words = extract_high_information_words(text_without_timestamps, 10)
199
  st.markdown("**Top 10 High Information Words:**")
200
  st.write(top_words)
201
-
202
  st.markdown("**Relationship Graph:**")
203
- display_relationship_graph(top_words)
204
-
205
-
 
 
1
  import streamlit as st
2
  import re
3
  import json
4
+
5
  import nltk
6
  from nltk.corpus import stopwords
7
  from nltk import FreqDist
 
 
 
8
  nltk.download('punkt')
9
  nltk.download('stopwords')
10
 
11
+ from graphviz import Digraph
12
+ from collections import Counter
13
+
14
  def remove_timestamps(text):
15
  return re.sub(r'\d{1,2}:\d{2}\n', '', text)
16
 
17
  def process_text(text):
18
  lines = text.split("\n")
19
  processed_lines = []
 
20
  for line in lines:
21
  if line:
22
  processed_lines.append(line)
 
23
  outline = ""
24
  for i, line in enumerate(processed_lines):
25
  if i % 2 == 0:
26
  outline += f"**{line}**\n"
27
  else:
28
  outline += f"- {line} πŸ˜„\n"
 
29
  return outline
30
 
31
  def create_jsonl_list(text):
32
  lines = text.split("\n")
33
  jsonl_list = []
 
34
  for line in lines:
35
  if line:
36
  jsonl_list.append({"text": line})
 
37
  return jsonl_list
38
 
39
  def unit_test(input_text):
40
  st.write("Test Text without Timestamps:")
41
  test_text_without_timestamps = remove_timestamps(input_text)
42
  st.write(test_text_without_timestamps)
 
43
  st.write("Test JSONL List:")
44
  test_jsonl_list = create_jsonl_list(test_text_without_timestamps)
45
  st.write(test_jsonl_list)
46
 
 
 
47
  def extract_high_information_words(text, top_n=10):
48
  words = nltk.word_tokenize(text)
49
  words = [word.lower() for word in words if word.isalpha()]
 
50
  stop_words = set(stopwords.words('english'))
51
  filtered_words = [word for word in words if word not in stop_words]
 
52
  freq_dist = FreqDist(filtered_words)
53
  high_information_words = [word for word, _ in freq_dist.most_common(top_n)]
 
54
  return high_information_words
55
 
 
56
  def create_relationship_graph(words):
57
  graph = Digraph()
 
58
  for index, word in enumerate(words):
59
  graph.node(str(index), word)
60
 
61
  if index > 0:
62
  graph.edge(str(index - 1), str(index), label=str(index))
 
63
  return graph
64
 
 
65
  def display_relationship_graph(words):
66
  graph = create_relationship_graph(words)
67
  st.graphviz_chart(graph)
68
 
 
 
 
69
  text_input = st.text_area("Enter text:", value="", height=300)
70
  text_without_timestamps = remove_timestamps(text_input)
71
 
 
168
  difficult to formalize mathematically but this is really what's going on if in a neural network
169
 
170
  '''
 
171
  unit_test(unit_test_text_3)
172
 
 
 
 
 
173
  # Adding new functionality to the existing code
174
  text_without_timestamps = remove_timestamps(unit_test_text_2)
175
  top_words = extract_high_information_words(text_without_timestamps, 10)
176
  st.markdown("**Top 10 High Information Words:**")
177
  st.write(top_words)
 
178
  st.markdown("**Relationship Graph:**")
179
+ display_relationship_graph(top_words)