awacke1 commited on
Commit
d8d0fe8
·
1 Parent(s): d7e74c5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import re
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from nltk import FreqDist
6
+ from graphviz import Digraph
7
+ from collections import Counter
8
+ import datetime
9
+ import pandas as pd
10
+ from PyPDF2 import PdfFileReader
11
+ from io import StringIO, BytesIO
12
+
13
+ nltk.download('punkt')
14
+ nltk.download('stopwords')
15
+
16
+
17
def remove_timestamps(text):
    """Strip transcript-style timestamps (e.g. ``1:23`` or ``12:05``) from text.

    A timestamp is one or two digits, a colon, and two digits, terminated by
    a newline or the end of the string.

    Args:
        text: Raw transcript text.

    Returns:
        The text with every matching timestamp (and its trailing newline,
        if any) removed.
    """
    # (?:\n|$) also catches a timestamp on the very last line when the text
    # has no trailing newline, which the original bare `\n` pattern missed.
    return re.sub(r'\d{1,2}:\d{2}(?:\n|$)', '', text)
19
+
20
+
21
def process_text(text):
    """Turn raw text into a Markdown outline with alternating emphasis.

    Blank lines are dropped; of the remaining lines, even-indexed ones
    become bold headings and odd-indexed ones become emoji bullet points.

    Args:
        text: Input text, one item per line.

    Returns:
        A Markdown string, one formatted entry per input line.
    """
    non_empty = [line for line in text.split("\n") if line]

    parts = []
    for index, line in enumerate(non_empty):
        if index % 2 == 0:
            parts.append(f"**{line}**\n")
        else:
            parts.append(f"- {line} 😄\n")

    return "".join(parts)
37
+
38
+
39
def extract_high_information_words(text, top_n=10):
    """Return the most frequent content-bearing words in *text*.

    Tokenizes with NLTK, keeps lower-cased purely alphabetic tokens,
    drops English stopwords, and ranks the remainder by frequency.

    Args:
        text: Text to analyze.
        top_n: How many of the most common words to return (default 10).

    Returns:
        A list of up to ``top_n`` words, most frequent first.
    """
    tokens = [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]

    english_stops = set(stopwords.words('english'))
    content_words = [token for token in tokens if token not in english_stops]

    ranked = FreqDist(content_words).most_common(top_n)
    return [word for word, _count in ranked]
50
+
51
+
52
def create_relationship_graph(words):
    """Build a Graphviz chain graph over *words*.

    Each word becomes a node keyed by its position; consecutive words are
    connected by a directed edge labelled with the later word's index.

    Args:
        words: Ordered sequence of words to link.

    Returns:
        A ``graphviz.Digraph`` representing the word chain.
    """
    chain = Digraph()

    previous = None
    for position, word in enumerate(words):
        chain.node(str(position), word)
        if previous is not None:
            chain.edge(str(previous), str(position), label=str(position))
        previous = position

    return chain
62
+
63
+
64
def display_relationship_graph(words):
    """Render the word-relationship chain graph in the Streamlit app.

    Args:
        words: Ordered sequence of words to visualize.
    """
    st.graphviz_chart(create_relationship_graph(words))
67
+
68
+
69
def save_text_file(text):
    """Persist *text* to a timestamped ``.txt`` file in the working directory.

    Args:
        text: Content to write.

    Returns:
        The generated file name, formatted ``YYYY-mm-dd_HH-MM-SS.txt``.
    """
    date_str = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    file_name = f"{date_str}.txt"
    # Explicit UTF-8 prevents UnicodeEncodeError/mojibake for non-ASCII text
    # (e.g. the app's emoji output) on platforms whose default locale
    # encoding is not UTF-8, such as Windows cp1252.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(text)
    return file_name
75
+
76
+
77
def extract_text_from_uploaded_files(uploaded_files):
    """Concatenate the textual content of the given uploaded files.

    Dispatches on the file-name extension: ``txt`` files are decoded,
    ``pdf`` pages are extracted with PyPDF2, and ``csv`` rows are joined
    cell-by-cell. Files with any other extension are silently skipped.

    Args:
        uploaded_files: Iterable of uploaded file objects, each with a
            ``name`` attribute and a file-like read interface.

    Returns:
        All extracted text, concatenated in upload order.
    """
    pieces = []

    for upload in uploaded_files:
        suffix = upload.name.split('.')[-1]

        if suffix == "txt":
            pieces.append(upload.read().decode())
        elif suffix == "pdf":
            reader = PdfFileReader(upload)
            for page_index in range(reader.numPages):
                pieces.append(reader.getPage(page_index).extractText())
        elif suffix == "csv":
            frame = pd.read_csv(upload)
            pieces.append('\n'.join(frame.applymap(str).agg(' '.join, axis=1)))

    return "".join(pieces)
97
+
98
+
99
# --- Streamlit page flow -------------------------------------------------
# Accept one or more txt/pdf/csv uploads; everything below only runs once
# at least one file has been provided.
uploaded_files = st.file_uploader("Choose files", type=['txt', 'pdf', 'csv'], accept_multiple_files=True)

if uploaded_files:
    # Merge all uploads into a single text blob and archive it to a
    # timestamped .txt file on disk (return value unused here).
    merged_text = extract_text_from_uploaded_files(uploaded_files)
    save_text_file(merged_text)

    # Clean transcript-style timestamps before any further processing.
    text_without_timestamps = remove_timestamps(merged_text)

    st.markdown("**Text without Timestamps:**")
    st.write(text_without_timestamps)

    # Alternating bold-heading / emoji-bullet Markdown outline.
    processed_text = process_text(text_without_timestamps)
    st.markdown("**Markdown Outline with Emojis:**")
    st.markdown(processed_text)

    # Top 10 most frequent non-stopword tokens, shown as a list and then
    # visualized as a chain graph of consecutive words.
    top_words = extract_high_information_words(text_without_timestamps, 10)
    st.markdown("**Top 10 High Information Words:**")
    st.write(top_words)

    st.markdown("**Relationship Graph:**")
    display_relationship_graph(top_words)