import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph
import datetime
import pandas as pd
from PyPDF2 import PdfReader

nltk.download('punkt')
# Newer NLTK releases (3.8.2+) look up the tokenizer data under 'punkt_tab',
# so fetch both; nltk.download is a no-op when the resource is already present.
nltk.download('punkt_tab')
nltk.download('stopwords')


def remove_timestamps(text):
    """Strip transcript-style timestamps such as '3:45' or '12:03' at line ends."""
    # The trailing newline is consumed as well, so the surrounding lines merge.
    return re.sub(r'\d{1,2}:\d{2}\n', '', text)
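
# Illustrative example (hypothetical transcript input):
#   remove_timestamps("intro 0:15\nnext 12:07\n") -> "intro next "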


def process_text(text):
    """Turn the non-empty lines into an alternating heading/bullet Markdown outline."""
    lines = [line for line in text.split("\n") if line]

    outline = ""
    for i, line in enumerate(lines):
        if i % 2 == 0:
            outline += f"**{line}**\n"  # even lines become bold headings
        else:
            outline += f"- {line} 😄\n"  # odd lines become emoji bullets

    return outline
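
# Illustrative example (hypothetical input):
#   process_text("Topic\nDetail\n") -> "**Topic**\n- Detail 😄\n"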


def extract_high_information_words(text, top_n=10):
    """Return the top_n most frequent alphabetic, non-stopword tokens."""
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]

    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]
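
# Illustrative example (hypothetical input; ties keep first-occurrence order):
#   extract_high_information_words("the cat sat on the cat mat", top_n=2)
#   -> ['cat', 'sat']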


def create_relationship_graph(words):
    """Build a Graphviz chain that links each word to the next."""
    graph = Digraph()

    for index, word in enumerate(words):
        graph.node(str(index), word)

        # Link each word to its predecessor, labelling the edge with its rank.
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))

    return graph
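
# Illustrative usage (hypothetical word list):
#   create_relationship_graph(["data", "model", "output"]) yields the chain
#   data -> model -> output, with edges labelled "1" and "2".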


def display_relationship_graph(words):
    """Render the word chain in the Streamlit app."""
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)


def save_text_file(text):
    """Write the merged text to a timestamped .txt file and return its name."""
    date_str = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    file_name = f"{date_str}.txt"
    # Force UTF-8 so non-ASCII text doesn't fail on platforms with other defaults.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(text)
    return file_name
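
# Illustrative example: a file saved at 2024-01-05 13:07:09 would be named
# "2024-01-05_13-07-09.txt" in the app's working directory.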


def extract_text_from_uploaded_files(uploaded_files):
    """Concatenate the text content of the uploaded .txt, .pdf, and .csv files."""
    merged_text = ""

    for uploaded_file in uploaded_files:
        extension = uploaded_file.name.split('.')[-1]

        if extension == "txt":
            merged_text += uploaded_file.read().decode()

        elif extension == "pdf":
            # PdfReader replaces the PdfFileReader API removed in PyPDF2 3.x.
            pdf = PdfReader(uploaded_file)
            for page in pdf.pages:
                merged_text += page.extract_text()

        elif extension == "csv":
            # Flatten each row into a space-separated line of cell values.
            df = pd.read_csv(uploaded_file)
            merged_text += '\n'.join(df.astype(str).agg(' '.join, axis=1))

    return merged_text
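
# Note: PDF extraction quality varies with how the PDF was produced; scanned
# documents with no embedded text layer come back empty.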


uploaded_files = st.file_uploader("Choose files", type=['txt', 'pdf', 'csv'], accept_multiple_files=True)

if uploaded_files:
    merged_text = extract_text_from_uploaded_files(uploaded_files)
    save_text_file(merged_text)  # keep a timestamped copy of the raw merged text

    text_without_timestamps = remove_timestamps(merged_text)

    st.markdown("**Text without Timestamps:**")
    st.write(text_without_timestamps)

    processed_text = process_text(text_without_timestamps)
    st.markdown("**Markdown Outline with Emojis:**")
    st.markdown(processed_text)

    top_words = extract_high_information_words(text_without_timestamps, 10)
    st.markdown("**Top 10 High Information Words:**")
    st.write(top_words)

    st.markdown("**Relationship Graph:**")
    display_relationship_graph(top_words)
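
# To run this app locally (assuming the script is saved as app.py):
#   streamlit run app.py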