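"""Streamlit text-processing app.

Paste text into the text area and press "Process Text": delimited input is
converted to an Excel file, prose is split into sentences and summarized with
LDA topic modeling and a word-frequency chart, and anything else is saved as a
plain .txt file. Run with ``streamlit run`` on this file.
"""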
import streamlit as st
import pandas as pd
import datetime
import io
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

nltk.download('punkt', quiet=True)  # sentence tokenizer model required by sent_tokenize

def save_text_as_file(text, file_type):
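    """Write the pasted text to a timestamped file with the given extension."""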
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    file_name = f"text_file_{current_time}.{file_type}"
    with open(file_name, "w") as file:
        file.write(text)
    st.success(f"Text saved as {file_name}")

def save_csv_as_excel(text):
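    """Parse the pasted text as delimited data and export it to an .xlsx file."""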
    try:
        # Match the delimiter check in main: prefer tabs when present, else commas.
        sep = "\t" if "\t" in text else ","
        df = pd.read_csv(io.StringIO(text), header=None, sep=sep)
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        # Treat the first row as a header when every value in it is a string.
        if df.iloc[0].apply(lambda value: isinstance(value, str)).all():
            df = pd.read_csv(io.StringIO(text), header=0, sep=sep)
            file_name = f"csv_with_header_{timestamp}.xlsx"
            df.to_excel(file_name, index=False)
        else:
            file_name = f"csv_without_header_{timestamp}.xlsx"
            df.to_excel(file_name, index=False, header=False)
        st.success(f"CSV data saved as {file_name}")
        st.dataframe(df)
    except pd.errors.EmptyDataError:
        st.error("The pasted text does not contain valid CSV data.")
    except pd.errors.ParserError as e:
        st.error(f"Error parsing CSV data: {e}")
        st.info("Please ensure that the pasted text is in a valid CSV format.")
        
def split_sentences(text):
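    """Return the text with one sentence per line."""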
    sentences = sent_tokenize(text)
    return "\n".join(sentences)

def perform_nlp(text):
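    """Show LDA topics and a word-frequency chart for the pasted text."""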
    sentences = sent_tokenize(text)

    # Topic Modeling: fit LDA on a bag-of-words representation of the sentences
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    lda = LatentDirichletAllocation(n_components=3, random_state=42)
    lda.fit(X)

    # Display the top five words per topic. lda.components_ holds the
    # topic-word weights (one row per topic), which is what we need here;
    # lda.transform(X) would give per-sentence topic mixtures instead.
    st.subheader("Topic Modeling")
    feature_names = vectorizer.get_feature_names_out()
    for i, topic in enumerate(lda.components_):
        st.write(f"Topic {i+1}:")
        topic_words = ", ".join(feature_names[j] for j in topic.argsort()[:-6:-1])
        st.write(topic_words)

    # Word Frequency: the ten most common whitespace-delimited tokens
    word_freq = pd.Series(" ".join(sentences).split()).value_counts()[:10]
    st.subheader("Word Frequency")
    st.bar_chart(word_freq)

def main():
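    """Render the UI and route the pasted text to the appropriate handler."""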
    st.title("AI UI for Text Processing")
    
    text_input = st.text_area("Paste your text here")
    if st.button("Process Text"):
        if text_input.strip() == "":
            st.warning("Please paste some text.")
        else:
            # Commas or tabs suggest tabular data -> convert to Excel
            if "," in text_input or "\t" in text_input:
                save_csv_as_excel(text_input)
            # Sentence punctuation suggests prose -> split sentences and run NLP
            elif "." in text_input or "!" in text_input or "?" in text_input:
                sentences = split_sentences(text_input)
                st.subheader("Sentences")
                st.write(sentences)
                perform_nlp(text_input)
            # Otherwise just save the raw text to a .txt file
            else:
                save_text_as_file(text_input, "txt")

if __name__ == "__main__":
    main()