import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import DictionaryLearning
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Title of the app
st.title('Dictionary Learning Demo with Streamlit')

# Description
st.write('''
This application demonstrates the concept of Dictionary Learning using the
scikit-learn library. Dictionary learning finds a dictionary of basis vectors
(atoms) and a sparse code such that the data is approximated by sparse
combinations of those atoms.
''')

# Load text from file
with open("text_file.txt", "r", encoding="utf-8") as file:
    text_input = file.read()

# Text input (assign the widget's return value so edits made in the text area are analyzed)
text_input = st.text_area("Analyzed Text:", value=text_input, height=200)

# Get user input for the number of dictionary components
n_components = st.slider('Number of dictionary components', 1, 20, 10)

if st.button('Analyze'):
    # Perform text preprocessing: split the text into non-empty lines so that
    # dictionary learning has several samples to decompose rather than a single row
    documents = [line for line in text_input.split('\n') if line.strip()]
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Convert sparse matrix to dense numpy array
    X_dense = X.toarray()

    # Perform dictionary learning: X_dense (n_documents, n_terms) is approximated by
    # X_transformed (n_documents, n_components) @ dictionary (n_components, n_terms)
    dl = DictionaryLearning(n_components=n_components,
                            transform_algorithm='lasso_lars',
                            random_state=0)
    X_transformed = dl.fit_transform(X_dense)
    dictionary = dl.components_

    # Get the feature names (terms)
    feature_names = vectorizer.get_feature_names_out()

    # Create a DataFrame with dictionary components and their corresponding terms
    df_components = pd.DataFrame(dictionary, columns=feature_names)
    df_components['Component'] = ['Component ' + str(i + 1) for i in range(n_components)]
    df_components = df_components.set_index('Component')

    # Display the DataFrame
    st.markdown("### Dictionary Components")
    st.dataframe(df_components)

    # Plot the high-use words and terms
    fig, ax = plt.subplots(figsize=(10, 6))
    word_counts = df_components.sum(axis=0).sort_values(ascending=False)[:20]
    ax.bar(word_counts.index, word_counts.values)
    ax.set_xticks(range(len(word_counts)))
    ax.set_xticklabels(word_counts.index, rotation=45, ha='right')
    ax.set_xlabel('Words/Terms')
    ax.set_ylabel('Aggregate dictionary weight')
    ax.set_title('High-Use Words and Terms')
    st.pyplot(fig)

    # Create a graph of terms and their connections
    G = nx.Graph()

    # Add nodes to the graph
    for term in feature_names:
        G.add_node(term)

    # Add edges to the graph based on co-occurrence in dictionary components
    for i in range(n_components):
        terms = df_components.columns[df_components.iloc[i] > 0]
        for term1 in terms:
            for term2 in terms:
                if term1 != term2:
                    G.add_edge(term1, term2)

    # Plot the graph
    fig, ax = plt.subplots(figsize=(8, 8))
    pos = nx.spring_layout(G, k=0.3)
    nx.draw_networkx_nodes(G, pos, node_size=100, node_color='lightblue', alpha=0.8, ax=ax)
    nx.draw_networkx_edges(G, pos, edge_color='gray', alpha=0.5, ax=ax)
    nx.draw_networkx_labels(G, pos, font_size=8, ax=ax)
    ax.axis('off')
    st.pyplot(fig)

    # Generate a short summary using high-use words and terms
    summary = ' '.join(word_counts.index[:5])  # Use top 5 words/terms for summary
    st.markdown(f"### Short Summary\n```python\n{summary}\n```")

    # Generate glossary-style sentences for each high-use word/term
    glossary = {}
    for word in word_counts.index[:20]:
        # Find the word in the text (case-insensitively, since CountVectorizer
        # lowercases terms) and extract surrounding words
        start_index = text_input.lower().find(word)
        if start_index != -1:
            end_index = start_index + len(word)
            start_context = max(0, start_index - 20)
            end_context = min(len(text_input), end_index + 20)
            context = (text_input[start_context:start_index] + ' ' + word + ' '
                       + text_input[end_index:end_context])
            glossary[word] = context.replace('\n', ' ').strip()

    # Display the glossary-style sentences in markdown format
    st.markdown("### Glossary")
    for word, sentence in glossary.items():
        st.markdown(f"```python\n{word}: {sentence}\n```")
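# Usage note (a sketch of assumptions, not part of the original script): the app
# expects a plain-text file named text_file.txt next to this script, and the file
# name app.py below is only an assumed save location. With those assumptions, the
# demo can be launched locally with Streamlit's CLI:
#
#   streamlit run app.py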