import streamlit as st
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import DictionaryLearning

# Title of the app
st.title('Dictionary Learning Demo with Streamlit')

# Description
st.write('''
This application demonstrates the concept of Dictionary Learning using the
scikit-learn library. Dictionary learning aims to find a sparse representation
of the data in the form of a dictionary and a sparse code matrix.
''')

# Load the default text from a file
with open("text_file.txt", "r", encoding="utf-8") as file:
    default_text = file.read()

# Text input (capture the return value so user edits are actually analyzed)
text_input = st.text_area("Analyzed Text:", value=default_text, height=200)

# Get user input for the number of dictionary components
n_components = st.slider('Number of dictionary components', 1, 20, 10)

if st.button('Analyze'):
    # Split the text into sentences (a simple period-based heuristic) so
    # dictionary learning sees several samples; fitting on the whole text as
    # a single document would yield a degenerate one-row data matrix.
    documents = [s.strip() for s in text_input.split('.') if s.strip()]

    # Perform text preprocessing: one row per sentence, one column per term
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Convert the sparse document-term matrix to a dense numpy array,
    # since DictionaryLearning expects dense input
    X_dense = X.toarray()

    # Perform dictionary learning: learn n_components atoms (the rows of
    # components_) whose sparse combinations reconstruct the rows of X
    dl = DictionaryLearning(n_components=n_components,
                            transform_algorithm='lasso_lars',
                            random_state=0)
    X_transformed = dl.fit_transform(X_dense)
    dictionary = dl.components_

    # Get the feature names (terms)
    feature_names = vectorizer.get_feature_names_out()

    # Create a DataFrame with dictionary components and their corresponding terms
    df_components = pd.DataFrame(dictionary, columns=feature_names)
    df_components['Component'] = ['Component ' + str(i + 1) for i in range(n_components)]
    df_components = df_components.set_index('Component')

    # Display the DataFrame
    st.markdown("### Dictionary Components")
    st.dataframe(df_components)

    # Plot the 20 terms with the largest total weight across all atoms.
    # Note these are dictionary weights, not raw word counts.
    fig, ax = plt.subplots(figsize=(10, 6))
    word_weights = df_components.abs().sum(axis=0).sort_values(ascending=False)[:20]
    ax.bar(word_weights.index, word_weights.values)
    # Fix the tick positions before labeling them, to avoid matplotlib's
    # FixedLocator warning from set_xticklabels without set_xticks
    ax.set_xticks(range(len(word_weights)))
    ax.set_xticklabels(word_weights.index, rotation=45, ha='right')
    ax.set_xlabel('Words/Terms')
    ax.set_ylabel('Total dictionary weight')
    ax.set_title('High-Weight Words and Terms')
    st.pyplot(fig)

    # Create a graph of terms and their connections
    G = nx.Graph()

    # Add nodes to the graph (terms that co-occur in no component stay isolated)
    for term in feature_names:
        G.add_node(term)

    # Add edges based on co-occurrence within a dictionary component; a small
    # threshold on the absolute weight filters out numerical noise, and the
    # absolute value matters because atoms may contain negative entries
    for i in range(n_components):
        terms = df_components.columns[df_components.iloc[i].abs() > 1e-8]
        for term1 in terms:
            for term2 in terms:
                if term1 != term2:
                    G.add_edge(term1, term2)

    # Plot the graph, drawing explicitly onto the created axes
    fig, ax = plt.subplots(figsize=(8, 8))
    pos = nx.spring_layout(G, k=0.3)
    nx.draw_networkx_nodes(G, pos, ax=ax, node_size=100, node_color='lightblue', alpha=0.8)
    nx.draw_networkx_edges(G, pos, ax=ax, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(G, pos, ax=ax, font_size=8)
    ax.axis('off')
    st.pyplot(fig)
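
# Usage (a sketch, assuming this script is saved as app.py and that a
# text_file.txt with some sample text sits alongside it; the file name and
# dependency list are assumptions, not confirmed by the original source):
#
#   pip install streamlit scikit-learn pandas networkx matplotlib
#   streamlit run app.py
#
# Streamlit re-runs the whole script on each widget interaction, so the
# dictionary is re-learned every time "Analyze" is clicked with the current
# slider value and text-area contents.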