awacke1's picture
Update app.py
aaf590b verified
raw
history blame
3.04 kB
import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import DictionaryLearning
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
# Title of the app
st.title('Dictionary Learning Demo with Streamlit')

# Description shown under the title.
st.write('''
This application demonstrates the concept of Dictionary Learning using the scikit-learn library.
Dictionary learning aims to find a sparse representation of the data in the form of a dictionary and a sparse matrix.
''')

# Load the text to analyze from a bundled file. Fail gracefully with an
# in-app error instead of crashing the whole Streamlit script when the
# file is missing.
try:
    with open("text_file.txt", "r", encoding="utf-8") as file:
        text_input = file.read()
except FileNotFoundError:
    st.error("text_file.txt not found — place it next to app.py.")
    text_input = ""

# Display the text being analyzed. NOTE: the widget's return value is
# discarded, so user edits in the box are not captured downstream.
st.text_area("Analyzed Text:", value=text_input, height=200)

# Number of dictionary atoms to learn (min 1, max 20, default 10).
n_components = st.slider('Number of dictionary components', 1, 20, 10)
if st.button('Analyze'):
    # Vectorize the text into term counts, dropping English stop words.
    # A single document yields a 1 x vocabulary sparse matrix.
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text_input])

    # DictionaryLearning requires a dense array.
    X_dense = X.toarray()

    # Learn a sparse dictionary representation of the term-count vector.
    dl = DictionaryLearning(n_components=n_components, transform_algorithm='lasso_lars', random_state=0)
    X_transformed = dl.fit_transform(X_dense)
    dictionary = dl.components_

    # Vocabulary terms, aligned with the dictionary's columns.
    feature_names = vectorizer.get_feature_names_out()

    # One row per learned component, one column per vocabulary term.
    df_components = pd.DataFrame(dictionary, columns=feature_names)
    df_components['Component'] = ['Component ' + str(i+1) for i in range(n_components)]
    df_components = df_components.set_index('Component')

    # Display the component/term weight table.
    st.markdown("### Dictionary Components")
    st.dataframe(df_components)

    # Bar chart of the 20 terms with the largest total weight across
    # all components.
    fig, ax = plt.subplots(figsize=(10, 6))
    word_counts = df_components.sum(axis=0).sort_values(ascending=False)[:20]
    positions = range(len(word_counts))
    ax.bar(positions, word_counts.values)
    # Fix: tick positions must be set before labels — calling
    # set_xticklabels alone emits a matplotlib warning and can leave the
    # labels misaligned with the bars.
    ax.set_xticks(positions)
    ax.set_xticklabels(word_counts.index, rotation=45, ha='right')
    ax.set_xlabel('Words/Terms')
    ax.set_ylabel('Count')
    ax.set_title('High-Use Words and Terms')
    st.pyplot(fig)

    # Build a co-occurrence graph: two terms are connected when they both
    # carry a nonzero weight in the same dictionary component.
    G = nx.Graph()
    G.add_nodes_from(feature_names)
    for i in range(n_components):
        terms = df_components.columns[df_components.iloc[i] > 0]
        # Fix: iterate unordered pairs once instead of every ordered pair
        # — the original visited both (a, b) and (b, a), adding each
        # undirected edge twice.
        for j, term1 in enumerate(terms):
            for term2 in terms[j + 1:]:
                G.add_edge(term1, term2)

    # Draw the term graph with a force-directed layout.
    fig, ax = plt.subplots(figsize=(8, 8))
    pos = nx.spring_layout(G, k=0.3)
    nx.draw_networkx_nodes(G, pos, node_size=100, node_color='lightblue', alpha=0.8)
    nx.draw_networkx_edges(G, pos, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(G, pos, font_size=8)
    ax.axis('off')
    st.pyplot(fig)