from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.decomposition import DictionaryLearning
from sklearn.feature_extraction.text import CountVectorizer


# Page header and a short description of what the demo does.
st.title('Dictionary Learning Demo with Streamlit')

st.write('''
This application demonstrates the concept of Dictionary Learning using the scikit-learn library.
Dictionary learning aims to find a sparse representation of the data in the form of a dictionary and a sparse matrix.
''')

# Load the text to analyze. A missing or unreadable file is reported on the
# page and the script run is halted, instead of crashing with a traceback.
try:
    with open("text_file.txt", "r", encoding="utf-8") as file:
        text_input = file.read()
except (OSError, UnicodeDecodeError) as exc:
    st.error(f"Could not read text_file.txt: {exc}")
    st.stop()

# Keep the widget's return value so that the text shown in the box is the
# text that gets analyzed, even if the user edits it. Left untouched, the
# box returns the file contents unchanged.
text_input = st.text_area("Analyzed Text:", value=text_input, height=200)

# Number of dictionary atoms to learn: range 1-20, default 10.
n_components = st.slider('Number of dictionary components', 1, 20, 10)

# Run the full analysis only when the user clicks the button.
if st.button('Analyze'):
    # Bag-of-words term counts with English stop words removed.
    # NOTE(review): the whole text is passed as a single document, so the
    # count matrix has exactly one row. Consider splitting the text into
    # sentences or paragraphs if more samples are wanted.
    vectorizer = CountVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform([text_input])
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty
        # (empty text, or text made up only of stop words).
        st.error("The text contains no terms to analyze.")
        st.stop()

    # Convert the sparse count matrix to a dense array before fitting.
    X_dense = X.toarray()

    # Learn the dictionary; only the atoms (components_) are used below, so
    # the sparse code returned by fit_transform is not needed.
    dl = DictionaryLearning(
        n_components=n_components,
        transform_algorithm='lasso_lars',
        random_state=0,
    )
    dl.fit(X_dense)
    dictionary = dl.components_

    # One row per dictionary component, one column per vocabulary term.
    feature_names = vectorizer.get_feature_names_out()
    component_labels = pd.Index(
        [f'Component {i + 1}' for i in range(dictionary.shape[0])],
        name='Component',
    )
    df_components = pd.DataFrame(
        dictionary, index=component_labels, columns=feature_names
    )

    st.markdown("### Dictionary Components")
    st.dataframe(df_components)

    # Bar chart of the 20 terms with the largest summed weight across all
    # components.
    word_counts = (
        df_components.sum(axis=0).sort_values(ascending=False).head(20)
    )
    positions = np.arange(len(word_counts))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(positions, word_counts.values)
    # Fix the tick positions before assigning labels so every label is
    # attached to its own bar.
    ax.set_xticks(positions)
    ax.set_xticklabels(word_counts.index, rotation=45, ha='right')
    ax.set_xlabel('Words/Terms')
    ax.set_ylabel('Count')
    ax.set_title('High-Use Words and Terms')
    st.pyplot(fig)
    # Close the figure so figures do not accumulate across script reruns.
    plt.close(fig)

    # Term graph: every vocabulary term is a node; two terms are linked when
    # they both have a positive weight in the same component.
    G = nx.Graph()
    G.add_nodes_from(feature_names)
    for _, weights in df_components.iterrows():
        terms = weights.index[weights > 0]
        # Each unordered pair once; the graph is undirected.
        G.add_edges_from(combinations(terms, 2))

    fig, ax = plt.subplots(figsize=(8, 8))
    # Fixed seed keeps the layout stable between reruns, matching the fixed
    # random_state used for the model above.
    pos = nx.spring_layout(G, k=0.3, seed=0)
    # Draw onto this figure's axes explicitly rather than relying on
    # pyplot's implicit "current axes".
    nx.draw_networkx_nodes(
        G, pos, ax=ax, node_size=100, node_color='lightblue', alpha=0.8
    )
    nx.draw_networkx_edges(G, pos, ax=ax, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(G, pos, ax=ax, font_size=8)
    ax.axis('off')
    st.pyplot(fig)
    plt.close(fig)