import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import DictionaryLearning
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
# Title of the app
st.title('Dictionary Learning Demo with Streamlit')
# Description
st.write('''
This application demonstrates the concept of Dictionary Learning using the scikit-learn library.
Dictionary learning aims to find a sparse representation of the data in the form of a dictionary and a sparse matrix.
''')
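# Dictionary learning factorizes a data matrix X into a sparse code and a dictionary of
# atoms, so that X is approximately the sparse code multiplied by the dictionary.
# Here X will be the document-term count matrix built below.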
# Load text from file
with open("text_file.txt", "r", encoding="utf-8") as file:
    text_input = file.read()
# Text input
st.text_area("Analyzed Text:", value=text_input, height=200)
# Get user input for the number of dictionary components
n_components = st.slider('Number of dictionary components', 1, 20, 10)
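# The slider value is passed below as n_components to DictionaryLearning,
# i.e. how many dictionary atoms are learned from the text.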
if st.button('Analyze'):
    # Perform text preprocessing
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text_input])
    # Convert sparse matrix to dense numpy array
    X_dense = X.toarray()
    # Perform dictionary learning
    dl = DictionaryLearning(n_components=n_components, transform_algorithm='lasso_lars', random_state=0)
    X_transformed = dl.fit_transform(X_dense)
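    # X_transformed is the sparse code (one row per input document); components_ holds
    # the learned dictionary atoms, one row per component and one column per term.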
    dictionary = dl.components_
    # Get the feature names (terms)
    feature_names = vectorizer.get_feature_names_out()
    # Create a DataFrame with dictionary components and their corresponding terms
    df_components = pd.DataFrame(dictionary, columns=feature_names)
    df_components['Component'] = ['Component ' + str(i+1) for i in range(n_components)]
    df_components = df_components.set_index('Component')
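    # Each row of the table is one dictionary atom; each column is a vocabulary term,
    # so a cell shows how strongly that term contributes to that atom.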
    # Display the DataFrame
    st.markdown("### Dictionary Components")
    st.dataframe(df_components)
    # Plot the high-use words and terms
    fig, ax = plt.subplots(figsize=(10, 6))
    word_counts = df_components.sum(axis=0).sort_values(ascending=False)[:20]
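    # Summing over the components aggregates each term's weight across all atoms;
    # only the 20 highest-weighted terms are plotted.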
    ax.bar(word_counts.index, word_counts.values)
    ax.set_xticks(range(len(word_counts)))
    ax.set_xticklabels(word_counts.index, rotation=45, ha='right')
    ax.set_xlabel('Words/Terms')
    ax.set_ylabel('Summed component weight')
    ax.set_title('High-Use Words and Terms')
    st.pyplot(fig)
    # Create a graph of terms and their connections
    G = nx.Graph()
    # Add nodes to the graph
    for term in feature_names:
        G.add_node(term)
    # Add edges to the graph based on co-occurrence in dictionary components
    for i in range(n_components):
        terms = df_components.columns[df_components.iloc[i] > 0]
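        # Terms with a positive weight in the same component are treated as
        # co-occurring and connected pairwise.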
        for term1 in terms:
            for term2 in terms:
                if term1 != term2:
                    G.add_edge(term1, term2)
    # Plot the graph
    fig, ax = plt.subplots(figsize=(8, 8))
    pos = nx.spring_layout(G, k=0.3)
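    # k sets the optimal distance between nodes in the force-directed layout;
    # smaller values pull nodes closer together.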
    nx.draw_networkx_nodes(G, pos, node_size=100, node_color='lightblue', alpha=0.8)
    nx.draw_networkx_edges(G, pos, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(G, pos, font_size=8)
    ax.axis('off')
    st.pyplot(fig)