awacke1's picture
Update app.py
aaf590b verified
raw
history blame
3.04 kB
import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import DictionaryLearning
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
# Title of the app
st.title('Dictionary Learning Demo with Streamlit')

# Description shown under the title.
st.write('''
This application demonstrates the concept of Dictionary Learning using the scikit-learn library.
Dictionary learning aims to find a sparse representation of the data in the form of a dictionary and a sparse matrix.
''')

# Load the text to analyze from a bundled file. Fail gracefully with an
# in-app error instead of crashing the whole Streamlit script when the
# file is missing.
try:
    with open("text_file.txt", "r", encoding="utf-8") as file:
        text_input = file.read()
except FileNotFoundError:
    st.error("text_file.txt not found — place it next to app.py.")
    text_input = ""

# Display the text being analyzed. NOTE: the widget's return value is
# discarded, so user edits in the box are not captured downstream.
st.text_area("Analyzed Text:", value=text_input, height=200)

# Number of dictionary atoms to learn (min 1, max 20, default 10).
n_components = st.slider('Number of dictionary components', 1, 20, 10)
if st.button('Analyze'):
    # Vectorize the text into term counts, dropping English stop words.
    # A single document yields a 1 x vocabulary sparse matrix.
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text_input])

    # DictionaryLearning requires a dense array.
    X_dense = X.toarray()

    # Learn a sparse dictionary representation of the term-count vector.
    dl = DictionaryLearning(n_components=n_components, transform_algorithm='lasso_lars', random_state=0)
    X_transformed = dl.fit_transform(X_dense)
    dictionary = dl.components_

    # Vocabulary terms, aligned with the dictionary's columns.
    feature_names = vectorizer.get_feature_names_out()

    # One row per learned component, one column per vocabulary term.
    df_components = pd.DataFrame(dictionary, columns=feature_names)
    df_components['Component'] = ['Component ' + str(i+1) for i in range(n_components)]
    df_components = df_components.set_index('Component')

    # Display the component/term weight table.
    st.markdown("### Dictionary Components")
    st.dataframe(df_components)

    # Bar chart of the 20 terms with the largest total weight across
    # all components.
    fig, ax = plt.subplots(figsize=(10, 6))
    word_counts = df_components.sum(axis=0).sort_values(ascending=False)[:20]
    positions = range(len(word_counts))
    ax.bar(positions, word_counts.values)
    # Fix: tick positions must be set before labels — calling
    # set_xticklabels alone emits a matplotlib warning and can leave the
    # labels misaligned with the bars.
    ax.set_xticks(positions)
    ax.set_xticklabels(word_counts.index, rotation=45, ha='right')
    ax.set_xlabel('Words/Terms')
    ax.set_ylabel('Count')
    ax.set_title('High-Use Words and Terms')
    st.pyplot(fig)

    # Build a co-occurrence graph: two terms are connected when they both
    # carry a nonzero weight in the same dictionary component.
    G = nx.Graph()
    G.add_nodes_from(feature_names)
    for i in range(n_components):
        terms = df_components.columns[df_components.iloc[i] > 0]
        # Fix: iterate unordered pairs once instead of every ordered pair
        # — the original visited both (a, b) and (b, a), adding each
        # undirected edge twice.
        for j, term1 in enumerate(terms):
            for term2 in terms[j + 1:]:
                G.add_edge(term1, term2)

    # Draw the term graph with a force-directed layout.
    fig, ax = plt.subplots(figsize=(8, 8))
    pos = nx.spring_layout(G, k=0.3)
    nx.draw_networkx_nodes(G, pos, node_size=100, node_color='lightblue', alpha=0.8)
    nx.draw_networkx_edges(G, pos, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(G, pos, font_size=8)
    ax.axis('off')
    st.pyplot(fig)