import streamlit as st
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import DictionaryLearning
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Title of the app
st.title('Dictionary Learning Demo with Streamlit')

# Description
st.write('''
    This application demonstrates the concept of Dictionary Learning using the scikit-learn library.
    Dictionary learning aims to find a sparse representation of the data in the form of a dictionary and a sparse matrix.
''')
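
# scikit-learn's DictionaryLearning factorises the data matrix X into a sparse code
# matrix U (one row per sample) and a dictionary V of n_components atoms (one row per
# atom) such that X is approximately U @ V, with an L1 penalty encouraging sparsity in U.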

# Load text from file
with open("text_file.txt", "r", encoding="utf-8") as file:
    text_input = file.read()

# Text input (editable): the analysis below runs on whatever is shown in this box
text_input = st.text_area("Analyzed Text:", value=text_input, height=200)

# Get user input for the number of dictionary components
n_components = st.slider('Number of dictionary components', 1, 20, 10)
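# Each dictionary component (atom) is a weighted combination of vocabulary terms;
# the slider controls how many such atoms are learned.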

if st.button('Analyze'):
    # Split the text into non-empty lines/paragraphs so the vectorizer yields several
    # samples; dictionary learning is degenerate when fitted on a single document
    documents = [line.strip() for line in text_input.splitlines() if line.strip()]

    # Perform text preprocessing
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Convert sparse matrix to dense numpy array
    X_dense = X.toarray()
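    # DictionaryLearning requires dense input, hence the conversion from the sparse count matrix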

    # Perform dictionary learning
    dl = DictionaryLearning(n_components=n_components, transform_algorithm='lasso_lars', random_state=0)
    X_transformed = dl.fit_transform(X_dense)
    dictionary = dl.components_
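    # dictionary has shape (n_components, n_terms): each row is one learned atom;
    # X_transformed has shape (n_documents, n_components): the sparse code for each document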

    # Get the feature names (terms)
    feature_names = vectorizer.get_feature_names_out()

    # Create a DataFrame with dictionary components and their corresponding terms
    df_components = pd.DataFrame(dictionary, columns=feature_names)
    df_components['Component'] = ['Component ' + str(i+1) for i in range(n_components)]
    df_components = df_components.set_index('Component')

    # Display the DataFrame
    st.markdown("### Dictionary Components")
    st.dataframe(df_components)

    # Plot the terms with the largest total weight across all dictionary components
    # (absolute values, since atoms can contain negative weights)
    fig, ax = plt.subplots(figsize=(10, 6))
    word_counts = df_components.abs().sum(axis=0).sort_values(ascending=False)[:20]
    ax.bar(word_counts.index, word_counts.values)
    ax.set_xticks(range(len(word_counts)))
    ax.set_xticklabels(word_counts.index, rotation=45, ha='right')
    ax.set_xlabel('Words/Terms')
    ax.set_ylabel('Total component weight')
    ax.set_title('High-Use Words and Terms')
    st.pyplot(fig)

    # Create a graph connecting terms that load on the same dictionary component
    G = nx.Graph()

    # For each component, connect the terms with the largest absolute weights.
    # Linking every pair of non-zero terms would make the graph unreadably dense
    # (and the spring layout very slow), so only the top few terms per atom are used.
    top_terms_per_component = 10
    for i in range(n_components):
        weights = df_components.iloc[i].abs().sort_values(ascending=False)
        weights = weights[weights > 0]
        top_terms = weights.index[:top_terms_per_component]
        for j, term1 in enumerate(top_terms):
            for term2 in top_terms[j + 1:]:
                G.add_edge(term1, term2)

    # Plot the graph
    fig, ax = plt.subplots(figsize=(8, 8))
    pos = nx.spring_layout(G, k=0.3)
    nx.draw_networkx_nodes(G, pos, node_size=100, node_color='lightblue', alpha=0.8)
    nx.draw_networkx_edges(G, pos, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(G, pos, font_size=8)
    ax.axis('off')
    st.pyplot(fig)

    # Generate a short summary using high-use words and terms
    summary = ' '.join(word_counts.index[:5])  # Use top 5 words/terms for summary
    st.markdown(f"### Short Summary\n```python\n{summary}\n```")
    
    # Generate glossary-style entries for each high-use word/term
    glossary = {}
    text_lower = text_input.lower()  # vectorizer terms are lowercased, so search case-insensitively
    for word in word_counts.index[:20]:
        # Find the word in the text and extract a window of surrounding characters
        start_index = text_lower.find(word)
        if start_index != -1:
            end_index = start_index + len(word)
            start_context = max(0, start_index - 20)
            end_context = min(len(text_input), end_index + 20)
            context = text_input[start_context:end_context]
            glossary[word] = context.replace('\n', ' ').strip()
    
    # Display the glossary-style sentences in markdown format
    st.markdown("### Glossary")
    for word, sentence in glossary.items():
        st.markdown(f"```python\n{word}: {sentence}\n```")
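
# To launch the demo (assuming this script is saved as app.py and text_file.txt
# sits next to it), run:
#   streamlit run app.py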