import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import DictionaryLearning
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Title of the app
st.title('Dictionary Learning Demo with Streamlit')

# Description
st.write('''
This application demonstrates the concept of Dictionary Learning using the
scikit-learn library. Dictionary learning finds a dictionary of basis vectors
(atoms) and a sparse code such that the data is approximated by sparse
combinations of those atoms.
''')

# Load text from file
with open("text_file.txt", "r", encoding="utf-8") as file:
    text_input = file.read()

# Text input (assign the widget's return value so edits made in the text area are analyzed)
text_input = st.text_area("Analyzed Text:", value=text_input, height=200)

# Get user input for the number of dictionary components
n_components = st.slider('Number of dictionary components', 1, 20, 10)

if st.button('Analyze'):
    # Perform text preprocessing: split the text into non-empty lines so that
    # dictionary learning has several samples to decompose rather than a single row
    documents = [line for line in text_input.split('\n') if line.strip()]
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Convert sparse matrix to dense numpy array
    X_dense = X.toarray()

    # Perform dictionary learning: X_dense (n_documents, n_terms) is approximated by
    # X_transformed (n_documents, n_components) @ dictionary (n_components, n_terms)
    dl = DictionaryLearning(n_components=n_components,
                            transform_algorithm='lasso_lars',
                            random_state=0)
    X_transformed = dl.fit_transform(X_dense)
    dictionary = dl.components_

    # Get the feature names (terms)
    feature_names = vectorizer.get_feature_names_out()

    # Create a DataFrame with dictionary components and their corresponding terms
    df_components = pd.DataFrame(dictionary, columns=feature_names)
    df_components['Component'] = ['Component ' + str(i + 1) for i in range(n_components)]
    df_components = df_components.set_index('Component')

    # Display the DataFrame
    st.markdown("### Dictionary Components")
    st.dataframe(df_components)

    # Plot the high-use words and terms
    fig, ax = plt.subplots(figsize=(10, 6))
    word_counts = df_components.sum(axis=0).sort_values(ascending=False)[:20]
    ax.bar(word_counts.index, word_counts.values)
    ax.set_xticks(range(len(word_counts)))
    ax.set_xticklabels(word_counts.index, rotation=45, ha='right')
    ax.set_xlabel('Words/Terms')
    ax.set_ylabel('Aggregate dictionary weight')
    ax.set_title('High-Use Words and Terms')
    st.pyplot(fig)

    # Create a graph of terms and their connections
    G = nx.Graph()

    # Add nodes to the graph
    for term in feature_names:
        G.add_node(term)

    # Add edges to the graph based on co-occurrence in dictionary components
    for i in range(n_components):
        terms = df_components.columns[df_components.iloc[i] > 0]
        for term1 in terms:
            for term2 in terms:
                if term1 != term2:
                    G.add_edge(term1, term2)

    # Plot the graph
    fig, ax = plt.subplots(figsize=(8, 8))
    pos = nx.spring_layout(G, k=0.3)
    nx.draw_networkx_nodes(G, pos, node_size=100, node_color='lightblue', alpha=0.8, ax=ax)
    nx.draw_networkx_edges(G, pos, edge_color='gray', alpha=0.5, ax=ax)
    nx.draw_networkx_labels(G, pos, font_size=8, ax=ax)
    ax.axis('off')
    st.pyplot(fig)

    # Generate a short summary using high-use words and terms
    summary = ' '.join(word_counts.index[:5])  # Use top 5 words/terms for summary
    st.markdown(f"### Short Summary\n```python\n{summary}\n```")

    # Generate glossary-style sentences for each high-use word/term
    glossary = {}
    for word in word_counts.index[:20]:
        # Find the word in the text (case-insensitively, since CountVectorizer
        # lowercases terms) and extract surrounding words
        start_index = text_input.lower().find(word)
        if start_index != -1:
            end_index = start_index + len(word)
            start_context = max(0, start_index - 20)
            end_context = min(len(text_input), end_index + 20)
            context = (text_input[start_context:start_index] + ' ' + word + ' '
                       + text_input[end_index:end_context])
            glossary[word] = context.replace('\n', ' ').strip()

    # Display the glossary-style sentences in markdown format
    st.markdown("### Glossary")
    for word, sentence in glossary.items():
        st.markdown(f"```python\n{word}: {sentence}\n```")
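# Usage note (a sketch of assumptions, not part of the original script): the app
# expects a plain-text file named text_file.txt next to this script, and the file
# name app.py below is only an assumed save location. With those assumptions, the
# demo can be launched locally with Streamlit's CLI:
#
#   streamlit run app.py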