File size: 2,840 Bytes
14ae033
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import streamlit as st
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def extract_text_from_pdf(pdf_file):
    """Return all text from an uploaded PDF file-like object.

    The entire stream is read into memory and parsed with PyMuPDF;
    page texts are joined with newline separators.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_doc:
        page_texts = (page.get_text() for page in pdf_doc)
        combined = "\n".join(page_texts)
    return combined

# Streamlit UI
st.title("📄 Document Clustering App")
st.write("This app performs unsupervised clustering on uploaded PDF documents.")

# Upload PDF files
uploaded_files = st.file_uploader("Upload one or more PDF files", type=["pdf"], accept_multiple_files=True)

# Slider for number of clusters
num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3, step=1)

if uploaded_files:
    # Extract text from PDFs
    documents = [extract_text_from_pdf(file) for file in uploaded_files]

    # KMeans requires n_clusters <= n_samples: clamp the slider value so a
    # small upload (fewer documents than requested clusters) doesn't crash.
    effective_clusters = min(num_clusters, len(documents))
    if effective_clusters < num_clusters:
        st.warning(
            f"Only {len(documents)} document(s) uploaded; "
            f"using {effective_clusters} cluster(s) instead of {num_clusters}."
        )

    # Convert documents to TF-IDF features
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Apply KMeans clustering
    model = KMeans(n_clusters=effective_clusters, random_state=42)
    clusters = model.fit_predict(X)

    # Model Metrics Section
    st.subheader("📊 Model Metrics")
    # silhouette_score requires 2 <= n_labels <= n_samples - 1; the previous
    # `len(uploaded_files) > 1` guard raised ValueError for, e.g., 2 docs with
    # 2 clusters, so check the actual label/sample constraint instead.
    n_labels = len(set(clusters))
    if 2 <= n_labels <= len(documents) - 1:
        silhouette_avg = silhouette_score(X, clusters)
        st.write(f"**Silhouette Score:** {silhouette_avg:.3f}")
    else:
        st.write("**Silhouette Score:** N/A (need at least one more document than clusters)")

    # Cluster Size Distribution
    st.write("### Cluster Size Distribution")
    cluster_counts = pd.Series(clusters).value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=cluster_counts.index, y=cluster_counts.values, ax=ax, palette="viridis")
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Documents")
    ax.set_title("Cluster Size Distribution")
    st.pyplot(fig)

    # Create word clouds for each cluster (iterate the clamped cluster count,
    # not the raw slider value, so the loop matches the fitted model).
    st.subheader("🌥 Word Clouds for Each Cluster")
    for i in range(effective_clusters):
        cluster_docs = [doc for doc, label in zip(documents, clusters) if label == i]
        cluster_text = " ".join(cluster_docs)

        if cluster_text:
            wordcloud = WordCloud(width=800, height=400, max_words=100, background_color='white').generate(cluster_text)
            st.write(f"### Cluster {i+1}")
            st.image(wordcloud.to_array())
        else:
            st.write(f"### Cluster {i+1} (No documents in this cluster)")

    # Display clustered documents
    st.subheader("📑 Clustered Documents")
    df = pd.DataFrame({"Document": [file.name for file in uploaded_files], "Cluster": clusters})
    st.dataframe(df)