import streamlit as st
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: Binary file-like object; the bytes returned by its
            ``read()`` method are handed to PyMuPDF as an in-memory PDF.

    Returns:
        A single string with the text of each page separated by a newline.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)


# Streamlit UI
st.title("📄 Document Clustering App")
st.write("This app performs unsupervised clustering on uploaded PDF documents.")

# Upload PDF files
uploaded_files = st.file_uploader(
    "Upload one or more PDF files", type=["pdf"], accept_multiple_files=True
)

# Slider for number of clusters
num_clusters = st.slider(
    "Select Number of Clusters", min_value=2, max_value=10, value=3, step=1
)

if uploaded_files:
    # Extract text from PDFs
    documents = [extract_text_from_pdf(file) for file in uploaded_files]

    # Convert documents to TF-IDF features. fit_transform raises ValueError
    # when the vocabulary is empty (e.g. scanned PDFs with no text layer, or
    # documents made only of stop words), so report that instead of crashing.
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        X = vectorizer.fit_transform(documents)
    except ValueError:
        X = None
        st.error(
            "No usable text could be extracted from the uploaded PDFs, "
            "so they cannot be clustered."
        )

    if X is not None:
        # KMeans refuses n_clusters > n_samples, so cap the requested value
        # at the number of uploaded documents.
        effective_clusters = min(num_clusters, len(documents))
        if effective_clusters < num_clusters:
            st.warning(
                f"Only {len(documents)} document(s) uploaded; using "
                f"{effective_clusters} cluster(s) instead of {num_clusters}."
            )

        # Apply KMeans clustering
        model = KMeans(n_clusters=effective_clusters, random_state=42)
        clusters = model.fit_predict(X)

        # Labels shown to the user are 1-based everywhere (headings, chart,
        # table) so the different views agree with each other.
        display_labels = clusters + 1

        # Model Metrics Section
        st.subheader("📊 Model Metrics")
        distinct_labels = np.unique(clusters).size
        if len(documents) < 2:
            st.write("**Silhouette Score:** N/A (Need at least 2 documents)")
        elif 2 <= distinct_labels <= len(documents) - 1:
            # silhouette_score is only defined for
            # 2 <= number of labels <= number of samples - 1.
            silhouette_avg = silhouette_score(X, clusters)
            st.write(f"**Silhouette Score:** {silhouette_avg:.3f}")
        else:
            st.write(
                "**Silhouette Score:** N/A (Need at least 2 non-empty "
                "clusters and more documents than clusters)"
            )

        # Cluster Size Distribution
        st.write("### Cluster Size Distribution")
        cluster_counts = pd.Series(display_labels).value_counts().sort_index()
        fig, ax = plt.subplots()
        sns.barplot(
            x=cluster_counts.index,
            y=cluster_counts.values,
            ax=ax,
            palette="viridis",
        )
        ax.set_xlabel("Cluster")
        ax.set_ylabel("Number of Documents")
        ax.set_title("Cluster Size Distribution")
        st.pyplot(fig)
        # Release the figure; otherwise one figure per script rerun stays
        # registered with pyplot.
        plt.close(fig)

        # Create word clouds for each cluster
        st.subheader("🌥 Word Clouds for Each Cluster")
        for i in range(effective_clusters):
            cluster_docs = [
                doc for doc, label in zip(documents, clusters) if label == i
            ]
            if not cluster_docs:
                st.write(f"### Cluster {i+1} (No documents in this cluster)")
                continue

            try:
                wordcloud = WordCloud(
                    width=800,
                    height=400,
                    max_words=100,
                    background_color="white",
                ).generate(" ".join(cluster_docs))
            except ValueError:
                # WordCloud raises ValueError when the text yields no words.
                st.write(f"### Cluster {i+1} (No words to display)")
                continue

            st.write(f"### Cluster {i+1}")
            st.image(wordcloud.to_array())

        # Display clustered documents
        st.subheader("📑 Clustered Documents")
        df = pd.DataFrame(
            {
                "Document": [file.name for file in uploaded_files],
                "Cluster": display_labels,
            }
        )
        st.dataframe(df)