import streamlit as st
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
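
# Dependencies (pip package names): streamlit, pymupdf, numpy, pandas,
# matplotlib, seaborn, wordcloud, scikit-learn.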

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        text = "\n".join([page.get_text() for page in doc])
    return text
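
# Note: fitz.open(stream=..., filetype="pdf") reads the uploaded file's bytes
# in memory (no temporary file on disk), and page.get_text() returns each
# page's plain-text layer.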

# Streamlit UI
st.title("Document Clustering App")
st.write("This app performs unsupervised clustering on uploaded PDF documents.")

# Upload PDF files
uploaded_files = st.file_uploader("Upload one or more PDF files", type=["pdf"], accept_multiple_files=True)

# Slider for number of clusters
num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3, step=1)

if uploaded_files:
    # Extract text from PDFs
    documents = [extract_text_from_pdf(file) for file in uploaded_files]

    # Convert documents to TF-IDF features
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Apply KMeans clustering (clamp the cluster count so it never exceeds
    # the number of documents, which would make KMeans raise an error)
    num_clusters = min(num_clusters, len(documents))
    model = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = model.fit_predict(X)
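
    # X is a sparse (n_documents x n_vocabulary_terms) TF-IDF matrix;
    # fit_predict returns one integer cluster label (0 .. num_clusters-1)
    # per document.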

    # Model Metrics Section
    st.subheader("Model Metrics")
    # silhouette_score is only defined for 2 <= n_clusters <= n_documents - 1
    if 1 < num_clusters < len(documents):
        silhouette_avg = silhouette_score(X, clusters)
        st.write(f"**Silhouette Score:** {silhouette_avg:.3f}")
    else:
        st.write("**Silhouette Score:** N/A (needs more documents than clusters)")

    # Cluster Size Distribution
    st.write("### Cluster Size Distribution")
    cluster_counts = pd.Series(clusters).value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=cluster_counts.index, y=cluster_counts.values, ax=ax, palette="viridis")
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Documents")
    ax.set_title("Cluster Size Distribution")
    st.pyplot(fig)

    # Create word clouds for each cluster
    st.subheader("Word Clouds for Each Cluster")
    for i in range(num_clusters):
        cluster_docs = [documents[j] for j in range(len(documents)) if clusters[j] == i]
        cluster_text = " ".join(cluster_docs)
        if cluster_text:
            wordcloud = WordCloud(width=800, height=400, max_words=100, background_color='white').generate(cluster_text)
            st.write(f"### Cluster {i+1}")
            st.image(wordcloud.to_array())
        else:
            st.write(f"### Cluster {i+1} (No documents in this cluster)")

    # Display clustered documents
    st.subheader("Clustered Documents")
    df = pd.DataFrame({"Document": [file.name for file in uploaded_files], "Cluster": clusters})
    st.dataframe(df)
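
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py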