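"""Streamlit app: unsupervised clustering of uploaded PDF documents.

Pipeline: extract text with PyMuPDF -> TF-IDF features -> KMeans clustering,
then report a silhouette score, cluster sizes, and per-cluster word clouds.

Run locally (assuming this file is saved as app.py) with:
    streamlit run app.py
Requires: streamlit, pymupdf, numpy, pandas, matplotlib, seaborn, wordcloud,
scikit-learn.
"""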
import streamlit as st
import fitz # PyMuPDF
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        text = "\n".join([page.get_text() for page in doc])
    return text
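# Note: page.get_text() returns an empty string for image-only (scanned) pages,
# since PyMuPDF does no OCR; such documents contribute no features downstream.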
# Streamlit UI
st.title("π Document Clustering App")
st.write("This app performs unsupervised clustering on uploaded PDF documents.")
# Upload PDF files
uploaded_files = st.file_uploader("Upload one or more PDF files", type=["pdf"], accept_multiple_files=True)
# Slider for number of clusters
num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3, step=1)
if uploaded_files:
    # Extract text from PDFs
    documents = [extract_text_from_pdf(file) for file in uploaded_files]

    # Convert documents to TF-IDF features
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)
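    # TF-IDF downweights terms that appear across many documents; with
    # stop_words='english', scikit-learn's built-in English stop-word list is
    # dropped before weighting, so X is a sparse (n_docs x n_terms) matrix.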
    # Apply KMeans clustering; KMeans needs at least as many samples as clusters
    if len(documents) < num_clusters:
        st.warning("Upload at least as many documents as the selected number of clusters.")
        st.stop()
    model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)  # explicit n_init avoids the scikit-learn default-change warning
    clusters = model.fit_predict(X)
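    # Optional diagnostic (a sketch, not part of the original app): the KMeans
    # centroids live in TF-IDF space, so each centroid's highest-weighted
    # dimensions are the most characteristic terms of that cluster.
    terms = vectorizer.get_feature_names_out()
    for c in range(num_clusters):
        top_idx = model.cluster_centers_[c].argsort()[::-1][:10]
        st.caption(f"Cluster {c + 1} top terms: " + ", ".join(terms[i] for i in top_idx))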
    # Model Metrics Section
    st.subheader("Model Metrics")
    # silhouette_score requires between 2 and n_samples - 1 labels, i.e.
    # strictly more documents than clusters
    if len(uploaded_files) > num_clusters:
        silhouette_avg = silhouette_score(X, clusters)
        st.write(f"**Silhouette Score:** {silhouette_avg:.3f}")
    else:
        st.write("**Silhouette Score:** N/A (need more documents than clusters)")
    # Cluster Size Distribution
    st.write("### Cluster Size Distribution")
    cluster_counts = pd.Series(clusters).value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=cluster_counts.index, y=cluster_counts.values, ax=ax, palette="viridis")
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Documents")
    ax.set_title("Cluster Size Distribution")
    st.pyplot(fig)
    # Create word clouds for each cluster
    st.subheader("Word Clouds for Each Cluster")
    for i in range(num_clusters):
        cluster_docs = [documents[j] for j in range(len(documents)) if clusters[j] == i]
        cluster_text = " ".join(cluster_docs)
        # WordCloud.generate() raises ValueError when given no usable words,
        # so skip empty or whitespace-only cluster text
        if cluster_text.strip():
            wordcloud = WordCloud(width=800, height=400, max_words=100, background_color='white').generate(cluster_text)
            st.write(f"### Cluster {i+1}")
            st.image(wordcloud.to_array())
        else:
            st.write(f"### Cluster {i+1} (No documents in this cluster)")
    # Display clustered documents
    st.subheader("Clustered Documents")
    df = pd.DataFrame({"Document": [file.name for file in uploaded_files], "Cluster": clusters})
    st.dataframe(df)