# Last updated: 2024-10-11
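"""Streamlit app for exploring the semantic space of two corpora.

Users upload up to two .zip archives of .txt files. Each document is embedded
with a multilingual SentenceTransformer model, the embeddings are projected
with PCA for an interactive 2D/3D Plotly plot, and the two corpora are compared
via the balanced accuracy of a logistic-regression classifier and the cosine
similarity of their mean embeddings.
"""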
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import pandas as pd
import plotly.express as px # Import plotly express
import zipfile
import os
import hashlib
import numpy as np
# Set page configuration
st.set_page_config(page_title="Analyse en composantes principales interactive des plongements sémantiques (2 corpus)", layout="wide")
# Title of the app
st.title("Exploration de l'espace sémantique (2 corpus)")
# Sidebar for uploading files
st.sidebar.header("Téléversez vos corpus")
uploaded_files = st.sidebar.file_uploader("Téléversez (upload) jusqu'à deux dossiers compressés en format zip", type="zip", accept_multiple_files=True)
# Function to extract texts and labels from zipped folders
def load_texts_from_zip(zip_file, corpus_prefix):
    texts, labels = [], []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if file_info.filename.endswith('.txt'):
                with zip_ref.open(file_info.filename) as file:
                    text = file.read().decode('utf-8')
                    texts.append(text)
                    labels.append(f'{corpus_prefix}_{os.path.basename(file_info.filename)}')
    return texts, labels
# Function to compute a hash of the uploaded files for comparison
def compute_file_hash(files):
    file_hash = hashlib.md5()
    for file in files:
        file_hash.update(file.read())
        file.seek(0)
    return file_hash.hexdigest()
# Function to determine corpus from label prefix
def determine_corpus(label, corpus_names):
    if label.startswith(f'{corpus_names[0]}_'):
        return corpus_names[0]
    elif label.startswith(f'{corpus_names[1]}_'):
        return corpus_names[1]
# Function to process the uploaded files and generate embeddings
def process_files_and_generate_embeddings(uploaded_files, model, corpus_names):
    texts_all, labels_all = [], []
    for i, zip_file in enumerate(uploaded_files):
        texts, labels = load_texts_from_zip(zip_file, corpus_names[i])
        texts_all.extend(texts)
        labels_all.extend(labels)
    # Generate embeddings
    embeddings = model.encode(texts_all)
    # Create a DataFrame with embeddings, labels, and corpus information
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df['label'] = labels_all
    embeddings_df['corpus'] = embeddings_df['label'].apply(determine_corpus, corpus_names=corpus_names)
    return embeddings_df
# Function to perform PCA on embeddings
def perform_pca(embeddings_df, n_components=3):
    pca = PCA(n_components=n_components)
    pca_components = pca.fit_transform(embeddings_df.drop(columns=['label', 'corpus']))
    pca_df = pd.DataFrame(pca_components, columns=[f'PCA{i+1}' for i in range(n_components)])
    return pd.concat([pca_df, embeddings_df[['label', 'corpus']]], axis=1)
# Function to perform logistic regression on embeddings and compute accuracy
def classify_and_report_accuracy(embeddings_df):
    unique_classes = embeddings_df['corpus'].nunique()
    # Check if there are at least two unique classes for classification
    if unique_classes < 2:
        # st.sidebar.write("Classification impossible : il n'y a qu'un seul corpus.")
        return
    # Proceed with classification if there are at least two classes
    X = embeddings_df.drop(columns=['label', 'corpus'])  # Use full embeddings
    y_gold = embeddings_df['corpus']
    # Train logistic regression model
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X, y_gold)
    # Make predictions and compute balanced accuracy (evaluated on the training data itself)
    y_pred = classifier.predict(X)
    balanced_acc = balanced_accuracy_score(y_gold, y_pred)
    st.sidebar.write(f"Classification (précision) : {balanced_acc:.2f}")
# Function to plot embeddings using Plotly (with 2D or 3D switch)
def plot_embeddings(pca_df):
    # Add a checkbox for selecting 3D plot (2D by default)
    show_3d = st.checkbox("Afficher en 3D", value=False)
    if show_3d:
        # Plot in 3D using the first three PCA components
        fig = px.scatter_3d(
            pca_df, x='PCA1', y='PCA2', z='PCA3',
            color='corpus', hover_data=['label'],
            title='Visualisation des Embeddings (3D - PCA)'
        )
    else:
        # Plot in 2D using the first two PCA components
        fig = px.scatter(
            pca_df, x='PCA1', y='PCA2', color='corpus', hover_data=['label'],
            title='Visualisation des Embeddings (2D - PCA)'
        )
    # Update layout and display the plot
    fig.update_layout(width=1200, height=800, margin=dict(l=20, r=20, t=50, b=20))
    st.plotly_chart(fig, use_container_width=True)
# Function to compute cosine similarity between two corpora
def compute_corpus_similarity(embeddings_df, corpus_names):
    unique_classes = embeddings_df['corpus'].nunique()
    # Check if there are at least two unique classes for similarity computation
    if unique_classes < 2:
        # st.sidebar.write("Calcul de similarité impossible : il n'y a qu'un seul corpus.")
        return
    # Proceed with cosine similarity calculation
    corpus_embeddings = embeddings_df.drop(columns=['label', 'corpus'])
    # Compute the mean (centroid) embedding of each corpus
    corpus1_mean = np.mean(corpus_embeddings[embeddings_df['corpus'] == corpus_names[0]].values, axis=0, keepdims=True)
    corpus2_mean = np.mean(corpus_embeddings[embeddings_df['corpus'] == corpus_names[1]].values, axis=0, keepdims=True)
    # Cosine similarity between the two corpus centroids
    similarity = cosine_similarity(corpus1_mean, corpus2_mean)[0][0]
    # Display cosine similarity
    st.sidebar.write(f"Similarité Cosine entre les deux corpus: {similarity:.2f}")
# Main logic of the app
if uploaded_files and len(uploaded_files) <= 2:
    # Get the corpus names without the .zip extension
    corpus_names = [os.path.splitext(uploaded_file.name)[0] for uploaded_file in uploaded_files]
    # Hash uploaded files and reset state if needed
    file_hash = compute_file_hash(uploaded_files)
    if 'uploaded_file_hash' not in st.session_state or st.session_state.uploaded_file_hash != file_hash:
        st.session_state.uploaded_file_hash = file_hash
        st.session_state.embeddings_df = None
    # Load model
    if 'model' not in st.session_state:
        st.session_state.model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
    # Process files and generate embeddings if they aren't already cached
    if st.session_state.embeddings_df is None:
        st.session_state.embeddings_df = process_files_and_generate_embeddings(uploaded_files, st.session_state.model, corpus_names)
    embeddings_df = st.session_state.embeddings_df
    # Get the PCA components
    pca_df = perform_pca(embeddings_df)
    # Perform classification and report accuracy
    classify_and_report_accuracy(embeddings_df)
    # Compute and display cosine similarity between corpora
    compute_corpus_similarity(embeddings_df, corpus_names)
    # Plot the embeddings
    plot_embeddings(pca_df)
else:
    st.warning("Veuillez téléverser 2 corpus sous forme de dossier compressé (.zip) de fichiers texte (.txt).")