# Hugging Face Space: interactive PCA exploration of sentence embeddings (2 corpora)
# Last updated: 2024-10-11
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import pandas as pd
import plotly.express as px
import zipfile
import os
import hashlib
import numpy as np
# Set page configuration | |
st.set_page_config(page_title="Analyse en composantes principales interactive des plongements sémantiques (2 corpus)", layout="wide") | |
# Title of the app | |
st.title("Exploration de l'espace sémantique (2 corpus)") | |
# Sidebar for uploading files | |
st.sidebar.header("Téléversez vos corpus") | |
uploaded_files = st.sidebar.file_uploader("Téléversez (upload) jusqu'à deux dossiers compressés en format zip", type="zip", accept_multiple_files=True) | |
def load_texts_from_zip(zip_file, corpus_prefix):
    """Extract the text and a corpus-qualified label for every .txt member of a zip.

    Args:
        zip_file: path or file-like object accepted by ``zipfile.ZipFile``.
        corpus_prefix: prepended (with ``_``) to each file's basename to build
            a label that records which corpus the document came from.

    Returns:
        (texts, labels): parallel lists of decoded file contents and labels.

    Fixes over the original: directories and macOS metadata entries
    (``__MACOSX/`` folders, ``._*`` resource forks) are skipped instead of
    being ingested as documents, and a non-UTF-8 file no longer crashes the
    whole upload (bad bytes are replaced).
    """
    texts, labels = [], []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if file_info.is_dir():
                continue
            if file_info.filename.startswith('__MACOSX/'):
                continue
            basename = os.path.basename(file_info.filename)
            # Only plain .txt documents; '._*' are AppleDouble metadata files.
            if not basename.endswith('.txt') or basename.startswith('._'):
                continue
            with zip_ref.open(file_info.filename) as file:
                # errors='replace': best-effort decoding rather than a hard crash.
                text = file.read().decode('utf-8', errors='replace')
            texts.append(text)
            labels.append(f'{corpus_prefix}_{basename}')
    return texts, labels
def compute_file_hash(files):
    """Return one MD5 hex digest over the concatenated contents of all files.

    Used purely for change detection between Streamlit reruns (not for
    security), so MD5 is acceptable. Each file's cursor is rewound after
    reading so callers can consume the uploads again afterwards.
    """
    digest = hashlib.md5()
    for uploaded in files:
        digest.update(uploaded.read())
        uploaded.seek(0)  # leave the stream readable for later processing
    return digest.hexdigest()
def determine_corpus(label, corpus_names):
    """Return the corpus a label belongs to, based on its '<corpus>_' prefix.

    Labels are built as '<corpus>_<filename>' by load_texts_from_zip.
    Returns None when no corpus prefix matches.

    Fix over the original: the old code indexed ``corpus_names[1]``
    unconditionally, which raised IndexError whenever only one corpus was
    uploaded (the app accepts one or two zips). Iterating handles any
    number of corpora while matching the original's first-match-wins order.
    """
    for corpus_name in corpus_names:
        if label.startswith(f'{corpus_name}_'):
            return corpus_name
    return None
def process_files_and_generate_embeddings(uploaded_files, model, corpus_names):
    """Read every uploaded zip, embed all texts, and return a single DataFrame.

    One row per document: the raw embedding dimensions, plus a 'label'
    column (corpus-qualified filename) and a 'corpus' column identifying
    which upload the document came from.
    """
    texts_all, labels_all = [], []
    # corpus_names is derived from uploaded_files, so the lists are parallel.
    for corpus_name, zip_file in zip(corpus_names, uploaded_files):
        texts, labels = load_texts_from_zip(zip_file, corpus_name)
        texts_all += texts
        labels_all += labels

    # Embed everything in one batch, then attach the metadata columns.
    embeddings_df = pd.DataFrame(model.encode(texts_all))
    embeddings_df['label'] = labels_all
    embeddings_df['corpus'] = embeddings_df['label'].apply(
        determine_corpus, corpus_names=corpus_names
    )
    return embeddings_df
def perform_pca(embeddings_df, n_components=3):
    """Project the embedding columns onto the first principal components.

    Returns a DataFrame with columns PCA1..PCAn followed by the original
    'label' and 'corpus' columns (indices align because both frames use
    fresh 0..N-1 range indices).
    """
    features = embeddings_df.drop(columns=['label', 'corpus'])
    components = PCA(n_components=n_components).fit_transform(features)
    column_names = [f'PCA{i+1}' for i in range(n_components)]
    pca_df = pd.DataFrame(components, columns=column_names)
    return pd.concat([pca_df, embeddings_df[['label', 'corpus']]], axis=1)
def classify_and_report_accuracy(embeddings_df):
    """Fit a logistic regression separating the corpora and show its accuracy.

    NOTE(review): the model is trained and scored on the same data (no
    held-out split), so the reported balanced accuracy is a separability
    indicator, not a generalization estimate. Does nothing (silently) when
    fewer than two corpora are present.
    """
    # At least two classes are required for classification.
    if embeddings_df['corpus'].nunique() < 2:
        return

    features = embeddings_df.drop(columns=['label', 'corpus'])  # full embeddings
    gold = embeddings_df['corpus']

    model = LogisticRegression(max_iter=1000)
    model.fit(features, gold)

    score = balanced_accuracy_score(gold, model.predict(features))
    st.sidebar.write(f"Classification (précision) : {score:.2f}")
def plot_embeddings(pca_df):
    """Render the PCA-projected embeddings as an interactive Plotly scatter.

    A checkbox switches between a 2D view (PCA1/PCA2, default) and a 3D
    view (PCA1/PCA2/PCA3); points are colored by corpus with the document
    label shown on hover.
    """
    show_3d = st.checkbox("Afficher en 3D", value=False)

    # Shared aesthetics for both projections.
    common_kwargs = dict(color='corpus', hover_data=['label'])
    if show_3d:
        fig = px.scatter_3d(
            pca_df, x='PCA1', y='PCA2', z='PCA3',
            title='Visualisation des Embeddings (3D - PCA)',
            **common_kwargs,
        )
    else:
        fig = px.scatter(
            pca_df, x='PCA1', y='PCA2',
            title='Visualisation des Embeddings (2D - PCA)',
            **common_kwargs,
        )

    fig.update_layout(width=1200, height=800, margin=dict(l=20, r=20, t=50, b=20))
    st.plotly_chart(fig, use_container_width=True)
def compute_corpus_similarity(embeddings_df, corpus_names):
    """Display the cosine similarity between the two corpora's mean embeddings.

    Bug fix: the original took ``cosine_similarity(corpus1, corpus2)[0][0]``
    over the full document-by-document matrix, which compares only the FIRST
    document of each corpus. Its own comment said "Compute mean embeddings
    for each corpus", so we now average each corpus's embeddings into a
    centroid and compare the centroids.

    Does nothing (silently) when fewer than two corpora are present.
    """
    # Similarity between corpora is undefined with a single corpus.
    if embeddings_df['corpus'].nunique() < 2:
        return

    corpus_embeddings = embeddings_df.drop(columns=['label', 'corpus'])
    # Centroid (mean embedding) per corpus, kept 2-D for cosine_similarity.
    centroid1 = corpus_embeddings[embeddings_df['corpus'] == corpus_names[0]].values.mean(axis=0, keepdims=True)
    centroid2 = corpus_embeddings[embeddings_df['corpus'] == corpus_names[1]].values.mean(axis=0, keepdims=True)
    similarity = cosine_similarity(centroid1, centroid2)[0][0]

    st.sidebar.write(f"Similarité Cosine entre les deux corpus: {similarity:.2f}")
# ---- Main application flow ----
if uploaded_files and len(uploaded_files) <= 2:
    # Corpus names are the archive names without their .zip extension.
    corpus_names = [os.path.splitext(f.name)[0] for f in uploaded_files]

    # Detect a new upload by hashing the files; invalidate cached embeddings
    # whenever the hash changes (or on the very first run).
    file_hash = compute_file_hash(uploaded_files)
    if st.session_state.get('uploaded_file_hash') != file_hash:
        st.session_state.uploaded_file_hash = file_hash
        st.session_state.embeddings_df = None

    # The sentence-transformer model is loaded once per session.
    if 'model' not in st.session_state:
        st.session_state.model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

    # Embeddings are recomputed only when the uploads changed.
    if st.session_state.embeddings_df is None:
        st.session_state.embeddings_df = process_files_and_generate_embeddings(
            uploaded_files, st.session_state.model, corpus_names
        )
    embeddings_df = st.session_state.embeddings_df

    # Project, classify, compare, and plot.
    pca_df = perform_pca(embeddings_df)
    classify_and_report_accuracy(embeddings_df)
    compute_corpus_similarity(embeddings_df, corpus_names)
    plot_embeddings(pca_df)
else:
    st.warning("Veuillez téléverser 2 corpus sous forme de dossier compressé (.zip) de fichiers texte (.txt).")