# Last updated: 2024-10-11
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import pandas as pd
import plotly.express as px  # Import plotly express
import zipfile
import os
import hashlib
import numpy as np

# Set page configuration
st.set_page_config(
    page_title="Analyse en composantes principales interactive des plongements sémantiques (2 corpus)",
    layout="wide",
)

# Title of the app
st.title("Exploration de l'espace sémantique (2 corpus)")

# Sidebar for uploading files
st.sidebar.header("Téléversez vos corpus")
uploaded_files = st.sidebar.file_uploader(
    "Téléversez (upload) jusqu'à deux dossiers compressés en format zip",
    type="zip",
    accept_multiple_files=True,
)


def load_texts_from_zip(zip_file, corpus_prefix):
    """Extract every .txt file from a zipped folder.

    Returns (texts, labels), where each label is
    '<corpus_prefix>_<basename>' so the corpus of origin can later be
    recovered from the label alone (see determine_corpus).
    """
    texts, labels = [], []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if file_info.filename.endswith('.txt'):
                with zip_ref.open(file_info.filename) as file:
                    text = file.read().decode('utf-8')
                texts.append(text)
                labels.append(f'{corpus_prefix}_{os.path.basename(file_info.filename)}')
    return texts, labels


def compute_file_hash(files):
    """Compute a single MD5 digest over all uploaded files.

    Used only as a change-detection key for Streamlit session state,
    not for security. Each file handle is rewound after reading so the
    files remain consumable downstream.
    """
    file_hash = hashlib.md5()
    for file in files:
        file_hash.update(file.read())
        file.seek(0)  # rewind so the zip can still be opened later
    return file_hash.hexdigest()


def determine_corpus(label, corpus_names):
    """Return the corpus name whose prefix matches *label*.

    Labels are built as '<corpus>_<filename>' by load_texts_from_zip.
    Iterating over corpus_names (instead of indexing [0] and [1])
    avoids an IndexError when only one corpus was uploaded and keeps
    the original first-match precedence. Returns None when no prefix
    matches, mirroring the original's implicit fall-through.
    """
    for name in corpus_names:
        if label.startswith(f'{name}_'):
            return name
    return None


def process_files_and_generate_embeddings(uploaded_files, model, corpus_names):
    """Load all texts from the uploaded zips and embed them.

    Returns a DataFrame with one row per document: the embedding
    dimensions as numeric columns plus 'label' and 'corpus' columns.
    """
    texts_all, labels_all = [], []
    for i, zip_file in enumerate(uploaded_files):
        texts, labels = load_texts_from_zip(zip_file, corpus_names[i])
        texts_all.extend(texts)
        labels_all.extend(labels)

    # Generate embeddings
    embeddings = model.encode(texts_all)

    # Create a DataFrame with embeddings, labels, and corpus information
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df['label'] = labels_all
    embeddings_df['corpus'] = embeddings_df['label'].apply(determine_corpus, corpus_names=corpus_names)
    return embeddings_df


def perform_pca(embeddings_df, n_components=3):
    """Project the embeddings onto their first principal components.

    n_components is clamped to min(n_samples, n_features): PCA raises
    otherwise, e.g. when a corpus contains fewer than 3 documents.
    Returns a DataFrame with PCA1..PCAk columns plus 'label'/'corpus'.
    """
    features = embeddings_df.drop(columns=['label', 'corpus'])
    n_components = min(n_components, features.shape[0], features.shape[1])
    pca = PCA(n_components=n_components)
    pca_components = pca.fit_transform(features)
    pca_df = pd.DataFrame(pca_components, columns=[f'PCA{i+1}' for i in range(n_components)])
    return pd.concat([pca_df, embeddings_df[['label', 'corpus']]], axis=1)


def classify_and_report_accuracy(embeddings_df):
    """Fit a logistic regression on the full embeddings and report
    balanced accuracy in the sidebar.

    NOTE(review): the model is evaluated on its training data, so the
    score is optimistic — it is used here as a rough separability
    indicator, not a generalization estimate.
    """
    unique_classes = embeddings_df['corpus'].nunique()

    # Classification needs at least two distinct corpora
    if unique_classes < 2:
        # st.sidebar.write("Classification impossible : il n'y a qu'un seul corpus.")
        return

    X = embeddings_df.drop(columns=['label', 'corpus'])  # Use full embeddings
    y_gold = embeddings_df['corpus']

    # Train logistic regression model
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X, y_gold)

    # Make predictions and compute accuracy
    y_pred = classifier.predict(X)
    balanced_acc = balanced_accuracy_score(y_gold, y_pred)
    st.sidebar.write(f"Classification (précision) : {balanced_acc:.2f}")


def plot_embeddings(pca_df):
    """Scatter-plot the PCA projection, in 2D or (optionally) 3D.

    The 3D option is offered only when a third component exists;
    the original code indexed 'PCA3' unconditionally and crashed
    when PCA produced fewer than 3 components.
    """
    show_3d = False
    if 'PCA3' in pca_df.columns:
        # Checkbox for selecting 3D plot (2D by default)
        show_3d = st.checkbox("Afficher en 3D", value=False)

    if show_3d:
        # Plot in 3D using PCA components
        fig = px.scatter_3d(
            pca_df,
            x='PCA1', y='PCA2', z='PCA3',
            color='corpus',
            hover_data=['label'],
            title='Visualisation des Embeddings (3D - PCA)'
        )
    else:
        # Plot in 2D using the first two PCA components
        fig = px.scatter(
            pca_df,
            x='PCA1', y='PCA2',
            color='corpus',
            hover_data=['label'],
            title='Visualisation des Embeddings (2D - PCA)'
        )

    # Update layout and display the plot
    fig.update_layout(width=1200, height=800, margin=dict(l=20, r=20, t=50, b=20))
    st.plotly_chart(fig, use_container_width=True)


def compute_corpus_similarity(embeddings_df, corpus_names):
    """Report the cosine similarity between the two corpus centroids.

    BUG FIX: the original computed the full pairwise similarity matrix
    and read [0][0], i.e. it compared only the FIRST document of each
    corpus, contradicting its own 'mean embeddings' comment. We now
    compare the mean (centroid) embedding of each corpus.
    """
    unique_classes = embeddings_df['corpus'].nunique()

    # Similarity between corpora needs two distinct corpora
    if unique_classes < 2:
        # st.sidebar.write("Calcul de similarité impossible : il n'y a qu'un seul corpus.")
        return

    corpus_embeddings = embeddings_df.drop(columns=['label', 'corpus'])

    # Compute mean embeddings for each corpus (keepdims so each stays 2-D
    # as cosine_similarity expects)
    corpus1_mean = corpus_embeddings[embeddings_df['corpus'] == corpus_names[0]].values.mean(axis=0, keepdims=True)
    corpus2_mean = corpus_embeddings[embeddings_df['corpus'] == corpus_names[1]].values.mean(axis=0, keepdims=True)
    similarity = cosine_similarity(corpus1_mean, corpus2_mean)[0][0]

    # Display cosine similarity
    st.sidebar.write(f"Similarité Cosine entre les deux corpus: {similarity:.2f}")


# Main logic of the app
if uploaded_files and len(uploaded_files) <= 2:
    # Get the corpus names without the .zip extension
    corpus_names = [os.path.splitext(uploaded_file.name)[0] for uploaded_file in uploaded_files]

    # Hash uploaded files and reset cached embeddings when they change
    file_hash = compute_file_hash(uploaded_files)
    if 'uploaded_file_hash' not in st.session_state or st.session_state.uploaded_file_hash != file_hash:
        st.session_state.uploaded_file_hash = file_hash
        st.session_state.embeddings_df = None

    # Load model once per session
    if 'model' not in st.session_state:
        st.session_state.model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

    # Process files and generate embeddings if they aren't already cached
    if st.session_state.embeddings_df is None:
        st.session_state.embeddings_df = process_files_and_generate_embeddings(
            uploaded_files, st.session_state.model, corpus_names
        )

    embeddings_df = st.session_state.embeddings_df

    # Get the PCA components
    pca_df = perform_pca(embeddings_df)

    # Perform classification and report accuracy
    classify_and_report_accuracy(embeddings_df)

    # Compute and display cosine similarity between corpora
    compute_corpus_similarity(embeddings_df, corpus_names)

    # Plot embeddings
    plot_embeddings(pca_df)
else:
    st.warning("Veuillez téléverser 2 corpus sous forme de dossier compressé (.zip) de fichiers texte (.txt).")