# Last updated: 2024-10-11
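# Streamlit app: upload up to two zipped corpora of .txt files, embed them with a
# multilingual SentenceTransformer model, then compare the corpora via an interactive
# PCA plot, a logistic-regression separability score, and centroid cosine similarity.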
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import pandas as pd
import plotly.express as px
import zipfile
import os
import hashlib
# Set page configuration
st.set_page_config(page_title="Analyse en composantes principales interactive des plongements sémantiques (2 corpus)", layout="wide")
# Title of the app
st.title("Exploration de l'espace sémantique (2 corpus)")
# Sidebar for uploading files
st.sidebar.header("Téléversez vos corpus")
uploaded_files = st.sidebar.file_uploader("Téléversez (upload) jusqu'à deux dossiers compressés en format zip", type="zip", accept_multiple_files=True)
# Function to extract texts and labels from zipped folders
def load_texts_from_zip(zip_file, corpus_prefix):
    texts, labels = [], []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            filename = os.path.basename(file_info.filename)
            # Keep only .txt entries, skipping hidden files such as the '__MACOSX/._*'
            # metadata that macOS adds to zip archives (these are not valid UTF-8 text)
            if file_info.filename.endswith('.txt') and not filename.startswith('.'):
                with zip_ref.open(file_info.filename) as file:
                    text = file.read().decode('utf-8')
                texts.append(text)
                labels.append(f'{corpus_prefix}_{filename}')
    return texts, labels
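# Labels take the form '<corpus_prefix>_<filename>'; determine_corpus() below relies
# on this prefix to recover which corpus each document came from.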
# Function to compute a hash of the uploaded files for comparison
def compute_file_hash(files):
    file_hash = hashlib.md5()
    for file in files:
        file_hash.update(file.read())
        file.seek(0)  # Rewind so the file can be read again when building embeddings
    return file_hash.hexdigest()
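# MD5 serves here only as a cheap change-detection fingerprint used to invalidate the
# cached embeddings, not for any security purpose.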
# Function to determine corpus from label prefix
def determine_corpus(label, corpus_names):
    # Iterate over the names rather than indexing corpus_names[1], which would raise
    # an IndexError when only a single corpus has been uploaded
    for name in corpus_names:
        if label.startswith(f'{name}_'):
            return name
    return None
# Function to process the uploaded files and generate embeddings
def process_files_and_generate_embeddings(uploaded_files, model, corpus_names):
    texts_all, labels_all = [], []
    for i, zip_file in enumerate(uploaded_files):
        texts, labels = load_texts_from_zip(zip_file, corpus_names[i])
        texts_all.extend(texts)
        labels_all.extend(labels)
    # Generate embeddings
    embeddings = model.encode(texts_all)
    # Create a DataFrame with embeddings, labels, and corpus information
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df['label'] = labels_all
    embeddings_df['corpus'] = embeddings_df['label'].apply(determine_corpus, corpus_names=corpus_names)
    return embeddings_df
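# Note: SentenceTransformer.encode() truncates each input to the model's max_seq_length
# (128 word pieces for distiluse-base-multilingual-cased-v2), so for long documents only
# the opening passage contributes to the embedding.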
# Function to perform PCA on embeddings
def perform_pca(embeddings_df, n_components=3):
    pca = PCA(n_components=n_components)
    pca_components = pca.fit_transform(embeddings_df.drop(columns=['label', 'corpus']))
    pca_df = pd.DataFrame(pca_components, columns=[f'PCA{i+1}' for i in range(n_components)])
    return pd.concat([pca_df, embeddings_df[['label', 'corpus']]], axis=1)
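# The projection keeps only the first three principal components; the share of variance
# they retain is available via pca.explained_variance_ratio_ should one want to report
# how faithful the 2D/3D view is.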
# Function to perform logistic regression on embeddings and compute accuracy
def classify_and_report_accuracy(embeddings_df):
    unique_classes = embeddings_df['corpus'].nunique()
    # Check that there are at least two unique classes for classification
    if unique_classes < 2:
        # st.sidebar.write("Classification impossible : il n'y a qu'un seul corpus.")
        return
    # Proceed with classification since there are at least two classes
    X = embeddings_df.drop(columns=['label', 'corpus'])  # Use full embeddings
    y_gold = embeddings_df['corpus']
    # Train logistic regression model
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X, y_gold)
    # Make predictions and compute balanced accuracy
    y_pred = classifier.predict(X)
    balanced_acc = balanced_accuracy_score(y_gold, y_pred)
    st.sidebar.write(f"Classification (exactitude équilibrée) : {balanced_acc:.2f}")
# Function to plot embeddings using Plotly (with 2D or 3D switch)
def plot_embeddings(pca_df):
    # Add a checkbox for selecting a 3D plot (2D by default)
    show_3d = st.checkbox("Afficher en 3D", value=False)
    if show_3d:
        # Plot in 3D using the first three PCA components
        fig = px.scatter_3d(
            pca_df, x='PCA1', y='PCA2', z='PCA3',
            color='corpus', hover_data=['label'],
            title='Visualisation des embeddings (3D - PCA)'
        )
    else:
        # Plot in 2D using the first two PCA components
        fig = px.scatter(
            pca_df, x='PCA1', y='PCA2', color='corpus', hover_data=['label'],
            title='Visualisation des embeddings (2D - PCA)'
        )
    # Update layout and display the plot
    fig.update_layout(width=1200, height=800, margin=dict(l=20, r=20, t=50, b=20))
    st.plotly_chart(fig, use_container_width=True)
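# With use_container_width=True, Streamlit stretches the chart to the page width, so
# the fixed width passed to update_layout() mainly acts as a fallback/aspect hint.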
# Function to compute cosine similarity between two corpora
def compute_corpus_similarity(embeddings_df, corpus_names):
    unique_classes = embeddings_df['corpus'].nunique()
    # Check that there are at least two unique classes for similarity computation
    if unique_classes < 2:
        # st.sidebar.write("Calcul de similarité impossible : il n'y a qu'un seul corpus.")
        return
    # Proceed with cosine similarity calculation
    corpus_embeddings = embeddings_df.drop(columns=['label', 'corpus'])
    # Compute the mean (centroid) embedding of each corpus, then the cosine
    # similarity between the two centroids
    corpus1_mean = corpus_embeddings[embeddings_df['corpus'] == corpus_names[0]].values.mean(axis=0, keepdims=True)
    corpus2_mean = corpus_embeddings[embeddings_df['corpus'] == corpus_names[1]].values.mean(axis=0, keepdims=True)
    similarity = cosine_similarity(corpus1_mean, corpus2_mean)[0][0]
    # Display cosine similarity
    st.sidebar.write(f"Similarité cosinus entre les deux corpus : {similarity:.2f}")
# Main logic of the app
if uploaded_files and len(uploaded_files) <= 2:
    # Get the corpus names without the .zip extension
    corpus_names = [os.path.splitext(uploaded_file.name)[0] for uploaded_file in uploaded_files]
    # Hash uploaded files and reset state if needed
    file_hash = compute_file_hash(uploaded_files)
    if 'uploaded_file_hash' not in st.session_state or st.session_state.uploaded_file_hash != file_hash:
        st.session_state.uploaded_file_hash = file_hash
        st.session_state.embeddings_df = None
    # Load model
    if 'model' not in st.session_state:
        st.session_state.model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
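    # distiluse-base-multilingual-cased-v2 supports 50+ languages and yields
    # 512-dimensional sentence embeddings; keeping it in session_state means the
    # model is downloaded and loaded only once per session.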
    # Process files and generate embeddings if they aren't already cached
    if st.session_state.embeddings_df is None:
        st.session_state.embeddings_df = process_files_and_generate_embeddings(uploaded_files, st.session_state.model, corpus_names)
    embeddings_df = st.session_state.embeddings_df
    # Get the PCA components
    pca_df = perform_pca(embeddings_df)
    # Perform classification and report accuracy
    classify_and_report_accuracy(embeddings_df)
    # Compute and display cosine similarity between corpora
    compute_corpus_similarity(embeddings_df, corpus_names)
    # Plot embeddings
    plot_embeddings(pca_df)
else:
    st.warning("Veuillez téléverser un ou deux corpus sous forme de dossiers compressés (.zip) de fichiers texte (.txt).")