File size: 7,275 Bytes
d6047d9
 
 
 
 
 
 
 
 
 
 
8dd897c
 
d6047d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# Last updated: 2024-10-11

import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

import pandas as pd
import plotly.express as px  # Import plotly express
import zipfile
import os
import hashlib
import numpy as np

# Streamlit page setup: wide layout so the scatter plot gets the full width.
st.set_page_config(page_title="Analyse en composantes principales interactive des plongements sémantiques (2 corpus)", layout="wide")

# Main page title.
st.title("Exploration de l'espace sémantique (2 corpus)")

# Sidebar upload widget: up to two .zip archives, one per corpus.
st.sidebar.header("Téléversez vos corpus")
uploaded_files = st.sidebar.file_uploader("Téléversez (upload) jusqu'à deux dossiers compressés en format zip", type="zip", accept_multiple_files=True)


# Function to extract texts and labels from zipped folders
def load_texts_from_zip(zip_file, corpus_prefix):
    """Read every .txt document inside *zip_file* and return (texts, labels).

    Each label is '<corpus_prefix>_<basename>' so a document can later be
    attributed to the corpus (archive) it came from.

    Fix: skip directory entries, macOS resource-fork entries
    (``__MACOSX/._foo.txt``) and other hidden files — those are binary and
    previously crashed the UTF-8 decode.
    """
    texts, labels = [], []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            name = file_info.filename
            base = os.path.basename(name)
            # Keep only real text documents.
            if file_info.is_dir() or not name.endswith('.txt'):
                continue
            if base.startswith('.') or name.startswith('__MACOSX'):
                continue
            with zip_ref.open(name) as file:
                texts.append(file.read().decode('utf-8'))
                labels.append(f'{corpus_prefix}_{base}')
    return texts, labels


# Function to compute a hash of the uploaded files for comparison
def compute_file_hash(files):
    """Return one MD5 hex digest covering the contents of all *files*.

    Each stream is rewound after reading so later consumers (e.g. zipfile)
    can still read it from the start.
    """
    digest = hashlib.md5()
    for uploaded in files:
        digest.update(uploaded.read())
        uploaded.seek(0)  # rewind for subsequent readers
    return digest.hexdigest()


# Function to determine corpus from label prefix
def determine_corpus(label, corpus_names):
    """Return the corpus name whose '<name>_' prefix matches *label*.

    Fixes: returns None explicitly when no prefix matches (previously an
    implicit fall-through), and no longer raises IndexError when fewer
    than two corpus names are supplied; also generalizes to any number
    of corpus names while preserving two-corpus behavior.
    """
    for name in corpus_names:
        if label.startswith(f'{name}_'):
            return name
    return None


# Function to process the uploaded files and generate embeddings
def process_files_and_generate_embeddings(uploaded_files, model, corpus_names):
    """Build a DataFrame of sentence embeddings for all uploaded corpora.

    Columns: one per embedding dimension, plus 'label' (prefixed file name)
    and 'corpus' (which upload the document came from).
    """
    all_texts, all_labels = [], []
    for prefix, archive in zip(corpus_names, uploaded_files):
        corpus_texts, corpus_labels = load_texts_from_zip(archive, prefix)
        all_texts += corpus_texts
        all_labels += corpus_labels

    # Encode every document in a single batch call.
    vectors = model.encode(all_texts)

    # Assemble embeddings with their label and corpus metadata.
    frame = pd.DataFrame(vectors)
    frame['label'] = all_labels
    frame['corpus'] = frame['label'].apply(determine_corpus, corpus_names=corpus_names)
    return frame

# Function to perform PCA on embeddings
def perform_pca(embeddings_df, n_components=3):
    """Project the embedding columns onto their first *n_components*
    principal axes; the 'label' and 'corpus' columns are carried over
    untouched alongside the new 'PCA1'..'PCAn' columns.
    """
    features = embeddings_df.drop(columns=['label', 'corpus'])
    reduced = PCA(n_components=n_components).fit_transform(features)
    column_names = [f'PCA{i + 1}' for i in range(n_components)]
    projected = pd.DataFrame(reduced, columns=column_names)
    return pd.concat([projected, embeddings_df[['label', 'corpus']]], axis=1)


# Function to perform logistic regression on embeddings and compute accuracy
def classify_and_report_accuracy(embeddings_df):
    """Fit a logistic regression separating the corpora and write its
    balanced accuracy to the sidebar.

    NOTE(review): the score is computed on the training data itself
    (in-sample), so it is an optimistic separability indicator, not a
    generalization estimate — consider cross-validation if that matters.
    """
    # Nothing to separate when only a single corpus was uploaded.
    if embeddings_df['corpus'].nunique() < 2:
        return

    features = embeddings_df.drop(columns=['label', 'corpus'])  # Use full embeddings
    targets = embeddings_df['corpus']

    # Fit the classifier on the whole dataset.
    model = LogisticRegression(max_iter=1000)
    model.fit(features, targets)

    # Score predictions on the same data and report.
    predictions = model.predict(features)
    score = balanced_accuracy_score(targets, predictions)
    st.sidebar.write(f"Classification (précision) : {score:.2f}")


# Function to plot embeddings using Plotly (with 2D or 3D switch)
def plot_embeddings(pca_df):
    """Render an interactive scatter of the PCA projection, colored by
    corpus with the document label on hover; a checkbox switches between
    2D (default) and 3D."""
    use_three_axes = st.checkbox("Afficher en 3D", value=False)

    # Shared styling for both plot variants.
    shared_kwargs = dict(color='corpus', hover_data=['label'])
    if use_three_axes:
        fig = px.scatter_3d(
            pca_df, x='PCA1', y='PCA2', z='PCA3',
            title='Visualisation des Embeddings (3D - PCA)',
            **shared_kwargs,
        )
    else:
        fig = px.scatter(
            pca_df, x='PCA1', y='PCA2',
            title='Visualisation des Embeddings (2D - PCA)',
            **shared_kwargs,
        )

    # Size the figure and render it.
    fig.update_layout(width=1200, height=800, margin=dict(l=20, r=20, t=50, b=20))
    st.plotly_chart(fig, use_container_width=True)



# Function to compute cosine similarity between two corpora
def compute_corpus_similarity(embeddings_df, corpus_names):
    """Write to the sidebar the cosine similarity between the two corpora,
    computed between their mean (centroid) embeddings.

    Bug fixed: the previous version called cosine_similarity on the full
    document matrices and took [0][0], which compared only the FIRST
    document of each corpus — despite the comment claiming mean embeddings.
    """
    # Need two corpora to compare.
    if embeddings_df['corpus'].nunique() < 2:
        return

    corpus_embeddings = embeddings_df.drop(columns=['label', 'corpus'])

    # Centroid embedding of each corpus.
    centroid1 = corpus_embeddings[embeddings_df['corpus'] == corpus_names[0]].values.mean(axis=0)
    centroid2 = corpus_embeddings[embeddings_df['corpus'] == corpus_names[1]].values.mean(axis=0)

    similarity = cosine_similarity([centroid1], [centroid2])[0][0]

    # Display cosine similarity
    st.sidebar.write(f"Similarité Cosine entre les deux corpus: {similarity:.2f}")


# Main logic of the app.
# Runs on every Streamlit rerun; the model and the embeddings are cached in
# st.session_state, and an MD5 hash of the uploads detects when new files
# require recomputing the embeddings.
if uploaded_files and len(uploaded_files) <= 2:
    # Corpus names = archive file names without the .zip extension.
    corpus_names = [os.path.splitext(f.name)[0] for f in uploaded_files]

    # Invalidate the embeddings cache whenever the uploaded content changes.
    file_hash = compute_file_hash(uploaded_files)
    if st.session_state.get('uploaded_file_hash') != file_hash:
        st.session_state.uploaded_file_hash = file_hash
        st.session_state.embeddings_df = None

    # Load the sentence-transformer model once per session.
    if 'model' not in st.session_state:
        st.session_state.model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

    # (Re)compute embeddings only when the cache was invalidated above.
    if st.session_state.embeddings_df is None:
        st.session_state.embeddings_df = process_files_and_generate_embeddings(
            uploaded_files, st.session_state.model, corpus_names
        )
    embeddings_df = st.session_state.embeddings_df

    # Reduce to principal components for plotting.
    pca_df = perform_pca(embeddings_df)

    # Sidebar diagnostics, then the main scatter plot.
    classify_and_report_accuracy(embeddings_df)
    compute_corpus_similarity(embeddings_df, corpus_names)
    plot_embeddings(pca_df)

else:
    st.warning("Veuillez téléverser 2 corpus sous forme de dossier compressé (.zip) de fichiers texte (.txt).")