gloignon committed
Commit d6047d9 · verified · 1 Parent(s): b8c44cc

Update app.py

Files changed (1)
  1. app.py +186 -154
app.py CHANGED
@@ -1,156 +1,188 @@
- import gradio as gr
  import zipfile
  import os
- import tempfile
- import pandas as pd
- import spacy
- import subprocess
-
- # Ensure the spaCy French model is downloaded
- try:
-     nlp = spacy.load("fr_core_news_sm")
- except OSError:
-     print("Downloading spaCy 'fr_core_news_sm' model...")
-     subprocess.run(["python", "-m", "spacy", "download", "fr_core_news_sm"])
-     nlp = spacy.load("fr_core_news_sm")
-
- # Function to lemmatize text using spaCy
- def lemmatize_text(text):
-     doc = nlp(text)
-     return " ".join([token.lemma_ for token in doc])
-
- # Global variables to store the corpus
- raw_corpus = {}  # To store raw texts
- lemmatized_corpus = {}  # To store lemmatized texts
- initial_df = pd.DataFrame()
-
- # Function to process the zip file, lemmatize text, get document names, and calculate word counts
- def process_zip_initial(zip_file):
-     global raw_corpus, lemmatized_corpus, initial_df  # To store the raw texts, lemmatized texts, and DataFrame
-     raw_corpus = {}
-     lemmatized_corpus = {}  # Reset the corpus on new upload
-
-     # Create a temporary directory to extract files
-     with tempfile.TemporaryDirectory() as temp_dir:
-         # Extract the zip file
-         with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
-             zip_ref.extractall(temp_dir)
-
-         # Recursively get the list of all .txt files in all directories and lemmatize the text
-         txt_files = []
-         word_counts = []
-         for root, dirs, files in os.walk(temp_dir):
-             for file in files:
-                 if file.endswith('.txt'):
-                     file_path = os.path.join(root, file)
-                     txt_files.append(os.path.basename(file_path))  # Only the file name
-
-                     # Read the text
-                     with open(file_path, 'r', encoding='utf-8') as f:
-                         text = f.read()
-                     word_count = len(text.split())  # Split text by spaces to count words
-                     word_counts.append(word_count)
-
-                     # Store raw text in raw_corpus
-                     raw_corpus[os.path.basename(file_path)] = text.lower()
-
-                     # Lemmatize the text and store in lemmatized_corpus
-                     lemmatized_text = lemmatize_text(text.lower())
-                     lemmatized_corpus[os.path.basename(file_path)] = lemmatized_text
-
-     # Create a DataFrame with document names and word counts
-     initial_df = pd.DataFrame({"Nom du document": txt_files, "N. mots": word_counts})
-
-     return initial_df
-
- # Function to search for keywords in the selected corpus (raw or lemmatized)
- def process_zip_and_search(keywords_text, search_mode):
-     global raw_corpus, lemmatized_corpus, initial_df  # Use the texts stored at corpus upload and the initial DataFrame
-
-     # Read the keywords (no lemmatization of keywords)
-     keywords = [keyword.strip().lower() for keyword in keywords_text.strip().split("\n") if keyword.strip()]
-
-     if not keywords:
-         # If no keywords are provided, return the initial DataFrame (without the keyword columns)
-         return initial_df
-
-     # Select the appropriate corpus based on the search mode
-     corpus = lemmatized_corpus if search_mode == "Lemmes" else raw_corpus
-
-     # Prepare a dictionary to store the results (document names mapped to empty keyword entries)
-     results = {doc_name: {keyword: "" for keyword in keywords} for doc_name in corpus.keys()}
-
-     # Search for keyword frequencies in each text file
-     for doc_name, text in corpus.items():
-         for keyword in keywords:
-             keyword_count = text.count(keyword)  # Count occurrences of each keyword
-             if keyword_count > 0:
-                 results[doc_name][keyword] = keyword_count
-
-     # Convert the results dictionary to a DataFrame
-     df_keywords = pd.DataFrame(results).T  # Transpose to have files as rows and keywords as columns
-
-     # Reset the index to make the document names a column
-     df_keywords.reset_index(inplace=True)
-
-     # Rename the first column to 'Nom du document'
-     df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)
-
-     # Replace 0 frequencies with empty strings
-     df_keywords.replace(0, "", inplace=True)
-
-     # Merge the initial DataFrame with the keyword search results
-     final_df = pd.merge(initial_df, df_keywords, on="Nom du document", how="left")
-
-     return final_df
-
-
- # Function to export the DataFrame to Excel
- def export_to_excel(df):
-     # Create a temporary file for storing the Excel output
-     with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
-         excel_path = tmp.name
-     # Save the DataFrame to Excel
-     df.to_excel(excel_path, index=False)
-     return excel_path
-
- # Create the Gradio interface with one results table and export functionality
- with gr.Blocks() as demo:
-     gr.Markdown("# Recherche simple par mots-clés avec lemmatisation")  # Title of the app
-
-     with gr.Row():
-         # File upload and initial table with document names
-         zip_file_input = gr.File(label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)")
-
-     with gr.Row():
-         # Textbox for entering keywords
-         keywords_input = gr.Textbox(label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)", placeholder="mots-clés...", lines=10)
-
-     with gr.Row():
-         # Radio button to select between raw-token and lemmatized search
-         search_mode = gr.Radio(label="Choisissez le type de recherche", choices=["Mots", "Lemmes"], value="Lemmes")
-
-     with gr.Row():
-         # Button to trigger the keyword search
-         search_button = gr.Button("Recherche")
-
-     # Output the final results table after the search button
-     with gr.Row():
-         result_table = gr.DataFrame(label="Résultats", col_count=(1, "dynamic"), interactive=False)  # Disable renaming/editing
-
-     # Button to trigger the Excel export
-     with gr.Row():
-         export_button = gr.Button("Exporter vers Excel (.xlsx)")
-         download_link = gr.File(label="Télécharger le fichier")
-
-     # Action to display document names and word counts upon ZIP upload
-     zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)
-
-     # Action to update the table with keyword results based on the selected search mode
-     search_button.click(fn=process_zip_and_search, inputs=[keywords_input, search_mode], outputs=result_table)
-
-     # Action to export the results to Excel
-     export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)
-
- # Launch the app
- demo.launch()
+ # Last updated: 2024-10-11
+
+ import streamlit as st
+ from sentence_transformers import SentenceTransformer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import balanced_accuracy_score
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.decomposition import PCA
+
+ import pandas as pd
+ import plotly.express as px  # Plotly Express for the interactive scatter plots
  import zipfile
  import os
+ import hashlib
+ import numpy as np
+
+ # Set page configuration
+ st.set_page_config(page_title="Analyse en composantes principales interactive des plongements sémantiques (2 corpus)", layout="wide")
+
+ # Title of the app
+ st.title("Exploration de l'espace sémantique (2 corpus)")
+
+ # Sidebar for uploading files
+ st.sidebar.header("Téléversez vos corpus")
+ uploaded_files = st.sidebar.file_uploader("Téléversez (upload) jusqu'à deux dossiers compressés en format zip", type="zip", accept_multiple_files=True)
+
+
+ # Function to extract texts and labels from the zipped folders
+ def load_texts_from_zip(zip_file, corpus_prefix):
+     texts, labels = [], []
+     with zipfile.ZipFile(zip_file, 'r') as zip_ref:
+         for file_info in zip_ref.infolist():
+             if file_info.filename.endswith('.txt'):
+                 with zip_ref.open(file_info.filename) as file:
+                     text = file.read().decode('utf-8')
+                     texts.append(text)
+                     labels.append(f'{corpus_prefix}_{os.path.basename(file_info.filename)}')
+     return texts, labels
+
+
+ # Function to compute a hash of the uploaded files for comparison
+ def compute_file_hash(files):
+     file_hash = hashlib.md5()
+     for file in files:
+         file_hash.update(file.read())
+         file.seek(0)
+     return file_hash.hexdigest()
+
+
+ # Function to determine the corpus from the label prefix
+ def determine_corpus(label, corpus_names):
+     if label.startswith(f'{corpus_names[0]}_'):
+         return corpus_names[0]
+     elif label.startswith(f'{corpus_names[1]}_'):
+         return corpus_names[1]
+
+
+ # Function to process the uploaded files and generate embeddings
+ def process_files_and_generate_embeddings(uploaded_files, model, corpus_names):
+     texts_all, labels_all = [], []
+     for i, zip_file in enumerate(uploaded_files):
+         texts, labels = load_texts_from_zip(zip_file, corpus_names[i])
+         texts_all.extend(texts)
+         labels_all.extend(labels)
+
+     # Generate embeddings
+     embeddings = model.encode(texts_all)
+
+     # Create a DataFrame with embeddings, labels, and corpus information
+     embeddings_df = pd.DataFrame(embeddings)
+     embeddings_df['label'] = labels_all
+     embeddings_df['corpus'] = embeddings_df['label'].apply(determine_corpus, corpus_names=corpus_names)
+
+     return embeddings_df
+
+
+ # Function to perform PCA on the embeddings
+ def perform_pca(embeddings_df, n_components=3):
+     pca = PCA(n_components=n_components)
+     pca_components = pca.fit_transform(embeddings_df.drop(columns=['label', 'corpus']))
+     pca_df = pd.DataFrame(pca_components, columns=[f'PCA{i+1}' for i in range(n_components)])
+     return pd.concat([pca_df, embeddings_df[['label', 'corpus']]], axis=1)
+
+
+ # Function to fit a logistic regression on the embeddings and report its accuracy
+ def classify_and_report_accuracy(embeddings_df):
+     unique_classes = embeddings_df['corpus'].nunique()
+
+     # Check that there are at least two unique classes for classification
+     if unique_classes < 2:
+         # st.sidebar.write("Classification impossible : il n'y a qu'un seul corpus.")
+         return
+
+     # Proceed with classification if there are at least two classes
+     X = embeddings_df.drop(columns=['label', 'corpus'])  # Use the full embeddings
+     y_gold = embeddings_df['corpus']
+
+     # Train the logistic regression model
+     classifier = LogisticRegression(max_iter=1000)
+     classifier.fit(X, y_gold)
+
+     # Make predictions and compute the balanced accuracy (on the training data)
+     y_pred = classifier.predict(X)
+     balanced_acc = balanced_accuracy_score(y_gold, y_pred)
+     st.sidebar.write(f"Classification (précision) : {balanced_acc:.2f}")
+
+
+ # Function to plot the embeddings using Plotly (with a 2D/3D switch)
+ def plot_embeddings(pca_df):
+     # Checkbox for selecting a 3D plot (2D by default)
+     show_3d = st.checkbox("Afficher en 3D", value=False)
+
+     if show_3d:
+         # Plot in 3D using the first three PCA components
+         fig = px.scatter_3d(
+             pca_df, x='PCA1', y='PCA2', z='PCA3',
+             color='corpus', hover_data=['label'],
+             title='Visualisation des Embeddings (3D - PCA)'
+         )
+     else:
+         # Plot in 2D using the first two PCA components
+         fig = px.scatter(
+             pca_df, x='PCA1', y='PCA2', color='corpus', hover_data=['label'],
+             title='Visualisation des Embeddings (2D - PCA)'
+         )
+
+     # Update the layout and display the plot
+     fig.update_layout(width=1200, height=800, margin=dict(l=20, r=20, t=50, b=20))
+     st.plotly_chart(fig, use_container_width=True)
+
+
+ # Function to compute the cosine similarity between the two corpora
+ def compute_corpus_similarity(embeddings_df, corpus_names):
+     unique_classes = embeddings_df['corpus'].nunique()
+
+     # Check that there are at least two unique classes for the similarity computation
+     if unique_classes < 2:
+         # st.sidebar.write("Calcul de similarité impossible : il n'y a qu'un seul corpus.")
+         return
+
+     # Proceed with the cosine similarity calculation
+     corpus_embeddings = embeddings_df.drop(columns=['label', 'corpus'])
+
+     # Compute the mean embedding of each corpus
+     corpus1_mean = corpus_embeddings[embeddings_df['corpus'] == corpus_names[0]].values.mean(axis=0, keepdims=True)
+     corpus2_mean = corpus_embeddings[embeddings_df['corpus'] == corpus_names[1]].values.mean(axis=0, keepdims=True)
+
+     # Cosine similarity between the two mean embedding vectors
+     similarity = cosine_similarity(corpus1_mean, corpus2_mean)[0][0]
+
+     # Display the cosine similarity
+     st.sidebar.write(f"Similarité Cosine entre les deux corpus: {similarity:.2f}")
+
+
+ # Main logic of the app
+ if uploaded_files and len(uploaded_files) <= 2:
+     # Get the corpus names without the .zip extension
+     corpus_names = [os.path.splitext(uploaded_file.name)[0] for uploaded_file in uploaded_files]
+
+     # Hash the uploaded files and reset the cached state if they changed
+     file_hash = compute_file_hash(uploaded_files)
+     if 'uploaded_file_hash' not in st.session_state or st.session_state.uploaded_file_hash != file_hash:
+         st.session_state.uploaded_file_hash = file_hash
+         st.session_state.embeddings_df = None
+
+     # Load the sentence embedding model once per session
+     if 'model' not in st.session_state:
+         st.session_state.model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
+
+     # Process the files and generate the embeddings if they are not already cached
+     if st.session_state.embeddings_df is None:
+         st.session_state.embeddings_df = process_files_and_generate_embeddings(uploaded_files, st.session_state.model, corpus_names)
+
+     embeddings_df = st.session_state.embeddings_df
+
+     # Get the PCA components
+     pca_df = perform_pca(embeddings_df)
+
+     # Perform the classification and report its accuracy
+     classify_and_report_accuracy(embeddings_df)
+
+     # Compute and display the cosine similarity between the corpora
+     compute_corpus_similarity(embeddings_df, corpus_names)
+
+     # Plot the embeddings
+     plot_embeddings(pca_df)
+
+ else:
+     st.warning("Veuillez téléverser 2 corpus sous forme de dossier compressé (.zip) de fichiers texte (.txt).")
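
For quick local testing of this revision, the sketch below (illustrative only, not part of the commit; file names and sample sentences are made up) builds two small corpora in the format the uploader expects: one .zip archive of UTF-8 .txt files per corpus, with the corpus name taken from the archive's file name.

# Hypothetical helper: build two tiny demo corpora as .zip archives of .txt files,
# the input format expected by the sidebar uploader above.
import zipfile

demo_corpora = {
    "corpus_A.zip": {
        "doc1.txt": "Le chat dort sur le tapis.",
        "doc2.txt": "Les chats aiment dormir au soleil.",
    },
    "corpus_B.zip": {
        "doc1.txt": "La bourse a clôturé en forte hausse.",
        "doc2.txt": "Les marchés financiers restent volatils.",
    },
}

for zip_name, docs in demo_corpora.items():
    with zipfile.ZipFile(zip_name, "w") as zf:
        for file_name, text in docs.items():
            zf.writestr(file_name, text)  # str data is written as a UTF-8 .txt entry

Uploading the two archives in the sidebar then yields the labels corpus_A and corpus_B (derived from the .zip file names), the PCA scatter plot, the logistic-regression balanced accuracy, and the cosine similarity between the two corpus mean embeddings.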