gloignon committed
Commit d6047d9 · verified · 1 Parent(s): b8c44cc

Update app.py

Files changed (1)
  1. app.py +186 -154
app.py CHANGED
@@ -1,156 +1,188 @@
- import gradio as gr
  import zipfile
  import os
- import tempfile
- import pandas as pd
- import spacy
- import subprocess
-
- # Ensure the spaCy French model is downloaded
- try:
-     nlp = spacy.load("fr_core_news_sm")
- except OSError:
-     print("Downloading spaCy 'fr_core_news_sm' model...")
-     subprocess.run(["python", "-m", "spacy", "download", "fr_core_news_sm"])
-     nlp = spacy.load("fr_core_news_sm")
-
- # Function to lemmatize text using spaCy
- def lemmatize_text(text):
-     doc = nlp(text)
-     return " ".join([token.lemma_ for token in doc])
-
- # Global variables to store the corpus
- raw_corpus = {}  # To store raw texts
- lemmatized_corpus = {}  # To store lemmatized texts
- initial_df = pd.DataFrame()
-
- # Function to process the zip file, lemmatize text, get document names, and calculate word counts
- def process_zip_initial(zip_file):
-     global raw_corpus, lemmatized_corpus, initial_df  # To store the raw texts, lemmatized texts, and DataFrame
-     raw_corpus = {}
-     lemmatized_corpus = {}  # Reset the corpus on new upload
-
-     # Create a temporary directory to extract files
-     with tempfile.TemporaryDirectory() as temp_dir:
-         # Extract the zip file
-         with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
-             zip_ref.extractall(temp_dir)
-
-         # Recursively get the list of all .txt files in all directories and lemmatize the text
-         txt_files = []
-         word_counts = []
-         for root, dirs, files in os.walk(temp_dir):
-             for file in files:
-                 if file.endswith('.txt'):
-                     file_path = os.path.join(root, file)
-                     txt_files.append(os.path.basename(file_path))  # Only the file name
-
-                     # Read the text
-                     with open(file_path, 'r', encoding='utf-8') as f:
-                         text = f.read()
-                     word_count = len(text.split())  # Split text by spaces to count words
-                     word_counts.append(word_count)
-
-                     # Store raw text in raw_corpus
-                     raw_corpus[os.path.basename(file_path)] = text.lower()
-
-                     # Lemmatize the text and store in lemmatized_corpus
-                     lemmatized_text = lemmatize_text(text.lower())
-                     lemmatized_corpus[os.path.basename(file_path)] = lemmatized_text
-
-     # Create a DataFrame with document names and word counts
-     initial_df = pd.DataFrame({"Nom du document": txt_files, "N. mots": word_counts})
-
-     return initial_df
-
- # Function to search for keywords in the selected corpus (raw or lemmatized)
- def process_zip_and_search(keywords_text, search_mode):
-     global raw_corpus, lemmatized_corpus, initial_df  # Use the texts stored at corpus upload and the initial DataFrame
-
-     # Read the keywords (no lemmatization of keywords)
-     keywords = [keyword.strip().lower() for keyword in keywords_text.strip().split("\n") if keyword.strip()]
-
-     if not keywords:
-         # If no keywords are provided, return the initial DataFrame (without the keyword columns)
-         return initial_df
-
-     # Select the appropriate corpus based on the search mode
-     corpus = lemmatized_corpus if search_mode == "Lemmes" else raw_corpus
-
-     # Prepare a dictionary to store the results (document names mapped to empty keyword entries)
-     results = {doc_name: {keyword: "" for keyword in keywords} for doc_name in corpus.keys()}
-
-     # Search for keyword frequencies in each text file
-     for doc_name, text in corpus.items():
-         for keyword in keywords:
-             keyword_count = text.count(keyword)  # Count occurrences of each keyword
-             if keyword_count > 0:
-                 results[doc_name][keyword] = keyword_count
-
-     # Convert the results dictionary to a DataFrame
-     df_keywords = pd.DataFrame(results).T  # Transpose to have files as rows and keywords as columns
-
-     # Reset the index to make the document names a column
-     df_keywords.reset_index(inplace=True)
-
-     # Rename the first column to 'Nom du document'
-     df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)
-
-     # Replace 0 frequencies with empty strings
-     df_keywords.replace(0, "", inplace=True)
-
-     # Merge the initial DataFrame with the keyword search results
-     final_df = pd.merge(initial_df, df_keywords, on="Nom du document", how="left")
-
-     return final_df
-
-
- # Function to export the DataFrame to Excel
- def export_to_excel(df):
-     # Create a temporary file for storing the Excel output
-     with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
-         excel_path = tmp.name
-     # Save the DataFrame to Excel
-     df.to_excel(excel_path, index=False)
-     return excel_path
-
- # Create the Gradio interface with one results table and export functionality
- with gr.Blocks() as demo:
-     gr.Markdown("# Recherche simple par mots-clés avec lemmatisation")  # Title of the app
-
-     with gr.Row():
-         # File upload and initial table with document names
-         zip_file_input = gr.File(label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)")
-
-     with gr.Row():
-         # Textbox for entering keywords
-         keywords_input = gr.Textbox(label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)", placeholder="mots-clés...", lines=10)
-
-     with gr.Row():
-         # Radio button to select between raw-token and lemmatized search
-         search_mode = gr.Radio(label="Choisissez le type de recherche", choices=["Mots", "Lemmes"], value="Lemmes")
-
-     with gr.Row():
-         # Button to trigger the keyword search
-         search_button = gr.Button("Recherche")
-
-     # Output the final results table after the search button
-     with gr.Row():
-         result_table = gr.DataFrame(label="Résultats", col_count=(1, "dynamic"), interactive=False)  # Disable renaming/editing
-
-     # Button to trigger the Excel export
-     with gr.Row():
-         export_button = gr.Button("Exporter vers Excel (.xlsx)")
-         download_link = gr.File(label="Télécharger le fichier")
-
-     # Action to display document names and word counts upon ZIP upload
-     zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)
-
-     # Action to update the table with keyword results based on the selected search mode
-     search_button.click(fn=process_zip_and_search, inputs=[keywords_input, search_mode], outputs=result_table)
-
-     # Action to export the results to Excel
-     export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)
-
- # Launch the app
- demo.launch()
+ # Last updated: 2024-10-11
+
+ import streamlit as st
+ from sentence_transformers import SentenceTransformer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import balanced_accuracy_score
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.decomposition import PCA
+
+ import pandas as pd
+ import plotly.express as px  # Plotly Express for the interactive scatter plots
  import zipfile
  import os
+ import hashlib
+ import numpy as np
+
+ # Set page configuration
+ st.set_page_config(page_title="Analyse en composantes principales interactive des plongements sémantiques (2 corpus)", layout="wide")
+
+ # Title of the app
+ st.title("Exploration de l'espace sémantique (2 corpus)")
+
+ # Sidebar for uploading files
+ st.sidebar.header("Téléversez vos corpus")
+ uploaded_files = st.sidebar.file_uploader("Téléversez (upload) jusqu'à deux dossiers compressés en format zip", type="zip", accept_multiple_files=True)
+
+
+ # Function to extract texts and labels from the zipped folders
+ def load_texts_from_zip(zip_file, corpus_prefix):
+     texts, labels = [], []
+     with zipfile.ZipFile(zip_file, 'r') as zip_ref:
+         for file_info in zip_ref.infolist():
+             if file_info.filename.endswith('.txt'):
+                 with zip_ref.open(file_info.filename) as file:
+                     text = file.read().decode('utf-8')
+                     texts.append(text)
+                     labels.append(f'{corpus_prefix}_{os.path.basename(file_info.filename)}')
+     return texts, labels
+
+
+ # Function to compute a hash of the uploaded files for comparison
+ def compute_file_hash(files):
+     file_hash = hashlib.md5()
+     for file in files:
+         file_hash.update(file.read())
+         file.seek(0)
+     return file_hash.hexdigest()
+
+
+ # Function to determine the corpus from the label prefix
+ def determine_corpus(label, corpus_names):
+     if label.startswith(f'{corpus_names[0]}_'):
+         return corpus_names[0]
+     elif label.startswith(f'{corpus_names[1]}_'):
+         return corpus_names[1]
+
+
+ # Function to process the uploaded files and generate embeddings
+ def process_files_and_generate_embeddings(uploaded_files, model, corpus_names):
+     texts_all, labels_all = [], []
+     for i, zip_file in enumerate(uploaded_files):
+         texts, labels = load_texts_from_zip(zip_file, corpus_names[i])
+         texts_all.extend(texts)
+         labels_all.extend(labels)
+
+     # Generate embeddings
+     embeddings = model.encode(texts_all)
+
+     # Create a DataFrame with embeddings, labels, and corpus information
+     embeddings_df = pd.DataFrame(embeddings)
+     embeddings_df['label'] = labels_all
+     embeddings_df['corpus'] = embeddings_df['label'].apply(determine_corpus, corpus_names=corpus_names)
+
+     return embeddings_df
+
+
+ # Function to perform PCA on the embeddings
+ def perform_pca(embeddings_df, n_components=3):
+     pca = PCA(n_components=n_components)
+     pca_components = pca.fit_transform(embeddings_df.drop(columns=['label', 'corpus']))
+     pca_df = pd.DataFrame(pca_components, columns=[f'PCA{i+1}' for i in range(n_components)])
+     return pd.concat([pca_df, embeddings_df[['label', 'corpus']]], axis=1)
+
+
+ # Function to fit a logistic regression on the embeddings and report its accuracy
+ def classify_and_report_accuracy(embeddings_df):
+     unique_classes = embeddings_df['corpus'].nunique()
+
+     # Check that there are at least two unique classes for classification
+     if unique_classes < 2:
+         # st.sidebar.write("Classification impossible : il n'y a qu'un seul corpus.")
+         return
+
+     # Proceed with classification if there are at least two classes
+     X = embeddings_df.drop(columns=['label', 'corpus'])  # Use the full embeddings
+     y_gold = embeddings_df['corpus']
+
+     # Train the logistic regression model
+     classifier = LogisticRegression(max_iter=1000)
+     classifier.fit(X, y_gold)
+
+     # Make predictions and compute the balanced accuracy (on the training data)
+     y_pred = classifier.predict(X)
+     balanced_acc = balanced_accuracy_score(y_gold, y_pred)
+     st.sidebar.write(f"Classification (précision) : {balanced_acc:.2f}")
+
+
+ # Function to plot the embeddings using Plotly (with a 2D/3D switch)
+ def plot_embeddings(pca_df):
+     # Checkbox for selecting a 3D plot (2D by default)
+     show_3d = st.checkbox("Afficher en 3D", value=False)
+
+     if show_3d:
+         # Plot in 3D using the first three PCA components
+         fig = px.scatter_3d(
+             pca_df, x='PCA1', y='PCA2', z='PCA3',
+             color='corpus', hover_data=['label'],
+             title='Visualisation des Embeddings (3D - PCA)'
+         )
+     else:
+         # Plot in 2D using the first two PCA components
+         fig = px.scatter(
+             pca_df, x='PCA1', y='PCA2', color='corpus', hover_data=['label'],
+             title='Visualisation des Embeddings (2D - PCA)'
+         )
+
+     # Update the layout and display the plot
+     fig.update_layout(width=1200, height=800, margin=dict(l=20, r=20, t=50, b=20))
+     st.plotly_chart(fig, use_container_width=True)
+
+
+ # Function to compute the cosine similarity between the two corpora
+ def compute_corpus_similarity(embeddings_df, corpus_names):
+     unique_classes = embeddings_df['corpus'].nunique()
+
+     # Check that there are at least two unique classes for the similarity computation
+     if unique_classes < 2:
+         # st.sidebar.write("Calcul de similarité impossible : il n'y a qu'un seul corpus.")
+         return
+
+     # Proceed with the cosine similarity calculation
+     corpus_embeddings = embeddings_df.drop(columns=['label', 'corpus'])
+
+     # Compute the mean embedding of each corpus
+     corpus1_mean = corpus_embeddings[embeddings_df['corpus'] == corpus_names[0]].values.mean(axis=0, keepdims=True)
+     corpus2_mean = corpus_embeddings[embeddings_df['corpus'] == corpus_names[1]].values.mean(axis=0, keepdims=True)
+
+     # Cosine similarity between the two mean embedding vectors
+     similarity = cosine_similarity(corpus1_mean, corpus2_mean)[0][0]
+
+     # Display the cosine similarity
+     st.sidebar.write(f"Similarité Cosine entre les deux corpus: {similarity:.2f}")
+
+
+ # Main logic of the app
+ if uploaded_files and len(uploaded_files) <= 2:
+     # Get the corpus names without the .zip extension
+     corpus_names = [os.path.splitext(uploaded_file.name)[0] for uploaded_file in uploaded_files]
+
+     # Hash the uploaded files and reset the cached state if they changed
+     file_hash = compute_file_hash(uploaded_files)
+     if 'uploaded_file_hash' not in st.session_state or st.session_state.uploaded_file_hash != file_hash:
+         st.session_state.uploaded_file_hash = file_hash
+         st.session_state.embeddings_df = None
+
+     # Load the sentence embedding model once per session
+     if 'model' not in st.session_state:
+         st.session_state.model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
+
+     # Process the files and generate the embeddings if they are not already cached
+     if st.session_state.embeddings_df is None:
+         st.session_state.embeddings_df = process_files_and_generate_embeddings(uploaded_files, st.session_state.model, corpus_names)
+
+     embeddings_df = st.session_state.embeddings_df
+
+     # Get the PCA components
+     pca_df = perform_pca(embeddings_df)
+
+     # Perform the classification and report its accuracy
+     classify_and_report_accuracy(embeddings_df)
+
+     # Compute and display the cosine similarity between the corpora
+     compute_corpus_similarity(embeddings_df, corpus_names)
+
+     # Plot the embeddings
+     plot_embeddings(pca_df)
+
+ else:
+     st.warning("Veuillez téléverser 2 corpus sous forme de dossier compressé (.zip) de fichiers texte (.txt).")
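
For quick local testing of this revision, the sketch below (illustrative only, not part of the commit; file names and sample sentences are made up) builds two small corpora in the format the uploader expects: one .zip archive of UTF-8 .txt files per corpus, with the corpus name taken from the archive's file name.

# Hypothetical helper: build two tiny demo corpora as .zip archives of .txt files,
# the input format expected by the sidebar uploader above.
import zipfile

demo_corpora = {
    "corpus_A.zip": {
        "doc1.txt": "Le chat dort sur le tapis.",
        "doc2.txt": "Les chats aiment dormir au soleil.",
    },
    "corpus_B.zip": {
        "doc1.txt": "La bourse a clôturé en forte hausse.",
        "doc2.txt": "Les marchés financiers restent volatils.",
    },
}

for zip_name, docs in demo_corpora.items():
    with zipfile.ZipFile(zip_name, "w") as zf:
        for file_name, text in docs.items():
            zf.writestr(file_name, text)  # str data is written as a UTF-8 .txt entry

Uploading the two archives in the sidebar then yields the labels corpus_A and corpus_B (derived from the .zip file names), the PCA scatter plot, the logistic-regression balanced accuracy, and the cosine similarity between the two corpus mean embeddings.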