import gradio as gr
import zipfile
import os
import sys
import tempfile
import pandas as pd
import spacy
import subprocess

# Ensure the spaCy French model is available, downloading it on first run
try:
    nlp = spacy.load("fr_core_news_sm")
except OSError:
    print("Downloading spaCy 'fr_core_news_sm' model...")
    # Use the current interpreter so the model installs into the active environment
    subprocess.run([sys.executable, "-m", "spacy", "download", "fr_core_news_sm"], check=True)
    nlp = spacy.load("fr_core_news_sm")

# Function to lemmatize text using spaCy
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Global variables to store the corpus
raw_corpus = {}  # To store raw texts
lemmatized_corpus = {}  # To store lemmatized texts
initial_df = pd.DataFrame()

# Function to process the zip file, lemmatize text, get document names, and calculate word counts
def process_zip_initial(zip_file):
    global raw_corpus, lemmatized_corpus, initial_df  # Store the raw texts, lemmatized texts, and summary DataFrame
    raw_corpus = {}
    lemmatized_corpus = {}  # Reset both corpora on every new upload

    # Guard against the file input being cleared (.change fires with None)
    if zip_file is None:
        initial_df = pd.DataFrame()
        return initial_df

    # Create a temporary directory to extract files
    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the zip file
        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)
        
        # Recursively get list of all .txt files in all directories and lemmatize the text
        txt_files = []
        word_counts = []
        for root, dirs, files in os.walk(temp_dir):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    txt_files.append(os.path.basename(file_path))  # Only the file name
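                    # Note: only the base name is kept, so identically named
                    # files in different subfolders overwrite each other in
                    # the corpus dictionaries.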
                    
                    # Read the text
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                        word_count = len(text.split())  # Split on whitespace to count words
                        word_counts.append(word_count)

                        # Store raw text in raw_corpus
                        raw_corpus[os.path.basename(file_path)] = text.lower()

                        # Lemmatize the text and store in lemmatized_corpus
                        lemmatized_text = lemmatize_text(text.lower())
                        lemmatized_corpus[os.path.basename(file_path)] = lemmatized_text
        
        # Create a DataFrame with document names and word counts
        initial_df = pd.DataFrame({"Nom du document": txt_files, "N. mots": word_counts})
        
        return initial_df

# Function to search for keywords in the selected corpus (raw or lemmatized)
def process_zip_and_search(keywords_text, search_mode):
    global raw_corpus, lemmatized_corpus, initial_df  # Use the texts stored at corpus upload and initial DataFrame

    # Read the keywords (no lemmatization of keywords)
    keywords = [keyword.strip().lower() for keyword in keywords_text.strip().split("\n") if keyword.strip()]

    if not keywords:
        # If no keywords are provided, return the initial DataFrame (without the keyword columns)
        return initial_df

    # Select the appropriate corpus based on the search mode
    corpus = lemmatized_corpus if search_mode == "Lemmes" else raw_corpus

    # Prepare a dictionary to store the results (initialize with Document Name and empty results)
    results = {doc_name: {keyword: "" for keyword in keywords} for doc_name in corpus.keys()}

    # Search for keyword frequencies in each text file
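    # Note: str.count() does plain substring matching, so a keyword such as
    # "port" is also counted inside words like "portable".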
    for doc_name, text in corpus.items():
        for keyword in keywords:
            keyword_count = text.count(keyword)  # Count occurrences of each keyword
            if keyword_count > 0:
                results[doc_name][keyword] = keyword_count

    # Convert the results dictionary to a DataFrame
    df_keywords = pd.DataFrame(results).T  # Transpose to have files as rows and keywords as columns

    # Reset index to make the document names a column
    df_keywords.reset_index(inplace=True)

    # Rename the first column to 'Nom du document'
    df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)

    # Defensive: replace any 0 frequencies with empty strings (cells start
    # empty and are only set for counts > 0, so this is usually a no-op)
    df_keywords.replace(0, "", inplace=True)

    # Merge the initial DataFrame with the keyword search results
    final_df = pd.merge(initial_df, df_keywords, on="Nom du document", how="left")

    return final_df


# Function to export the DataFrame to Excel
def export_to_excel(df):
    # Create a temporary file to hold the Excel output
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name
    # Write after the handle is closed so this also works on Windows,
    # where an open file cannot be reopened for writing
    df.to_excel(excel_path, index=False)
    return excel_path

# Create Gradio interface with one results table and export functionality
with gr.Blocks() as demo:
    gr.Markdown("# Recherche simple par mots-clés avec lemmatisation")  # App title

    with gr.Row():
        # File upload and initial table with document names
        zip_file_input = gr.File(label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)")
    
    with gr.Row():
        # Textbox for entering keywords
        keywords_input = gr.Textbox(label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)", placeholder="mots-clés...", lines=10)
    
    with gr.Row():
        # Switch button to select between raw tokens and lemmatized search
        search_mode = gr.Radio(label="Choisissez le type de recherche", choices=["Mots", "Lemmes"], value="Lemmes")

    with gr.Row():
        # Button to trigger keyword search
        search_button = gr.Button("Recherche")
    
    # Output the final results table after the search button
    with gr.Row():
        result_table = gr.DataFrame(label="Résultats", col_count=(1, "dynamic"), interactive=False)  # Disable renaming/editing
    
    # Button to trigger the Excel export
    with gr.Row():
        export_button = gr.Button("Exporter vers Excel (.xlsx)")
        download_link = gr.File(label="Télécharger le fichier")

    # Action to populate the table with document names and word counts upon ZIP upload
    zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)
    
    # Action to update the table with keywords and results based on the selected search mode
    search_button.click(fn=process_zip_and_search, inputs=[keywords_input, search_mode], outputs=result_table)
    
    # Action to export the results to Excel
    export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)

# Launch the app
demo.launch()