gloignon committed on
Commit
4910a63
·
verified ·
1 Parent(s): 3dfd0bf

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +143 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import zipfile
3
+ import os
4
+ import tempfile
5
+ import pandas as pd
6
+
7
+ # Function to process the zip file, get document names, and calculate word counts (without keywords initially)
8
def process_zip_initial(zip_file):
    """Return a table of the ``.txt`` documents in a zip archive with their word counts.

    Args:
        zip_file: The uploaded archive. Either an object exposing the archive
            path as ``.name``, a plain path string, or ``None`` when no file
            is currently selected.

    Returns:
        A DataFrame with the columns "Nom du document" (file base name) and
        "N. mots" (number of whitespace-separated tokens), one row per
        ``.txt`` file found anywhere in the archive. Empty when ``zip_file``
        is ``None``.

    Raises:
        zipfile.BadZipFile: If the path does not point to a valid zip archive.
    """
    columns = ["Nom du document", "N. mots"]

    # Clearing the upload widget hands the callback None; answer with an
    # empty table instead of failing on ``None.name``.
    if zip_file is None:
        return pd.DataFrame(columns=columns)

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    zip_path = getattr(zip_file, "name", zip_file)

    rows = []
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        for root, dirs, files in os.walk(temp_dir):
            # Pruning in place keeps os.walk out of the macOS metadata folder;
            # sorting makes the row order reproducible.
            dirs[:] = sorted(d for d in dirs if d != "__MACOSX")
            for file_name in sorted(files):
                # "._name" entries are macOS resource forks, not documents.
                if not file_name.endswith(".txt") or file_name.startswith("._"):
                    continue
                file_path = os.path.join(root, file_name)
                # errors="replace" keeps one badly encoded file from aborting
                # the whole listing.
                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                    rows.append((file_name, len(f.read().split())))

    return pd.DataFrame(rows, columns=columns)
34
+
35
+ # Function to process the zip file and search for keywords (while preserving Word Count)
36
def process_zip_and_search(zip_file, keywords_text):
    """Count keyword occurrences in every ``.txt`` document of a zip archive.

    Matching is a case-insensitive substring count, so multi-word expressions
    work and a keyword also matches inside longer words.

    Args:
        zip_file: The uploaded archive. Either an object exposing the archive
            path as ``.name``, a plain path string, or ``None`` when no file
            is currently selected.
        keywords_text: Keywords or expressions, one per line. Blank lines are
            ignored, entries are lowercased, and duplicates are kept once.
            ``None`` is treated as no keywords.

    Returns:
        A DataFrame with the columns "Nom du document", "N. mots" and one
        column per keyword, one row per ``.txt`` file. A keyword cell holds
        the occurrence count, or an empty string when the keyword is absent.

    Raises:
        zipfile.BadZipFile: If the path does not point to a valid zip archive.
    """
    # dict.fromkeys drops repeated keywords while keeping first-seen order.
    keywords = list(dict.fromkeys(
        line.strip().lower()
        for line in (keywords_text or "").strip().split("\n")
        if line.strip()
    ))
    columns = ["Nom du document", "N. mots", *keywords]

    # Nothing uploaded: return an empty table rather than failing on
    # ``None.name``.
    if zip_file is None:
        return pd.DataFrame(columns=columns)

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    zip_path = getattr(zip_file, "name", zip_file)

    # One record per file. Keying results by base name would merge documents
    # that share a name but live in different folders of the archive.
    rows = []
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        for root, dirs, files in os.walk(temp_dir):
            # Pruning in place keeps os.walk out of the macOS metadata folder;
            # sorting makes the row order reproducible.
            dirs[:] = sorted(d for d in dirs if d != "__MACOSX")
            for file_name in sorted(files):
                # "._name" entries are macOS resource forks, not documents.
                if not file_name.endswith(".txt") or file_name.startswith("._"):
                    continue
                file_path = os.path.join(root, file_name)
                # Read each file once; errors="replace" keeps one badly
                # encoded file from aborting the whole search.
                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                    text = f.read()

                lowered = text.lower()
                row = {"Nom du document": file_name, "N. mots": len(text.split())}
                for keyword in keywords:
                    occurrences = lowered.count(keyword)
                    # Absent keywords are shown as blank cells, not zeros.
                    row[keyword] = occurrences if occurrences > 0 else ""
                rows.append(row)

    return pd.DataFrame(rows, columns=columns)
96
+
97
+
98
+
99
+ # Function to export the DataFrame to Excel
100
def export_to_excel(df):
    """Write *df* to a new temporary ``.xlsx`` file and return the file's path.

    The temporary file is created with ``delete=False`` so it still exists
    after this function returns; this function never removes it.

    Args:
        df: The table to export; it is written without its index.

    Returns:
        The path of the Excel file that was written.
    """
    # Only the unique file name is needed, so the handle is closed before
    # pandas writes to that path.
    placeholder = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
    placeholder.close()

    df.to_excel(placeholder.name, index=False)
    return placeholder.name
107
+
108
+ # Create Gradio interface with one results table and export functionality
109
# NOTE(review): the nesting below was reconstructed from a diff view that had
# lost its indentation; confirm which components belong to which row.
with gr.Blocks() as demo:
    # Page title.
    gr.Markdown("# Recherche simple par mots-clés")

    # Upload widget for the zip archive of text files.
    with gr.Row():
        zip_file_input = gr.File(
            label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)",
        )

    # Free-text box for the keywords, one per line.
    with gr.Row():
        keywords_input = gr.Textbox(
            label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)",
            placeholder="mots-clés...",
            lines=10,
        )

    # Button that starts the keyword search.
    with gr.Row():
        search_button = gr.Button("Recherche")

    # Single read-only results table, shared by the upload preview and the search.
    with gr.Row():
        result_table = gr.DataFrame(
            label="Résultats",
            col_count=(1, "dynamic"),
            interactive=False,
            max_height=600,
        )

    # Excel export button and the download slot for the generated file.
    with gr.Row():
        export_button = gr.Button("Exporter vers Excel (.xlsx)")
        download_link = gr.File(label="Télécharger le fichier")

    # Uploading (or changing) the archive fills the table with names and word counts.
    zip_file_input.change(
        process_zip_initial,
        inputs=zip_file_input,
        outputs=result_table,
    )

    # Searching rebuilds the table with one extra column per keyword.
    search_button.click(
        process_zip_and_search,
        inputs=[zip_file_input, keywords_input],
        outputs=result_table,
    )

    # Exporting writes the current table to an .xlsx file and offers it for download.
    export_button.click(
        export_to_excel,
        inputs=result_table,
        outputs=download_link,
    )

# Start the app.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ pandas
3
+ openpyxl