Update app.py
Browse files
app.py
CHANGED
@@ -1,143 +1,143 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import zipfile
|
3 |
-
import os
|
4 |
-
import tempfile
|
5 |
-
import pandas as pd
|
6 |
-
|
7 |
-
# Function to process the zip file, get document names, and calculate word counts (without keywords initially)
|
8 |
-
def process_zip_initial(zip_file):
    """List the .txt documents of an uploaded zip archive with their word counts.

    Args:
        zip_file: The uploaded archive, either an object exposing the archive
            path as ``.name`` or the path itself as a string. ``None`` (no
            file selected, e.g. after the upload was cleared) is accepted.

    Returns:
        A pandas DataFrame with one row per ``.txt`` file found at any depth
        of the archive and two columns: "Nom du document" (file name without
        its directory) and "N. mots" (number of whitespace-separated tokens).
        The frame has no rows when ``zip_file`` is ``None`` or when the
        archive contains no ``.txt`` file.

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
    """
    names = []
    word_counts = []
    if zip_file is None:
        # Nothing uploaded: show an empty table instead of failing on `.name`.
        return pd.DataFrame({"Nom du document": names, "N. mots": word_counts})

    archive_path = getattr(zip_file, "name", zip_file)
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        for root, dirs, files in os.walk(temp_dir):
            # Sorting in place makes os.walk descend in a stable order, so the
            # table does not depend on the filesystem's directory ordering.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".txt"):
                    continue
                # errors="replace": one badly encoded document must not abort
                # the whole listing; undecodable bytes become U+FFFD.
                with open(
                    os.path.join(root, file), "r", encoding="utf-8", errors="replace"
                ) as f:
                    text = f.read()
                names.append(file)
                # split() without arguments splits on any run of whitespace.
                word_counts.append(len(text.split()))

    return pd.DataFrame({"Nom du document": names, "N. mots": word_counts})
|
34 |
-
|
35 |
-
# Function to process the zip file and search for keywords (while preserving Word Count)
|
36 |
-
def process_zip_and_search(zip_file, keywords_text):
    """Count keyword occurrences in every .txt document of an uploaded zip archive.

    Args:
        zip_file: The uploaded archive, either an object exposing the archive
            path as ``.name`` or the path itself as a string. ``None`` is
            accepted and yields a table without rows.
        keywords_text: Keywords, one per line; a keyword may contain several
            words. Blank lines are ignored, keywords are lowercased and
            repeated keywords are kept once. ``None`` means "no keywords".

    Returns:
        A pandas DataFrame with one row per ``.txt`` file found at any depth
        of the archive. Columns are "Nom du document" (file name without its
        directory), "N. mots" (number of whitespace-separated tokens) and one
        column per keyword holding the number of case-insensitive occurrences
        in that file, or "" when the keyword does not occur. Matching is
        plain substring counting (``str.count``), so a keyword is also found
        inside longer words.

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
    """
    # dict.fromkeys removes duplicate keywords while keeping the input order.
    keywords = list(
        dict.fromkeys(
            line.strip().lower()
            for line in (keywords_text or "").split("\n")
            if line.strip()
        )
    )
    columns = ["Nom du document", "N. mots", *keywords]

    # One dict per file, built positionally: files that share a base name in
    # different folders keep separate rows with their own counts.
    rows = []
    if zip_file is not None:
        archive_path = getattr(zip_file, "name", zip_file)
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(temp_dir)

            for root, dirs, files in os.walk(temp_dir):
                # Sorting in place makes os.walk descend in a stable order.
                dirs.sort()
                for file in sorted(files):
                    if not file.endswith(".txt"):
                        continue
                    # Each file is read once; errors="replace" keeps one badly
                    # encoded document from aborting the whole search.
                    with open(
                        os.path.join(root, file), "r", encoding="utf-8", errors="replace"
                    ) as f:
                        text = f.read()
                    lowered = text.lower()
                    row = {"Nom du document": file, "N. mots": len(text.split())}
                    for keyword in keywords:
                        # Absent keywords are shown as an empty cell, not 0.
                        row[keyword] = lowered.count(keyword) or ""
                    rows.append(row)

    return pd.DataFrame(rows, columns=columns)
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
# Function to export the DataFrame to Excel
|
100 |
-
def export_to_excel(df):
    """Write the results table to a temporary .xlsx file and return its path.

    Args:
        df: Table to export; it must provide ``to_excel`` (a pandas DataFrame).

    Returns:
        Path of the Excel file that was written. The file is not removed
        automatically, so it still exists when the caller serves it.
    """
    # Reserve a unique path and close the handle right away: on Windows a
    # temporary file that is still open cannot be opened a second time, which
    # is exactly what the Excel writer has to do.
    fd, excel_path = tempfile.mkstemp(suffix=".xlsx")
    os.close(fd)
    try:
        df.to_excel(excel_path, index=False)
    except Exception:
        # Do not leave the empty placeholder behind when the export fails.
        os.remove(excel_path)
        raise
    return excel_path
|
107 |
-
|
108 |
-
# Create Gradio interface with one results table and export functionality
|
109 |
-
# Page layout, top to bottom: title, archive upload, keyword box, search
# button, results table, Excel export. Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown("# Recherche simple par mots-clés")  # Page title

    with gr.Row():
        # Archive upload. Only .zip is offered because the handlers open the
        # upload with zipfile and fail on anything else.
        zip_file_input = gr.File(
            label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)",
            file_types=[".zip"],
        )

    with gr.Row():
        # Keywords, one per line
        keywords_input = gr.Textbox(
            label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)",
            placeholder="mots-clés...",
            lines=10,
        )

    with gr.Row():
        # Button that triggers the keyword search
        search_button = gr.Button("Recherche")

    with gr.Row():
        # Single read-only results table; its columns grow with the keywords
        result_table = gr.DataFrame(
            label="Résultats",
            col_count=(1, "dynamic"),
            interactive=False,
        )

    with gr.Row():
        # Excel export button and the resulting download
        export_button = gr.Button("Exporter vers Excel (.xlsx)")
        download_link = gr.File(label="Télécharger le fichier")

    # Show document names and word counts as soon as an archive is uploaded
    zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)

    # Rebuild the table with one column per keyword
    search_button.click(
        fn=process_zip_and_search,
        inputs=[zip_file_input, keywords_input],
        outputs=result_table,
    )

    # Export the table currently displayed to an Excel file
    export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)

# Launch the app
demo.launch()
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import zipfile
|
3 |
+
import os
|
4 |
+
import tempfile
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
# Function to process the zip file, get document names, and calculate word counts (without keywords initially)
|
8 |
+
def process_zip_initial(zip_file):
    """List the .txt documents of an uploaded zip archive with their word counts.

    Args:
        zip_file: The uploaded archive, either an object exposing the archive
            path as ``.name`` or the path itself as a string. ``None`` (no
            file selected, e.g. after the upload was cleared) is accepted.

    Returns:
        A pandas DataFrame with one row per ``.txt`` file found at any depth
        of the archive and two columns: "Nom du document" (file name without
        its directory) and "N. mots" (number of whitespace-separated tokens).
        The frame has no rows when ``zip_file`` is ``None`` or when the
        archive contains no ``.txt`` file.

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
    """
    names = []
    word_counts = []
    if zip_file is None:
        # Nothing uploaded: show an empty table instead of failing on `.name`.
        return pd.DataFrame({"Nom du document": names, "N. mots": word_counts})

    archive_path = getattr(zip_file, "name", zip_file)
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        for root, dirs, files in os.walk(temp_dir):
            # Sorting in place makes os.walk descend in a stable order, so the
            # table does not depend on the filesystem's directory ordering.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".txt"):
                    continue
                # errors="replace": one badly encoded document must not abort
                # the whole listing; undecodable bytes become U+FFFD.
                with open(
                    os.path.join(root, file), "r", encoding="utf-8", errors="replace"
                ) as f:
                    text = f.read()
                names.append(file)
                # split() without arguments splits on any run of whitespace.
                word_counts.append(len(text.split()))

    return pd.DataFrame({"Nom du document": names, "N. mots": word_counts})
|
34 |
+
|
35 |
+
# Function to process the zip file and search for keywords (while preserving Word Count)
|
36 |
+
def process_zip_and_search(zip_file, keywords_text):
    """Count keyword occurrences in every .txt document of an uploaded zip archive.

    Args:
        zip_file: The uploaded archive, either an object exposing the archive
            path as ``.name`` or the path itself as a string. ``None`` is
            accepted and yields a table without rows.
        keywords_text: Keywords, one per line; a keyword may contain several
            words. Blank lines are ignored, keywords are lowercased and
            repeated keywords are kept once. ``None`` means "no keywords".

    Returns:
        A pandas DataFrame with one row per ``.txt`` file found at any depth
        of the archive. Columns are "Nom du document" (file name without its
        directory), "N. mots" (number of whitespace-separated tokens) and one
        column per keyword holding the number of case-insensitive occurrences
        in that file, or "" when the keyword does not occur. Matching is
        plain substring counting (``str.count``), so a keyword is also found
        inside longer words.

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
    """
    # dict.fromkeys removes duplicate keywords while keeping the input order.
    keywords = list(
        dict.fromkeys(
            line.strip().lower()
            for line in (keywords_text or "").split("\n")
            if line.strip()
        )
    )
    columns = ["Nom du document", "N. mots", *keywords]

    # One dict per file, built positionally: files that share a base name in
    # different folders keep separate rows with their own counts.
    rows = []
    if zip_file is not None:
        archive_path = getattr(zip_file, "name", zip_file)
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(temp_dir)

            for root, dirs, files in os.walk(temp_dir):
                # Sorting in place makes os.walk descend in a stable order.
                dirs.sort()
                for file in sorted(files):
                    if not file.endswith(".txt"):
                        continue
                    # Each file is read once; errors="replace" keeps one badly
                    # encoded document from aborting the whole search.
                    with open(
                        os.path.join(root, file), "r", encoding="utf-8", errors="replace"
                    ) as f:
                        text = f.read()
                    lowered = text.lower()
                    row = {"Nom du document": file, "N. mots": len(text.split())}
                    for keyword in keywords:
                        # Absent keywords are shown as an empty cell, not 0.
                        row[keyword] = lowered.count(keyword) or ""
                    rows.append(row)

    return pd.DataFrame(rows, columns=columns)
|
96 |
+
|
97 |
+
|
98 |
+
|
99 |
+
# Function to export the DataFrame to Excel
|
100 |
+
def export_to_excel(df):
    """Write the results table to a temporary .xlsx file and return its path.

    Args:
        df: Table to export; it must provide ``to_excel`` (a pandas DataFrame).

    Returns:
        Path of the Excel file that was written. The file is not removed
        automatically, so it still exists when the caller serves it.
    """
    # Reserve a unique path and close the handle right away: on Windows a
    # temporary file that is still open cannot be opened a second time, which
    # is exactly what the Excel writer has to do.
    fd, excel_path = tempfile.mkstemp(suffix=".xlsx")
    os.close(fd)
    try:
        df.to_excel(excel_path, index=False)
    except Exception:
        # Do not leave the empty placeholder behind when the export fails.
        os.remove(excel_path)
        raise
    return excel_path
|
107 |
+
|
108 |
+
# Create Gradio interface with one results table and export functionality
|
109 |
+
# Page layout, top to bottom: title, archive upload, keyword box, search
# button, results table, Excel export. Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown("# Recherche simple par mots-clés")  # Page title

    with gr.Row():
        # Archive upload. Only .zip is offered because the handlers open the
        # upload with zipfile and fail on anything else.
        zip_file_input = gr.File(
            label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)",
            file_types=[".zip"],
        )

    with gr.Row():
        # Keywords, one per line
        keywords_input = gr.Textbox(
            label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)",
            placeholder="mots-clés...",
            lines=10,
        )

    with gr.Row():
        # Button that triggers the keyword search
        search_button = gr.Button("Recherche")

    with gr.Row():
        # Single read-only results table; its columns grow with the keywords
        result_table = gr.DataFrame(
            label="Résultats",
            col_count=(1, "dynamic"),
            interactive=False,
        )

    with gr.Row():
        # Excel export button and the resulting download
        export_button = gr.Button("Exporter vers Excel (.xlsx)")
        download_link = gr.File(label="Télécharger le fichier")

    # Show document names and word counts as soon as an archive is uploaded
    zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)

    # Rebuild the table with one column per keyword
    search_button.click(
        fn=process_zip_and_search,
        inputs=[zip_file_input, keywords_input],
        outputs=result_table,
    )

    # Export the table currently displayed to an Excel file
    export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)

# Launch the app
demo.launch()
|