gloignon committed on
Commit
d417898
·
verified ·
1 Parent(s): 4910a63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -142
app.py CHANGED
@@ -1,143 +1,143 @@
1
- import gradio as gr
2
- import zipfile
3
- import os
4
- import tempfile
5
- import pandas as pd
6
-
7
- # Function to process the zip file, get document names, and calculate word counts (without keywords initially)
8
- def process_zip_initial(zip_file):
9
- # Create a temporary directory to extract files
10
- with tempfile.TemporaryDirectory() as temp_dir:
11
- # Extract the zip file
12
- with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
13
- zip_ref.extractall(temp_dir)
14
-
15
- # Recursively get list of all .txt files in all directories and calculate word count
16
- txt_files = []
17
- word_counts = []
18
- for root, dirs, files in os.walk(temp_dir):
19
- for file in files:
20
- if file.endswith('.txt'):
21
- file_path = os.path.join(root, file)
22
- txt_files.append(os.path.basename(file_path)) # Only the file name
23
-
24
- # Calculate word count
25
- with open(file_path, 'r', encoding='utf-8') as f:
26
- text = f.read()
27
- word_count = len(text.split()) # Split text by spaces to count words
28
- word_counts.append(word_count)
29
-
30
- # Create a DataFrame with document names and word counts
31
- df = pd.DataFrame({"Nom du document": txt_files, "N. mots": word_counts})
32
-
33
- return df
34
-
35
- # Function to process the zip file and search for keywords (while preserving Word Count)
36
- def process_zip_and_search(zip_file, keywords_text):
37
- # Create a temporary directory to extract files
38
- with tempfile.TemporaryDirectory() as temp_dir:
39
- # Extract the zip file
40
- with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
41
- zip_ref.extractall(temp_dir)
42
-
43
- # Recursively get list of all .txt files in all directories
44
- txt_files = []
45
- word_counts = []
46
- base_names = []
47
- for root, dirs, files in os.walk(temp_dir):
48
- for file in files:
49
- if file.endswith('.txt'):
50
- file_path = os.path.join(root, file) # Full path to the file
51
- txt_files.append(file_path) # Store the full path
52
- base_names.append(os.path.basename(file_path)) # Store just the base name for display
53
-
54
- # Calculate word count
55
- with open(file_path, 'r', encoding='utf-8') as f:
56
- text = f.read()
57
- word_count = len(text.split()) # Split text by spaces to count words
58
- word_counts.append(word_count)
59
-
60
- # Read keywords (split by line, supports multiword expressions)
61
- keywords = [keyword.strip().lower() for keyword in keywords_text.strip().split("\n") if keyword.strip()] # Convert keywords to lowercase
62
-
63
- # Prepare a dictionary to store the results (initialize with Document Name and Word Count)
64
- results = {base_name: {keyword: "" for keyword in keywords} for base_name in base_names}
65
-
66
- # Search for keyword frequencies in each text file (support multiword)
67
- for i, txt_file in enumerate(txt_files):
68
- with open(txt_file, 'r', encoding='utf-8') as file:
69
- text = file.read().lower() # Convert to lowercase for case-insensitive search
70
-
71
- # Count occurrences of each keyword (multiword expressions supported)
72
- for keyword in keywords:
73
- keyword_count = text.count(keyword.lower()) # Use count() to get occurrences (both text and keywords are lowercased)
74
- if keyword_count > 0:
75
- results[base_names[i]][keyword] = keyword_count
76
-
77
- # Convert the results dictionary to a DataFrame
78
- df_keywords = pd.DataFrame(results).T # Transpose to have files as rows and keywords as columns
79
-
80
- # Reset index to make the document names a column
81
- df_keywords.reset_index(inplace=True)
82
-
83
- # Rename the first column to 'Document Name'
84
- df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)
85
-
86
- # Replace 0 frequencies with empty strings
87
- df_keywords.replace(0, "", inplace=True)
88
-
89
- # Create a DataFrame with document names and word counts
90
- df_word_counts = pd.DataFrame({"Nom du document": base_names, "N. mots": word_counts})
91
-
92
- # Merge the Word Count DataFrame with the Keywords DataFrame on "Document Name"
93
- final_df = pd.merge(df_word_counts, df_keywords, on="Nom du document")
94
-
95
- return final_df
96
-
97
-
98
-
99
- # Function to export the DataFrame to Excel
100
- def export_to_excel(df):
101
- # Create a temporary directory for storing the Excel file
102
- with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
103
- excel_path = tmp.name
104
- # Save the DataFrame to Excel
105
- df.to_excel(excel_path, index=False)
106
- return excel_path
107
-
108
- # Create Gradio interface with one results table and export functionality
109
- with gr.Blocks() as demo:
110
- gr.Markdown("# Recherche simple par mots-clés") # This line adds the title
111
-
112
- with gr.Row():
113
- # File upload and initial table with document names
114
- zip_file_input = gr.File(label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)")
115
-
116
- with gr.Row():
117
- # Textbox for entering keywords
118
- keywords_input = gr.Textbox(label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)", placeholder="mots-clés...", lines=10)
119
-
120
- with gr.Row():
121
- # Button to trigger keyword search
122
- search_button = gr.Button("Recherche")
123
-
124
- # Output the final results table after the search button
125
- with gr.Row():
126
- result_table = gr.DataFrame(label="Résultats", col_count = (1, "dynamic"), interactive=False, max_height=600) # Disable renaming/editing
127
-
128
- # Button to trigger the Excel export
129
- with gr.Row():
130
- export_button = gr.Button("Exporter vers Excel (.xlsx)")
131
- download_link = gr.File(label="Télécharger le fichier")
132
-
133
- # Action to display document names upon ZIP upload
134
- zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)
135
-
136
- # Action to update the table with keywords and results
137
- search_button.click(fn=process_zip_and_search, inputs=[zip_file_input, keywords_input], outputs=result_table)
138
-
139
- # Action to export the results to Excel
140
- export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)
141
-
142
- # Launch the app
143
  demo.launch()
 
1
+ import gradio as gr
2
+ import zipfile
3
+ import os
4
+ import tempfile
5
+ import pandas as pd
6
+
7
# Build the initial results table: document names and word counts, no keywords yet.
def process_zip_initial(zip_file):
    """Extract an uploaded zip archive and count the words of every .txt file.

    Args:
        zip_file: The uploaded archive, given either as an object exposing the
            archive path as ``.name`` or as the path itself (``str`` or
            ``os.PathLike``). ``None`` (nothing uploaded, or the upload was
            cleared) yields an empty table instead of raising.

    Returns:
        pandas.DataFrame with one row per .txt file found at any depth of the
        archive and two columns: "Nom du document" (file base name) and
        "N. mots" (number of whitespace-separated tokens).

    Raises:
        zipfile.BadZipFile: If the uploaded file is not a valid zip archive.
    """
    columns = ["Nom du document", "N. mots"]
    if zip_file is None:
        return pd.DataFrame(columns=columns)

    # Path-like inputs are used as-is; upload objects carry the path in `.name`.
    if isinstance(zip_file, (str, os.PathLike)):
        archive_path = zip_file
    else:
        archive_path = zip_file.name

    rows = []
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        for root, dirs, files in os.walk(temp_dir):
            # os.walk order is filesystem-dependent; sort for a stable table.
            dirs.sort()
            for file_name in sorted(files):
                if not file_name.lower().endswith(".txt"):
                    continue
                file_path = os.path.join(root, file_name)
                # errors="replace" keeps one badly encoded file from aborting
                # the whole upload; undecodable bytes become U+FFFD.
                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                    word_count = len(f.read().split())
                rows.append((file_name, word_count))

    return pd.DataFrame(rows, columns=columns)
34
+
35
# Build the results table with word counts plus one column per keyword.
def process_zip_and_search(zip_file, keywords_text):
    """Count keyword occurrences in every .txt file of an uploaded zip archive.

    Matching is case-insensitive and substring-based (``str.count``), so a
    keyword may span several words and also matches inside longer words.

    Args:
        zip_file: The uploaded archive, given either as an object exposing the
            archive path as ``.name`` or as the path itself (``str`` or
            ``os.PathLike``). ``None`` yields an empty table.
        keywords_text: Keywords, one per line. Blank lines are ignored,
            surrounding whitespace is stripped, keywords are lowercased and
            duplicates are dropped (first occurrence keeps its position).
            ``None`` is treated as no keywords.

    Returns:
        pandas.DataFrame with one row per .txt file and the columns
        "Nom du document", "N. mots", then one column per keyword. A keyword
        cell holds the occurrence count, or "" when the keyword is absent.

    Raises:
        zipfile.BadZipFile: If the uploaded file is not a valid zip archive.
    """
    name_col, count_col = "Nom du document", "N. mots"

    keywords = []
    for line in (keywords_text or "").split("\n"):
        keyword = line.strip().lower()
        if keyword and keyword not in keywords:
            keywords.append(keyword)
    columns = [name_col, count_col, *keywords]

    if zip_file is None:
        return pd.DataFrame(columns=columns)

    # Path-like inputs are used as-is; upload objects carry the path in `.name`.
    if isinstance(zip_file, (str, os.PathLike)):
        archive_path = zip_file
    else:
        archive_path = zip_file.name

    # One row per file, built directly. Keying results by base name and merging
    # afterwards mixed up files that share a name in different folders.
    rows = []
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        for root, dirs, files in os.walk(temp_dir):
            # os.walk order is filesystem-dependent; sort for a stable table.
            dirs.sort()
            for file_name in sorted(files):
                if not file_name.lower().endswith(".txt"):
                    continue
                file_path = os.path.join(root, file_name)
                # errors="replace" keeps one badly encoded file from aborting
                # the whole search; undecodable bytes become U+FFFD.
                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                    text = f.read()

                lowered = text.lower()
                row = {name_col: file_name, count_col: len(text.split())}
                for keyword in keywords:
                    occurrences = lowered.count(keyword)
                    # Absent keywords are shown as blank cells, not zeros.
                    row[keyword] = occurrences if occurrences > 0 else ""
                rows.append(row)

    return pd.DataFrame(rows, columns=columns)
96
+
97
+
98
+
99
# Write the results table to an Excel workbook that can be offered for download.
def export_to_excel(df):
    """Save *df* as a temporary .xlsx file and return that file's path.

    The file is created with ``delete=False`` so it still exists after this
    function returns; nothing here removes it afterwards.

    Args:
        df: Object providing ``to_excel(path, index=False)``.

    Returns:
        Path of the written .xlsx file, as a string.
    """
    # Reserve a uniquely named .xlsx path, closing the handle before writing to it.
    placeholder = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
    placeholder.close()
    target_path = placeholder.name

    # The index is left out so the sheet contains only the table's own columns.
    df.to_excel(target_path, index=False)
    return target_path
107
+
108
# Build the Gradio interface: upload, keyword box, one shared results table,
# and an Excel export. Both the upload and the search write to the same table.
with gr.Blocks() as demo:
    gr.Markdown("# Recherche simple par mots-clés")  # Page title

    with gr.Row():
        # Upload widget for the .zip archive of .txt documents
        zip_file_input = gr.File(label="Téléversez votre dossier .zip contenant les fichiers texte (format .txt)")

    with gr.Row():
        # Multi-line textbox: one keyword or multi-word expression per line
        keywords_input = gr.Textbox(label="Entrez les mots clés (un par ligne, peuvent contenir plus d'un mot)", placeholder="mots-clés...", lines=10)

    with gr.Row():
        # Button that triggers the keyword search
        search_button = gr.Button("Recherche")

    # Results table, placed after the search button; read-only for the user
    with gr.Row():
        result_table = gr.DataFrame(label="Résultats", col_count = (1, "dynamic"), interactive=False)

    # Export button and the file component that receives the generated workbook
    with gr.Row():
        export_button = gr.Button("Exporter vers Excel (.xlsx)")
        download_link = gr.File(label="Télécharger le fichier")

    # On upload change: fill the table with document names and word counts
    zip_file_input.change(fn=process_zip_initial, inputs=zip_file_input, outputs=result_table)

    # On search click: rebuild the table with one extra column per keyword
    search_button.click(fn=process_zip_and_search, inputs=[zip_file_input, keywords_input], outputs=result_table)

    # On export click: write the table currently displayed to an .xlsx file
    export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)

# Launch the app
# NOTE(review): the original indentation is not recoverable from this view; the
# launch call is assumed to sit outside the Blocks context — confirm.
demo.launch()