fixed keyword removal bug
Browse files
app.py
CHANGED
@@ -68,38 +68,43 @@ def process_zip_and_search(keywords_text, search_mode):
|
|
68 |
global raw_corpus, lemmatized_corpus, initial_df # Use the texts stored at corpus upload and initial DataFrame
|
69 |
|
70 |
# Read the keywords (no lemmatization of keywords)
|
71 |
-
keywords = [
|
72 |
-
|
|
|
|
|
|
|
|
|
73 |
# Select the appropriate corpus based on the search mode
|
74 |
corpus = lemmatized_corpus if search_mode == "Lemmes" else raw_corpus
|
75 |
-
|
76 |
# Prepare a dictionary to store the results (initialize with Document Name and empty results)
|
77 |
results = {doc_name: {keyword: "" for keyword in keywords} for doc_name in corpus.keys()}
|
78 |
-
|
79 |
# Search for keyword frequencies in each text file
|
80 |
for doc_name, text in corpus.items():
|
81 |
for keyword in keywords:
|
82 |
keyword_count = text.count(keyword) # Count occurrences of each keyword
|
83 |
if keyword_count > 0:
|
84 |
results[doc_name][keyword] = keyword_count
|
85 |
-
|
86 |
# Convert the results dictionary to a DataFrame
|
87 |
df_keywords = pd.DataFrame(results).T # Transpose to have files as rows and keywords as columns
|
88 |
-
|
89 |
# Reset index to make the document names a column
|
90 |
df_keywords.reset_index(inplace=True)
|
91 |
-
|
92 |
# Rename the first column to 'Nom du document'
|
93 |
df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)
|
94 |
-
|
95 |
# Replace 0 frequencies with empty strings
|
96 |
df_keywords.replace(0, "", inplace=True)
|
97 |
-
|
98 |
# Merge the initial DataFrame with the keyword search results
|
99 |
final_df = pd.merge(initial_df, df_keywords, on="Nom du document", how="left")
|
100 |
-
|
101 |
return final_df
|
102 |
|
|
|
103 |
# Function to export the DataFrame to Excel
|
104 |
def export_to_excel(df):
|
105 |
# Create a temporary directory for storing the Excel file
|
@@ -107,7 +112,7 @@ def export_to_excel(df):
|
|
107 |
excel_path = tmp.name
|
108 |
# Save the DataFrame to Excel
|
109 |
df.to_excel(excel_path, index=False)
|
110 |
-
return
|
111 |
|
112 |
# Create Gradio interface with one results table and export functionality
|
113 |
with gr.Blocks() as demo:
|
@@ -148,4 +153,4 @@ with gr.Blocks() as demo:
|
|
148 |
export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)
|
149 |
|
150 |
# Launch the app
|
151 |
-
demo.launch()
|
|
|
68 |
global raw_corpus, lemmatized_corpus, initial_df # Use the texts stored at corpus upload and initial DataFrame
|
69 |
|
70 |
# Read the keywords (no lemmatization of keywords)
|
71 |
+
keywords = [keyword.strip().lower() for keyword in keywords_text.strip().split("\n") if keyword.strip()]
|
72 |
+
|
73 |
+
if not keywords:
|
74 |
+
# If no keywords are provided, return the initial DataFrame (without the keyword columns)
|
75 |
+
return initial_df
|
76 |
+
|
77 |
# Select the appropriate corpus based on the search mode
|
78 |
corpus = lemmatized_corpus if search_mode == "Lemmes" else raw_corpus
|
79 |
+
|
80 |
# Prepare a dictionary to store the results (initialize with Document Name and empty results)
|
81 |
results = {doc_name: {keyword: "" for keyword in keywords} for doc_name in corpus.keys()}
|
82 |
+
|
83 |
# Search for keyword frequencies in each text file
|
84 |
for doc_name, text in corpus.items():
|
85 |
for keyword in keywords:
|
86 |
keyword_count = text.count(keyword) # Count occurrences of each keyword
|
87 |
if keyword_count > 0:
|
88 |
results[doc_name][keyword] = keyword_count
|
89 |
+
|
90 |
# Convert the results dictionary to a DataFrame
|
91 |
df_keywords = pd.DataFrame(results).T # Transpose to have files as rows and keywords as columns
|
92 |
+
|
93 |
# Reset index to make the document names a column
|
94 |
df_keywords.reset_index(inplace=True)
|
95 |
+
|
96 |
# Rename the first column to 'Nom du document'
|
97 |
df_keywords.rename(columns={"index": "Nom du document"}, inplace=True)
|
98 |
+
|
99 |
# Replace 0 frequencies with empty strings
|
100 |
df_keywords.replace(0, "", inplace=True)
|
101 |
+
|
102 |
# Merge the initial DataFrame with the keyword search results
|
103 |
final_df = pd.merge(initial_df, df_keywords, on="Nom du document", how="left")
|
104 |
+
|
105 |
return final_df
|
106 |
|
107 |
+
|
108 |
# Function to export the DataFrame to Excel
|
109 |
def export_to_excel(df):
|
110 |
# Create a temporary directory for storing the Excel file
|
|
|
112 |
excel_path = tmp.name
|
113 |
# Save the DataFrame to Excel
|
114 |
df.to_excel(excel_path, index=False)
|
115 |
+
return excel_path
|
116 |
|
117 |
# Create Gradio interface with one results table and export functionality
|
118 |
with gr.Blocks() as demo:
|
|
|
153 |
export_button.click(fn=export_to_excel, inputs=result_table, outputs=download_link)
|
154 |
|
155 |
# Launch the app
|
156 |
+
demo.launch()
|