Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -44,7 +44,7 @@ function refresh() {
|
|
44 |
# Azure OpenAI setup
|
45 |
# ===============================
|
46 |
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
|
47 |
-
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
|
48 |
|
49 |
client = AzureOpenAI(
|
50 |
api_version="2023-05-15",
|
@@ -186,64 +186,8 @@ def merge_files_with_mapping(file_paths, user_fields):
|
|
186 |
final_df.to_csv("merged_data.csv", index=False)
|
187 |
return final_df
|
188 |
|
189 |
-
def extract_text_from_pdf(pdf_path):
|
190 |
-
|
191 |
-
text = ""
|
192 |
-
for page in reader.pages:
|
193 |
-
page_text = page.extract_text()
|
194 |
-
if page_text:
|
195 |
-
text += page_text
|
196 |
-
return text
|
197 |
-
|
198 |
-
def map_pdf_to_csv_structure(pdf_path, csv_df, user_fields):
|
199 |
-
pdf_text = extract_text_from_pdf(pdf_path)
|
200 |
-
column_headers = list(csv_df.columns)
|
201 |
-
first_row_data = csv_df.iloc[0].to_dict() if len(csv_df) > 0 else {}
|
202 |
-
|
203 |
-
prompt = (
|
204 |
-
f"Based on the following document text extracted from a government project in Thailand:\n{pdf_text}\n\n"
|
205 |
-
f"Please map the information to JSON format using the following structure:\n"
|
206 |
-
f"Column Headers: {column_headers}\n"
|
207 |
-
f"Example Data (from the first row of the CSV): {first_row_data}\n\n"
|
208 |
-
"For each column header, extract the corresponding value from the document text. "
|
209 |
-
"If a column header is not applicable or data is missing, use an empty string.\n\n"
|
210 |
-
"Return only JSON with no additional explanations."
|
211 |
-
)
|
212 |
-
|
213 |
-
completion = client.chat.completions.create(
|
214 |
-
model="gpt-4o",
|
215 |
-
messages=[{"role": "user", "content": prompt}],
|
216 |
-
temperature=0,
|
217 |
-
response_format={"type": "json_object"},
|
218 |
-
)
|
219 |
-
|
220 |
-
try:
|
221 |
-
response_text = completion.choices[0].message.content.strip()
|
222 |
-
result_dict = json.loads(response_text)
|
223 |
-
except Exception as e:
|
224 |
-
raise ValueError(
|
225 |
-
f"Error parsing LLM response for PDF mapping: {e}\n\nResponse:\n{completion.choices[0].message.content}"
|
226 |
-
)
|
227 |
-
|
228 |
-
if len(result_dict) == 1:
|
229 |
-
# If there's only a single top-level key, use its value as data
|
230 |
-
only_value = next(iter(result_dict.values()))
|
231 |
-
new_data_df = pd.DataFrame(only_value)
|
232 |
-
else:
|
233 |
-
new_data_df = pd.DataFrame(result_dict)
|
234 |
-
|
235 |
-
desired_columns = list(user_fields.keys())
|
236 |
-
new_data_df = new_data_df.reindex(columns=desired_columns)
|
237 |
-
return new_data_df
|
238 |
-
|
239 |
-
def combine_all_data(file_paths, pdf_file, user_fields):
|
240 |
-
merged_csv_df = merge_files_with_mapping(file_paths, user_fields)
|
241 |
-
|
242 |
-
if pdf_file and os.path.exists(pdf_file):
|
243 |
-
pdf_data_df = map_pdf_to_csv_structure(pdf_file, merged_csv_df, user_fields)
|
244 |
-
final_df = pd.concat([merged_csv_df, pdf_data_df], ignore_index=True)
|
245 |
-
else:
|
246 |
-
final_df = merged_csv_df
|
247 |
|
248 |
desired_columns = list(user_fields.keys())
|
249 |
final_df = final_df.reindex(columns=desired_columns)
|
@@ -255,7 +199,7 @@ def combine_all_data(file_paths, pdf_file, user_fields):
|
|
255 |
# ===============================
|
256 |
# Gradio Interface Function
|
257 |
# ===============================
|
258 |
-
def process_data(files, pdf_file, field_text):
|
259 |
"""
|
260 |
Main function for Gradio to handle user inputs:
|
261 |
- files: list of CSV/Excel files
|
@@ -269,10 +213,9 @@ def process_data(files, pdf_file, field_text):
|
|
269 |
return "No valid fields found. Please use the format:\n\nField Name: Description"
|
270 |
|
271 |
file_paths = [f.name for f in files] if files else []
|
272 |
-
pdf_path = pdf_file.name if pdf_file is not None else None
|
273 |
|
274 |
try:
|
275 |
-
final_df, absolute_path = combine_all_data(file_paths, pdf_path, user_fields)
|
276 |
except Exception as e:
|
277 |
return f"Error during processing: {e}"
|
278 |
|
@@ -304,7 +247,6 @@ with gr.Blocks(theme=basetheme,js=js_func,fill_height=True) as demo:
|
|
304 |
fn=process_data,
|
305 |
inputs=[
|
306 |
gr.File(label="Upload CSV/Excel files", file_count="multiple",file_types=[".csv", ".xlsx", ".xls"]),
|
307 |
-
gr.File(label="Upload PDF file (optional)", file_types=[".pdf"]),
|
308 |
gr.Textbox(
|
309 |
label="Desired Fields (one per line, use 'Field Name: Description' format)",
|
310 |
placeholder="Example:\nName: Full name\nDOB: Date of birth\nAddress: Full address\n",
|
|
|
44 |
# Azure OpenAI setup
|
45 |
# ===============================
|
46 |
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
|
47 |
+
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY") # Replace with your actual API key
|
48 |
|
49 |
client = AzureOpenAI(
|
50 |
api_version="2023-05-15",
|
|
|
186 |
final_df.to_csv("merged_data.csv", index=False)
|
187 |
return final_df
|
188 |
|
189 |
+
def combine_all_data(file_paths, user_fields):
|
190 |
+
final_df = merge_files_with_mapping(file_paths, user_fields)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
desired_columns = list(user_fields.keys())
|
193 |
final_df = final_df.reindex(columns=desired_columns)
|
|
|
199 |
# ===============================
|
200 |
# Gradio Interface Function
|
201 |
# ===============================
|
202 |
+
def process_data(files, field_text):
|
203 |
"""
|
204 |
Main function for Gradio to handle user inputs:
|
205 |
- files: list of CSV/Excel files
|
|
|
213 |
return "No valid fields found. Please use the format:\n\nField Name: Description"
|
214 |
|
215 |
file_paths = [f.name for f in files] if files else []
|
|
|
216 |
|
217 |
try:
|
218 |
+
final_df, absolute_path = combine_all_data(file_paths, user_fields)
|
219 |
except Exception as e:
|
220 |
return f"Error during processing: {e}"
|
221 |
|
|
|
247 |
fn=process_data,
|
248 |
inputs=[
|
249 |
gr.File(label="Upload CSV/Excel files", file_count="multiple",file_types=[".csv", ".xlsx", ".xls"]),
|
|
|
250 |
gr.Textbox(
|
251 |
label="Desired Fields (one per line, use 'Field Name: Description' format)",
|
252 |
placeholder="Example:\nName: Full name\nDOB: Date of birth\nAddress: Full address\n",
|