KoonJamesZ committed on
Commit
9e036d9
·
verified ·
1 Parent(s): 92cea5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -63
app.py CHANGED
@@ -44,7 +44,7 @@ function refresh() {
44
  # Azure OpenAI setup
45
  # ===============================
46
  os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
47
- os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
48
 
49
  client = AzureOpenAI(
50
  api_version="2023-05-15",
@@ -186,64 +186,8 @@ def merge_files_with_mapping(file_paths, user_fields):
186
  final_df.to_csv("merged_data.csv", index=False)
187
  return final_df
188
 
189
- def extract_text_from_pdf(pdf_path):
190
- reader = PdfReader(pdf_path)
191
- text = ""
192
- for page in reader.pages:
193
- page_text = page.extract_text()
194
- if page_text:
195
- text += page_text
196
- return text
197
-
198
- def map_pdf_to_csv_structure(pdf_path, csv_df, user_fields):
199
- pdf_text = extract_text_from_pdf(pdf_path)
200
- column_headers = list(csv_df.columns)
201
- first_row_data = csv_df.iloc[0].to_dict() if len(csv_df) > 0 else {}
202
-
203
- prompt = (
204
- f"Based on the following document text extracted from a government project in Thailand:\n{pdf_text}\n\n"
205
- f"Please map the information to JSON format using the following structure:\n"
206
- f"Column Headers: {column_headers}\n"
207
- f"Example Data (from the first row of the CSV): {first_row_data}\n\n"
208
- "For each column header, extract the corresponding value from the document text. "
209
- "If a column header is not applicable or data is missing, use an empty string.\n\n"
210
- "Return only JSON with no additional explanations."
211
- )
212
-
213
- completion = client.chat.completions.create(
214
- model="gpt-4o",
215
- messages=[{"role": "user", "content": prompt}],
216
- temperature=0,
217
- response_format={"type": "json_object"},
218
- )
219
-
220
- try:
221
- response_text = completion.choices[0].message.content.strip()
222
- result_dict = json.loads(response_text)
223
- except Exception as e:
224
- raise ValueError(
225
- f"Error parsing LLM response for PDF mapping: {e}\n\nResponse:\n{completion.choices[0].message.content}"
226
- )
227
-
228
- if len(result_dict) == 1:
229
- # If there's only a single top-level key, use its value as data
230
- only_value = next(iter(result_dict.values()))
231
- new_data_df = pd.DataFrame(only_value)
232
- else:
233
- new_data_df = pd.DataFrame(result_dict)
234
-
235
- desired_columns = list(user_fields.keys())
236
- new_data_df = new_data_df.reindex(columns=desired_columns)
237
- return new_data_df
238
-
239
- def combine_all_data(file_paths, pdf_file, user_fields):
240
- merged_csv_df = merge_files_with_mapping(file_paths, user_fields)
241
-
242
- if pdf_file and os.path.exists(pdf_file):
243
- pdf_data_df = map_pdf_to_csv_structure(pdf_file, merged_csv_df, user_fields)
244
- final_df = pd.concat([merged_csv_df, pdf_data_df], ignore_index=True)
245
- else:
246
- final_df = merged_csv_df
247
 
248
  desired_columns = list(user_fields.keys())
249
  final_df = final_df.reindex(columns=desired_columns)
@@ -255,7 +199,7 @@ def combine_all_data(file_paths, pdf_file, user_fields):
255
  # ===============================
256
  # Gradio Interface Function
257
  # ===============================
258
- def process_data(files, pdf_file, field_text):
259
  """
260
  Main function for Gradio to handle user inputs:
261
  - files: list of CSV/Excel files
@@ -269,10 +213,9 @@ def process_data(files, pdf_file, field_text):
269
  return "No valid fields found. Please use the format:\n\nField Name: Description"
270
 
271
  file_paths = [f.name for f in files] if files else []
272
- pdf_path = pdf_file.name if pdf_file is not None else None
273
 
274
  try:
275
- final_df, absolute_path = combine_all_data(file_paths, pdf_path, user_fields)
276
  except Exception as e:
277
  return f"Error during processing: {e}"
278
 
@@ -304,7 +247,6 @@ with gr.Blocks(theme=basetheme,js=js_func,fill_height=True) as demo:
304
  fn=process_data,
305
  inputs=[
306
  gr.File(label="Upload CSV/Excel files", file_count="multiple",file_types=[".csv", ".xlsx", ".xls"]),
307
- gr.File(label="Upload PDF file (optional)", file_types=[".pdf"]),
308
  gr.Textbox(
309
  label="Desired Fields (one per line, use 'Field Name: Description' format)",
310
  placeholder="Example:\nName: Full name\nDOB: Date of birth\nAddress: Full address\n",
 
44
  # Azure OpenAI setup
45
  # ===============================
46
  os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
47
+ os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY") # Replace with your actual API key
48
 
49
  client = AzureOpenAI(
50
  api_version="2023-05-15",
 
186
  final_df.to_csv("merged_data.csv", index=False)
187
  return final_df
188
 
189
+ def combine_all_data(file_paths, user_fields):
190
+ final_df = merge_files_with_mapping(file_paths, user_fields)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  desired_columns = list(user_fields.keys())
193
  final_df = final_df.reindex(columns=desired_columns)
 
199
  # ===============================
200
  # Gradio Interface Function
201
  # ===============================
202
+ def process_data(files, field_text):
203
  """
204
  Main function for Gradio to handle user inputs:
205
  - files: list of CSV/Excel files
 
213
  return "No valid fields found. Please use the format:\n\nField Name: Description"
214
 
215
  file_paths = [f.name for f in files] if files else []
 
216
 
217
  try:
218
+ final_df, absolute_path = combine_all_data(file_paths, user_fields)
219
  except Exception as e:
220
  return f"Error during processing: {e}"
221
 
 
247
  fn=process_data,
248
  inputs=[
249
  gr.File(label="Upload CSV/Excel files", file_count="multiple",file_types=[".csv", ".xlsx", ".xls"]),
 
250
  gr.Textbox(
251
  label="Desired Fields (one per line, use 'Field Name: Description' format)",
252
  placeholder="Example:\nName: Full name\nDOB: Date of birth\nAddress: Full address\n",