Betimes committed on
Commit
51f8d0e
·
verified ·
1 Parent(s): f8d2713

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +210 -72
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import pandas as pd
3
  import json
4
- import ast
5
  import gradio as gr
6
  from openai import AzureOpenAI
7
  from PyPDF2 import PdfReader
@@ -41,123 +40,250 @@ function refresh() {
41
  }
42
  }
43
  """
44
-
45
  # Azure OpenAI setup
 
46
  os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
47
  os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
48
- deployment = os.getenv("AZURE_OPENAI_AI_DEPLOYMENT")
49
 
50
  client = AzureOpenAI(
51
  api_version="2023-05-15",
52
- azure_deployment=deployment,
53
  )
54
- # Step 1: Read files and collect column names and first rows
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def read_file_metadata(file_path):
56
  df = pd.read_csv(file_path)
57
  column_names = list(df.columns)
58
- first_row = df.iloc[0].to_dict() # Convert first row to a dictionary
59
- return column_names, first_row
 
 
 
 
 
 
 
 
60
 
61
- # Step 2: Create the prompt for column mapping
62
- def create_column_mapping_prompt(metadata):
63
  prompt = (
64
- "You are given CSV data from different sources, where column names for similar data vary slightly. "
65
- "Your task is to suggest mappings to unify columns with similar content under a single name.\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  )
67
- for i, (file_path, column_names, first_row) in enumerate(metadata):
68
- prompt += f"Data from {file_path}:\n"
69
- prompt += f"Column names: {column_names}\n"
70
- prompt += f"Example row: {first_row}\n\n"
71
- prompt += "Suggest mappings to standardize the columns across these files. Please return in JSON format."
72
  return prompt
73
 
74
- # Step 3: Call the LLM to get the column mapping
75
- def get_column_mapping(file_metadata):
76
- column_match_prompt = create_column_mapping_prompt(file_metadata)
77
  completion = client.chat.completions.create(
78
  model="gpt-4o",
79
  messages=[{"role": "user", "content": column_match_prompt}],
80
- temperature=0,
81
  response_format={"type": "json_object"},
82
  )
83
- print(completion.choices[0].message.content)
84
- result_dict = ast.literal_eval(completion.choices[0].message.content)
85
- return result_dict
86
 
87
- # Step 4: Apply the mapping and merge data
88
- def merge_files_with_mapping(file_paths):
 
 
 
 
 
 
 
 
 
89
  file_metadata = []
90
  for file_path in file_paths:
91
- column_names, first_row = read_file_metadata(file_path)
92
- file_metadata.append((file_path, column_names, first_row))
 
 
 
 
 
93
 
94
- result_dict = get_column_mapping(file_metadata)
 
95
 
96
  all_data = []
97
  for file_path in file_paths:
98
- df = pd.read_csv(file_path)
99
- df.rename(columns=result_dict, inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  all_data.append(df)
101
 
 
 
 
102
  final_df = pd.concat(all_data, ignore_index=True)
 
 
 
 
103
  final_df.to_csv("merged_data.csv", index=False)
104
  return final_df
105
 
106
- # Step 5: Extract text from PDF
107
  def extract_text_from_pdf(pdf_path):
108
  reader = PdfReader(pdf_path)
109
  text = ""
110
  for page in reader.pages:
111
- text += page.extract_text() or ""
 
 
112
  return text
113
 
114
- # Step 6: Call the LLM for PDF data mapping
115
- def map_pdf_to_csv_structure(pdf_path, csv_df):
116
  pdf_text = extract_text_from_pdf(pdf_path)
117
  column_headers = list(csv_df.columns)
118
- first_row_data = csv_df.iloc[0].to_dict()
119
-
120
- prompt = f"""
121
- Based on the following document text extracted from a government project in Thailand:
122
- {pdf_text}
123
-
124
- Please map the information to JSON format using the following structure:
125
- Column Headers: {column_headers}
126
- Example Data (from the first row of the CSV): {first_row_data}
127
 
128
- Use the column headers as keys and fill in values based on the information from the document.
129
- If a key is not applicable or data is missing, leave the value as an empty string.
 
 
 
 
 
 
 
130
 
131
- Return only JSON with no additional explanations or modifications.
132
- """
133
  completion = client.chat.completions.create(
134
  model="gpt-4o",
135
  messages=[{"role": "user", "content": prompt}],
136
  temperature=0,
137
  response_format={"type": "json_object"},
138
  )
139
- result_dict = ast.literal_eval(completion.choices[0].message.content)
140
- new_data_df = pd.DataFrame([result_dict])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  return new_data_df
142
 
143
- # Step 7: Combine all data and save as final merged CSV
144
- def combine_all_data(csv_files, pdf_file):
145
- merged_csv_df = merge_files_with_mapping(csv_files)
146
- pdf_data_df = map_pdf_to_csv_structure(pdf_file, merged_csv_df)
147
- final_df = pd.concat([merged_csv_df, pdf_data_df], ignore_index=True)
 
 
 
 
 
 
 
148
  final_df.to_csv("merged_all_data.csv", index=False)
149
  return final_df
150
 
151
- # Gradio interface
152
- def process_data(csv_files, pdf_file):
153
- final_df = combine_all_data(csv_files, pdf_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  return final_df
155
- # Convert the images to Base64
156
  with open("Frame 1.png", "rb") as logo_file:
157
  base64_logo = base64.b64encode(logo_file.read()).decode("utf-8")
158
 
159
- # Gradio app
160
- with gr.Blocks(title="AI Data Transformation (AI can make mistakes)",theme=basetheme,js=js_func) as demo:
 
 
161
  # Add logo at the top using Base64 HTML
162
  with gr.Row():
163
  gr.HTML(
@@ -167,21 +293,33 @@ with gr.Blocks(title="AI Data Transformation (AI can make mistakes)",theme=baset
167
  <img src="data:image/png;base64,{base64_logo}" alt="Logo" style="width: 150px; height: auto;">
168
  </div>
169
  <div style="justify-self: center;">
170
- <h2 style="margin: 0; text-align: center;">AI Data Transformation (AI can make mistakes)</h2>
171
  </div>
172
  <div></div>
173
  </div>
174
  """
175
  )
176
- # Gradio UI
177
  gr.Interface(
178
- fn=process_data,
179
- inputs=[
180
- gr.File(label="Upload CSV files", file_count="multiple"),
181
- gr.File(label="Upload PDF file")
182
-
183
- ],
184
- outputs=gr.Dataframe(label="Final Merged Data (AI can make mistakes)")
185
- )
186
-
187
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import json
import os

import gradio as gr
import pandas as pd
from openai import AzureOpenAI
from PyPDF2 import PdfReader
 
40
  }
41
  }
42
  """
43
+ # ===============================
44
  # Azure OpenAI setup
45
+ # ===============================
46
  os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
47
  os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
 
48
 
49
  client = AzureOpenAI(
50
  api_version="2023-05-15",
51
+ azure_deployment="gpt-4o", # Replace with your actual model deployment name
52
  )
53
+
54
+ # ===============================
55
+ # Helper Functions
56
+ # ===============================
57
+
58
+ def parse_field_definitions(field_text):
59
+ """
60
+ Converts user-entered lines in the format:
61
+ Field Name: Description
62
+ into a dictionary { "Field Name": "Description", ... }.
63
+ Lines without a colon are ignored or added with an empty description.
64
+ """
65
+ user_fields = {}
66
+ lines = field_text.split("\n")
67
+ for line in lines:
68
+ line = line.strip()
69
+ if not line:
70
+ continue
71
+ if ":" in line:
72
+ # Split on the first colon
73
+ field, description = line.split(":", 1)
74
+ field = field.strip()
75
+ description = description.strip()
76
+ user_fields[field] = description
77
+ else:
78
+ # If no colon is found, treat entire line as a field with an empty description
79
+ user_fields[line] = ""
80
+ return user_fields
81
+
82
  def read_file_metadata(file_path):
83
  df = pd.read_csv(file_path)
84
  column_names = list(df.columns)
85
+ sample_columns = column_names[:2]
86
+ sample_data = df[sample_columns].iloc[0].to_dict() if len(df) > 0 else {}
87
+ return column_names, sample_data
88
+
89
+ def read_excel_metadata(file_path):
90
+ df = pd.read_excel(file_path)
91
+ column_names = list(df.columns)
92
+ sample_columns = column_names[:2]
93
+ sample_data = df[sample_columns].iloc[0].to_dict() if len(df) > 0 else {}
94
+ return column_names, sample_data
95
 
96
+ def create_column_mapping_prompt(file_metadata, user_fields):
 
97
  prompt = (
98
+ "You are given CSV/Excel data from different sources. The files contain columns with similar content but with different names.\n"
99
+ "The user has provided the following desired fields and their descriptions:\n"
100
+ f"{json.dumps(user_fields, indent=2)}\n\n"
101
+ "For each file, here are the details (showing example data from the first two columns):\n\n"
102
+ )
103
+ for file_path, column_names, sample_data in file_metadata:
104
+ prompt += f"File: {file_path}\n"
105
+ prompt += f"Columns: {column_names}\n"
106
+ prompt += f"Example Data (first two columns): {sample_data}\n\n"
107
+ prompt += (
108
+ "Your task is to map the existing column names from each file to the desired fields provided by the user. "
109
+ "For each desired field, decide which column name in each file best represents it. "
110
+ "If a field cannot be found, map it to an empty string.\n\n"
111
+ "Return the mapping in JSON format with the following structure:\n"
112
+ "{\n"
113
+ ' "desired_field1": { "source_file1": "matched_column_name_or_empty", "source_file2": "matched_column_name_or_empty", ... },\n'
114
+ ' "desired_field2": { ... },\n'
115
+ " ...\n"
116
+ "}\n\n"
117
+ "Do not include any additional text in your response."
118
  )
 
 
 
 
 
119
  return prompt
120
 
121
+ def get_column_mapping(file_metadata, user_fields):
122
+ column_match_prompt = create_column_mapping_prompt(file_metadata, user_fields)
 
123
  completion = client.chat.completions.create(
124
  model="gpt-4o",
125
  messages=[{"role": "user", "content": column_match_prompt}],
126
+ temperature=0.1,
127
  response_format={"type": "json_object"},
128
  )
 
 
 
129
 
130
+ try:
131
+ response_text = completion.choices[0].message.content.strip()
132
+ result_mapping = json.loads(response_text)
133
+ except Exception as e:
134
+ raise ValueError(
135
+ f"Error parsing LLM response: {e}\n\nResponse:\n{completion.choices[0].message.content}"
136
+ )
137
+
138
+ return result_mapping
139
+
140
+ def merge_files_with_mapping(file_paths, user_fields):
141
  file_metadata = []
142
  for file_path in file_paths:
143
+ if file_path.lower().endswith('.csv'):
144
+ columns, sample_data = read_file_metadata(file_path)
145
+ elif file_path.lower().endswith(('.xlsx', '.xls')):
146
+ columns, sample_data = read_excel_metadata(file_path)
147
+ else:
148
+ continue
149
+ file_metadata.append((file_path, columns, sample_data))
150
 
151
+ # Ask the LLM for a column mapping
152
+ mapping = get_column_mapping(file_metadata, user_fields) if file_metadata else {}
153
 
154
  all_data = []
155
  for file_path in file_paths:
156
+ if file_path.lower().endswith('.csv'):
157
+ df = pd.read_csv(file_path)
158
+ elif file_path.lower().endswith(('.xlsx', '.xls')):
159
+ df = pd.read_excel(file_path)
160
+ else:
161
+ continue
162
+
163
+ new_columns = {}
164
+ for desired_field, file_mapping in mapping.items():
165
+ source_column = ""
166
+ if file_path in file_mapping:
167
+ source_column = file_mapping[file_path]
168
+ else:
169
+ base_name = os.path.basename(file_path)
170
+ source_column = file_mapping.get(base_name, "")
171
+
172
+ if source_column and source_column in df.columns:
173
+ new_columns[source_column] = desired_field
174
+
175
+ df.rename(columns=new_columns, inplace=True)
176
  all_data.append(df)
177
 
178
+ if not all_data:
179
+ raise ValueError("No valid CSV/Excel files to merge.")
180
+
181
  final_df = pd.concat(all_data, ignore_index=True)
182
+ # Only keep columns in the order the user specified
183
+ desired_columns = list(user_fields.keys())
184
+ final_df = final_df.reindex(columns=desired_columns)
185
+
186
  final_df.to_csv("merged_data.csv", index=False)
187
  return final_df
188
 
 
189
  def extract_text_from_pdf(pdf_path):
190
  reader = PdfReader(pdf_path)
191
  text = ""
192
  for page in reader.pages:
193
+ page_text = page.extract_text()
194
+ if page_text:
195
+ text += page_text
196
  return text
197
 
198
+ def map_pdf_to_csv_structure(pdf_path, csv_df, user_fields):
 
199
  pdf_text = extract_text_from_pdf(pdf_path)
200
  column_headers = list(csv_df.columns)
201
+ first_row_data = csv_df.iloc[0].to_dict() if len(csv_df) > 0 else {}
 
 
 
 
 
 
 
 
202
 
203
+ prompt = (
204
+ f"Based on the following document text extracted from a government project in Thailand:\n{pdf_text}\n\n"
205
+ f"Please map the information to JSON format using the following structure:\n"
206
+ f"Column Headers: {column_headers}\n"
207
+ f"Example Data (from the first row of the CSV): {first_row_data}\n\n"
208
+ "For each column header, extract the corresponding value from the document text. "
209
+ "If a column header is not applicable or data is missing, use an empty string.\n\n"
210
+ "Return only JSON with no additional explanations."
211
+ )
212
 
 
 
213
  completion = client.chat.completions.create(
214
  model="gpt-4o",
215
  messages=[{"role": "user", "content": prompt}],
216
  temperature=0,
217
  response_format={"type": "json_object"},
218
  )
219
+
220
+ try:
221
+ response_text = completion.choices[0].message.content.strip()
222
+ result_dict = json.loads(response_text)
223
+ except Exception as e:
224
+ raise ValueError(
225
+ f"Error parsing LLM response for PDF mapping: {e}\n\nResponse:\n{completion.choices[0].message.content}"
226
+ )
227
+
228
+ if len(result_dict) == 1:
229
+ # If there's only a single top-level key, use its value as data
230
+ only_value = next(iter(result_dict.values()))
231
+ new_data_df = pd.DataFrame(only_value)
232
+ else:
233
+ new_data_df = pd.DataFrame(result_dict)
234
+
235
+ desired_columns = list(user_fields.keys())
236
+ new_data_df = new_data_df.reindex(columns=desired_columns)
237
  return new_data_df
238
 
239
+ def combine_all_data(file_paths, pdf_file, user_fields):
240
+ merged_csv_df = merge_files_with_mapping(file_paths, user_fields)
241
+
242
+ if pdf_file and os.path.exists(pdf_file):
243
+ pdf_data_df = map_pdf_to_csv_structure(pdf_file, merged_csv_df, user_fields)
244
+ final_df = pd.concat([merged_csv_df, pdf_data_df], ignore_index=True)
245
+ else:
246
+ final_df = merged_csv_df
247
+
248
+ desired_columns = list(user_fields.keys())
249
+ final_df = final_df.reindex(columns=desired_columns)
250
+
251
  final_df.to_csv("merged_all_data.csv", index=False)
252
  return final_df
253
 
254
+ # ===============================
255
+ # Gradio Interface Function
256
+ # ===============================
257
+ def process_data(files, pdf_file, field_text):
258
+ """
259
+ Main function for Gradio to handle user inputs:
260
+ - files: list of CSV/Excel files
261
+ - pdf_file: a single PDF file
262
+ - field_text: multiline text with lines in the form: "Field Name: Description"
263
+ """
264
+
265
+ # Parse the user's desired fields from multiline text
266
+ user_fields = parse_field_definitions(field_text)
267
+ if not user_fields:
268
+ return "No valid fields found. Please use the format:\n\nField Name: Description"
269
+
270
+ file_paths = [f.name for f in files] if files else []
271
+ pdf_path = pdf_file.name if pdf_file is not None else None
272
+
273
+ try:
274
+ final_df = combine_all_data(file_paths, pdf_path, user_fields)
275
+ except Exception as e:
276
+ return f"Error during processing: {e}"
277
+
278
  return final_df
279
+
280
  with open("Frame 1.png", "rb") as logo_file:
281
  base64_logo = base64.b64encode(logo_file.read()).decode("utf-8")
282
 
283
+ # ===============================
284
+ # Gradio UI
285
+ # ===============================
286
+ with gr.Blocks(theme=basetheme,js=js_func,fill_height=True) as demo:
287
  # Add logo at the top using Base64 HTML
288
  with gr.Row():
289
  gr.HTML(
 
293
  <img src="data:image/png;base64,{base64_logo}" alt="Logo" style="width: 150px; height: auto;">
294
  </div>
295
  <div style="justify-self: center;">
296
+ <h2 style="margin: 0; text-align: center;">AI Data Transformation with User-Selected Fields</h2>
297
  </div>
298
  <div></div>
299
  </div>
300
  """
301
  )
 
302
  gr.Interface(
303
+ fn=process_data,
304
+ inputs=[
305
+ gr.File(label="Upload CSV/Excel files", file_count="multiple",file_types=[".csv", ".xlsx", ".xls"]),
306
+ gr.File(label="Upload PDF file (optional)", file_types=[".pdf"]),
307
+ gr.Textbox(
308
+ label="Desired Fields (one per line, use 'Field Name: Description' format)",
309
+ placeholder="Example:\nName: Full name\nDOB: Date of birth\nAddress: Full address\n",
310
+ lines=6,
311
+ ),
312
+ ],
313
+ outputs=gr.Dataframe(label="Final Merged Data"),
314
+ description=(
315
+ "Upload one or more CSV/Excel files, optionally a PDF file, and enter your desired fields below. "
316
+ "Type each field on a new line in the format:\n"
317
+ "'Field Name: Description'\n\n"
318
+ "The AI will automatically map and merge columns from your files to these fields, "
319
+ "then optionally extract matching data from the PDF."
320
+ ),
321
+ )
322
+
323
+ if __name__ == "__main__":
324
+ # Launch the Gradio app
325
+ demo.launch()