Betimes committed on
Commit
67a9083
·
verified ·
1 Parent(s): a09303f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +187 -0
app.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import json
4
+ import ast
5
+ import gradio as gr
6
+ from openai import AzureOpenAI
7
+ from PyPDF2 import PdfReader
8
+ from gradio.themes.base import Base
9
+ from gradio.themes.utils import colors, fonts, sizes
10
+ import base64
11
+
12
class BaseTheme(Base):
    """Application theme: a ``Base`` subclass with its own default hues and sizes."""

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.orange,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
    ):
        """Forward every keyword-only option to the parent theme unchanged."""
        theme_options = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "spacing_size": spacing_size,
            "radius_size": radius_size,
            "text_size": text_size,
        }
        super().__init__(**theme_options)
31
+
32
basetheme = BaseTheme()

# JavaScript handed to ``gr.Blocks(js=...)``: if the ``__theme`` query
# parameter is not already ``dark``, set it and reload the page.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Azure OpenAI setup
# Neither the endpoint nor the API key is passed to AzureOpenAI below, so the
# client has to pick them up from the environment. Copying each variable onto
# itself did nothing when it was set and raised an opaque TypeError when it
# was not, so fail early with a message that names what is missing instead.
_missing_env = [
    name
    for name in ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
    if os.getenv(name) is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )
deployment = os.getenv("AZURE_OPENAI_AI_DEPLOYMENT")

# NOTE(review): the chat calls in this file request
# response_format={"type": "json_object"}; that option may need a newer Azure
# api_version than 2023-05-15 -- confirm against the Azure OpenAI docs.
client = AzureOpenAI(
    api_version="2023-05-15",
    azure_deployment=deployment,
)
54
+ # Step 1: Read files and collect column names and first rows
55
def read_file_metadata(file_path):
    """Return the column names and the first data row of a CSV file.

    Only the header and the first data row are parsed, since nothing else is
    used here.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        A ``(column_names, first_row)`` tuple: the list of header names and a
        dict mapping each header to its value in the first data row. The dict
        is empty when the file has a header but no data rows.
    """
    preview = pd.read_csv(file_path, nrows=1)
    column_names = list(preview.columns)
    # A header-only CSV has no row 0, so ``iloc[0]`` would raise IndexError.
    first_row = {} if preview.empty else preview.iloc[0].to_dict()
    return column_names, first_row
60
+
61
+ # Step 2: Create the prompt for column mapping
62
def create_column_mapping_prompt(metadata):
    """Build the LLM prompt that asks for a unified column-name mapping.

    Args:
        metadata: Iterable of ``(file_path, column_names, first_row)`` tuples,
            one per CSV file.

    Returns:
        The prompt text: a task description, one section per file listing its
        column names and an example row, and a closing request for the answer
        as JSON.
    """
    sections = [
        "You are given CSV data from different sources, where column names for similar data vary slightly. "
        "Your task is to suggest mappings to unify columns with similar content under a single name.\n\n"
    ]
    for file_path, column_names, first_row in metadata:
        sections.append(
            f"Data from {file_path}:\n"
            f"Column names: {column_names}\n"
            f"Example row: {first_row}\n\n"
        )
    # The reply is handed directly to DataFrame.rename(columns=...), which
    # needs a flat {old_name: new_name} mapping, so the shape is spelled out.
    sections.append(
        "Suggest mappings to standardize the columns across these files. Please return in JSON format. "
        "Return a single flat JSON object whose keys are the original column names "
        "and whose values are the standardized column names."
    )
    return "".join(sections)
73
+
74
+ # Step 3: Call the LLM to get the column mapping
75
def get_column_mapping(file_metadata):
    """Ask the LLM for a column mapping covering the given files.

    Args:
        file_metadata: ``(file_path, column_names, first_row)`` tuples, passed
            through to ``create_column_mapping_prompt``.

    Returns:
        The dict decoded from the model's JSON reply.

    Raises:
        ValueError: If the reply has no content, is not valid JSON, or does
            not decode to a JSON object.
    """
    column_match_prompt = create_column_mapping_prompt(file_metadata)
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": column_match_prompt}],
        temperature=0.1,
        response_format={"type": "json_object"},
    )
    content = completion.choices[0].message.content
    print(content)
    if content is None:
        raise ValueError("The model returned no content for the column mapping request.")
    # The reply is JSON, not a Python literal: ast.literal_eval rejects valid
    # JSON such as true/false/null, so decode it with the JSON parser.
    result_dict = json.loads(content)
    if not isinstance(result_dict, dict):
        raise ValueError("Expected a JSON object for the column mapping.")
    return result_dict
86
+
87
+ # Step 4: Apply the mapping and merge data
88
def merge_files_with_mapping(file_paths):
    """Rename the columns of several CSV files via an LLM mapping and merge them.

    Args:
        file_paths: Sequence of CSV paths. It is iterated twice, once to
            collect metadata and once to load the data.

    Returns:
        The concatenated DataFrame, which is also written to
        ``merged_data.csv`` in the current working directory.

    Raises:
        ValueError: If ``file_paths`` is empty or ``None``.
    """
    # Fail before the LLM call: with nothing to merge the request is wasted
    # and pd.concat would raise a less helpful ValueError afterwards.
    if not file_paths:
        raise ValueError("No CSV files were provided to merge.")

    file_metadata = [
        (file_path, *read_file_metadata(file_path)) for file_path in file_paths
    ]
    result_dict = get_column_mapping(file_metadata)

    # rename() ignores mapping keys that are absent from a given file, so the
    # same mapping can be applied to every frame.
    renamed_frames = [
        pd.read_csv(file_path).rename(columns=result_dict)
        for file_path in file_paths
    ]

    final_df = pd.concat(renamed_frames, ignore_index=True)
    final_df.to_csv("merged_data.csv", index=False)
    return final_df
105
+
106
+ # Step 5: Extract text from PDF
107
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated in page order."""
    pdf_pages = PdfReader(pdf_path).pages
    # A falsy extraction result (None or "") contributes an empty string.
    return "".join(pdf_page.extract_text() or "" for pdf_page in pdf_pages)
113
+
114
+ # Step 6: Call the LLM for PDF data mapping
115
def map_pdf_to_csv_structure(pdf_path, csv_df):
    """Ask the LLM to express a PDF's content as one row shaped like ``csv_df``.

    Args:
        pdf_path: PDF to read, passed to ``extract_text_from_pdf``.
        csv_df: DataFrame whose column names (and first row, when present)
            are shown to the model as the target structure.

    Returns:
        A one-row DataFrame built from the JSON object the model returned.

    Raises:
        ValueError: If the reply has no content, is not valid JSON, or does
            not decode to a JSON object.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    column_headers = list(csv_df.columns)
    # An empty frame has no row 0, so ``iloc[0]`` would raise IndexError.
    first_row_data = {} if csv_df.empty else csv_df.iloc[0].to_dict()

    prompt = f"""
    Based on the following document text extracted from a government project in Thailand:
    {pdf_text}

    Please map the information to JSON format using the following structure:
    Column Headers: {column_headers}
    Example Data (from the first row of the CSV): {first_row_data}

    Use the column headers as keys and fill in values based on the information from the document.
    If a key is not applicable or data is missing, leave the value as an empty string.

    Return only JSON with no additional explanations or modifications.
    """
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
        response_format={"type": "json_object"},
    )
    content = completion.choices[0].message.content
    if content is None:
        raise ValueError("The model returned no content for the PDF mapping request.")
    # The reply is JSON, not a Python literal: ast.literal_eval rejects valid
    # JSON such as true/false/null, so decode it with the JSON parser.
    result_dict = json.loads(content)
    if not isinstance(result_dict, dict):
        raise ValueError("Expected a JSON object for the PDF mapping.")
    new_data_df = pd.DataFrame([result_dict])
    return new_data_df
142
+
143
+ # Step 7: Combine all data and save as final merged CSV
144
def combine_all_data(csv_files, pdf_file):
    """Merge the CSV files, append the row derived from the PDF, and save the result.

    The combined DataFrame is written to ``merged_all_data.csv`` and returned.
    """
    csv_rows = merge_files_with_mapping(csv_files)
    pdf_rows = map_pdf_to_csv_structure(pdf_file, csv_rows)
    combined = pd.concat([csv_rows, pdf_rows], ignore_index=True)
    combined.to_csv("merged_all_data.csv", index=False)
    return combined
150
+
151
+ # Gradio interface
152
def process_data(csv_files, pdf_file):
    """Gradio callback: return the DataFrame produced by ``combine_all_data``."""
    return combine_all_data(csv_files, pdf_file)
155
# Convert the images to Base64
# Resolve the logo next to this script so startup does not depend on the
# current working directory.
logo_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Frame 1.png")
with open(logo_path, "rb") as logo_file:
    base64_logo = base64.b64encode(logo_file.read()).decode("utf-8")

# Gradio app
with gr.Blocks(title="AI Data Transformation (AI can make mistakes)", theme=basetheme, js=js_func) as demo:
    # Add logo at the top using Base64 HTML
    with gr.Row():
        gr.HTML(
            f"""
            <div style="display: grid; grid-template-columns: 1fr 2fr 1fr; align-items: center;">
                <div style="justify-self: start;">
                    <img src="data:image/png;base64,{base64_logo}" alt="Logo" style="width: 150px; height: auto;">
                </div>
                <div style="justify-self: center;">
                    <h2 style="margin: 0; text-align: center;">AI Data Transformation (AI can make mistakes)</h2>
                </div>
                <div></div>
            </div>
            """
        )
    # Gradio UI
    gr.Interface(
        fn=process_data,
        inputs=[
            gr.File(label="Upload CSV files", file_count="multiple"),
            gr.File(label="Upload PDF file"),
        ],
        outputs=gr.Dataframe(label="Final Merged Data (AI can make mistakes)"),
    )

# Start the server only when this file is run as a script, so importing the
# module (for example to reuse ``demo``) does not launch it.
if __name__ == "__main__":
    demo.launch()