File size: 6,702 Bytes
67a9083
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import os
import pandas as pd
import json
import ast
import gradio as gr
from openai import AzureOpenAI
from PyPDF2 import PdfReader
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes
import base64

class BaseTheme(Base):
    """Gradio theme preset: orange primary, blue secondary, large base text."""

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.orange,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
    ):
        # Forward every styling knob to the Gradio base theme unchanged.
        theme_options = dict(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
        )
        super().__init__(**theme_options)

basetheme = BaseTheme()

# JS injected into the page: force Gradio's dark theme by rewriting the URL
# query parameter on load.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Azure OpenAI setup.
# NOTE(fix): the previous code did `os.environ[k] = os.getenv(k)`, which is a
# no-op when the variable is set and raises a confusing TypeError (environ
# values must be str, not None) when it is missing. Validate explicitly and
# fail with a clear message instead.
_REQUIRED_ENV = ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
_missing = [name for name in _REQUIRED_ENV if not os.getenv(name)]
if _missing:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(_missing)}"
    )

deployment = os.getenv("AZURE_OPENAI_AI_DEPLOYMENT")

# AzureOpenAI picks up the endpoint and API key from the environment
# variables validated above.
client = AzureOpenAI(
    api_version="2023-05-15",
    azure_deployment=deployment,
)
# Step 1: Read files and collect column names and first rows
def read_file_metadata(file_path):
    """Return (column names, first data row as a dict) for one CSV file."""
    frame = pd.read_csv(file_path)
    headers = frame.columns.tolist()
    sample_row = frame.iloc[0].to_dict()  # first row, keyed by column name
    return headers, sample_row

# Step 2: Create the prompt for column mapping
def create_column_mapping_prompt(metadata):
    """Build the LLM prompt asking for a unified column mapping.

    metadata is a list of (file_path, column_names, first_row) tuples as
    produced by read_file_metadata. Returns the prompt string.
    """
    # Build parts in a list and join once instead of quadratic `+=`; the
    # original also enumerated the list but never used the index.
    sections = [
        "You are given CSV data from different sources, where column names for similar data vary slightly. "
        "Your task is to suggest mappings to unify columns with similar content under a single name.\n\n"
    ]
    for file_path, column_names, first_row in metadata:
        sections.append(f"Data from {file_path}:\n")
        sections.append(f"Column names: {column_names}\n")
        sections.append(f"Example row: {first_row}\n\n")
    sections.append(
        "Suggest mappings to standardize the columns across these files. Please return in JSON format."
    )
    return "".join(sections)

# Step 3: Call the LLM to get the column mapping
def get_column_mapping(file_metadata):
    """Ask the LLM for a column-rename mapping and return it as a dict.

    file_metadata is the list of (file_path, column_names, first_row)
    tuples fed to create_column_mapping_prompt. Raises json.JSONDecodeError
    if the model response is not valid JSON.
    """
    column_match_prompt = create_column_mapping_prompt(file_metadata)
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": column_match_prompt}],
        temperature=0.1,
        response_format={"type": "json_object"},
    )
    content = completion.choices[0].message.content
    print(content)
    # FIX: parse with json.loads, not ast.literal_eval — the model returns
    # JSON (response_format json_object), and JSON's true/false/null are not
    # valid Python literals, so literal_eval would raise on them.
    return json.loads(content)

# Step 4: Apply the mapping and merge data
def merge_files_with_mapping(file_paths):
    """Merge several CSVs into one frame using an LLM-suggested column mapping.

    Writes the merged result to merged_data.csv and returns the DataFrame.
    """
    metadata = [(path, *read_file_metadata(path)) for path in file_paths]
    rename_map = get_column_mapping(metadata)

    # Re-read each file and rename its columns to the unified names.
    frames = []
    for path in file_paths:
        frames.append(pd.read_csv(path).rename(columns=rename_map))

    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv("merged_data.csv", index=False)
    return merged

# Step 5: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Concatenate the extractable text from every page of a PDF."""
    reader = PdfReader(pdf_path)
    # extract_text() may return None for image-only pages; treat as "".
    return "".join(page.extract_text() or "" for page in reader.pages)

# Step 6: Call the LLM for PDF data mapping
def map_pdf_to_csv_structure(pdf_path, csv_df):
    """Extract a PDF's text and map it onto the merged CSV's schema.

    Uses the CSV's headers and first row as a template, asks the LLM to fill
    the same keys from the PDF text, and returns the result as a one-row
    DataFrame. Raises json.JSONDecodeError if the response is not valid JSON.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    column_headers = list(csv_df.columns)
    first_row_data = csv_df.iloc[0].to_dict()

    prompt = f"""
    Based on the following document text extracted from a government project in Thailand:
    {pdf_text}

    Please map the information to JSON format using the following structure:
    Column Headers: {column_headers}
    Example Data (from the first row of the CSV): {first_row_data}

    Use the column headers as keys and fill in values based on the information from the document.
    If a key is not applicable or data is missing, leave the value as an empty string.

    Return only JSON with no additional explanations or modifications.
    """
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
        response_format={"type": "json_object"},
    )
    # FIX: parse with json.loads, not ast.literal_eval — the JSON-mode
    # response may contain true/false/null, which are not Python literals.
    result_dict = json.loads(completion.choices[0].message.content)
    new_data_df = pd.DataFrame([result_dict])
    return new_data_df

# Step 7: Combine all data and save as final merged CSV
def combine_all_data(csv_files, pdf_file):
    """Merge the CSVs, append the PDF-derived row, persist and return it."""
    csv_frame = merge_files_with_mapping(csv_files)
    pdf_frame = map_pdf_to_csv_structure(pdf_file, csv_frame)
    combined = pd.concat([csv_frame, pdf_frame], ignore_index=True)
    combined.to_csv("merged_all_data.csv", index=False)
    return combined

# Gradio interface
def process_data(csv_files, pdf_file):
    """Gradio callback: merge the uploaded CSVs and PDF into one DataFrame."""
    return combine_all_data(csv_files, pdf_file)
# Convert the images to Base64
# Embedding the logo as a data URI avoids serving a separate static file.
# NOTE(review): this runs at import time and raises FileNotFoundError if
# "Frame 1.png" is not in the working directory — confirm deployment layout.
with open("Frame 1.png", "rb") as logo_file:
    base64_logo = base64.b64encode(logo_file.read()).decode("utf-8")

# Gradio app
# theme=basetheme applies the custom orange/blue preset; js=js_func forces
# the dark theme on page load.
with gr.Blocks(title="AI Data Transformation (AI can make mistakes)",theme=basetheme,js=js_func) as demo:
    # Add logo at the top using Base64 HTML
    with gr.Row():
        gr.HTML(
            f"""
                <div style="display: grid; grid-template-columns: 1fr 2fr 1fr; align-items: center;">
                    <div style="justify-self: start;">
                        <img src="data:image/png;base64,{base64_logo}" alt="Logo" style="width: 150px; height: auto;">
                    </div>
                    <div style="justify-self: center;">
                        <h2 style="margin: 0; text-align: center;">AI Data Transformation (AI can make mistakes)</h2>
                    </div>
                    <div></div>
                </div>
            """
        )
    # Gradio UI
    # An Interface nested inside the Blocks renders the upload widgets and
    # wires process_data to them.
    gr.Interface(
        fn=process_data,
        inputs=[
            gr.File(label="Upload CSV files", file_count="multiple"),
            gr.File(label="Upload PDF file")
            
        ],
        outputs=gr.Dataframe(label="Final Merged Data (AI can make mistakes)")
    )
    
demo.launch()