File size: 6,702 Bytes
67a9083
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import os
import pandas as pd
import json
import ast
import gradio as gr
from openai import AzureOpenAI
from PyPDF2 import PdfReader
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes
import base64

class BaseTheme(Base):
    """Gradio theme preset: orange primary, blue secondary, large base text."""

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.orange,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
    ):
        # Forward every styling knob to the Gradio base theme unchanged.
        theme_options = dict(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
        )
        super().__init__(**theme_options)

basetheme = BaseTheme()

# JS injected into the page: force Gradio's dark theme by rewriting the URL
# query parameter on load.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Azure OpenAI setup.
# NOTE(fix): the previous code did `os.environ[k] = os.getenv(k)`, which is a
# no-op when the variable is set and raises a confusing TypeError (environ
# values must be str, not None) when it is missing. Validate explicitly and
# fail with a clear message instead.
_REQUIRED_ENV = ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
_missing = [name for name in _REQUIRED_ENV if not os.getenv(name)]
if _missing:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(_missing)}"
    )

deployment = os.getenv("AZURE_OPENAI_AI_DEPLOYMENT")

# AzureOpenAI picks up the endpoint and API key from the environment
# variables validated above.
client = AzureOpenAI(
    api_version="2023-05-15",
    azure_deployment=deployment,
)
# Step 1: Read files and collect column names and first rows
def read_file_metadata(file_path):
    """Return (column names, first data row as a dict) for one CSV file."""
    frame = pd.read_csv(file_path)
    headers = frame.columns.tolist()
    sample_row = frame.iloc[0].to_dict()  # first row, keyed by column name
    return headers, sample_row

# Step 2: Create the prompt for column mapping
def create_column_mapping_prompt(metadata):
    """Build the LLM prompt asking for a unified column mapping.

    metadata is a list of (file_path, column_names, first_row) tuples as
    produced by read_file_metadata. Returns the prompt string.
    """
    # Build parts in a list and join once instead of quadratic `+=`; the
    # original also enumerated the list but never used the index.
    sections = [
        "You are given CSV data from different sources, where column names for similar data vary slightly. "
        "Your task is to suggest mappings to unify columns with similar content under a single name.\n\n"
    ]
    for file_path, column_names, first_row in metadata:
        sections.append(f"Data from {file_path}:\n")
        sections.append(f"Column names: {column_names}\n")
        sections.append(f"Example row: {first_row}\n\n")
    sections.append(
        "Suggest mappings to standardize the columns across these files. Please return in JSON format."
    )
    return "".join(sections)

# Step 3: Call the LLM to get the column mapping
def get_column_mapping(file_metadata):
    """Ask the LLM for a column-rename mapping and return it as a dict.

    file_metadata is the list of (file_path, column_names, first_row)
    tuples fed to create_column_mapping_prompt. Raises json.JSONDecodeError
    if the model response is not valid JSON.
    """
    column_match_prompt = create_column_mapping_prompt(file_metadata)
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": column_match_prompt}],
        temperature=0.1,
        response_format={"type": "json_object"},
    )
    content = completion.choices[0].message.content
    print(content)
    # FIX: parse with json.loads, not ast.literal_eval — the model returns
    # JSON (response_format json_object), and JSON's true/false/null are not
    # valid Python literals, so literal_eval would raise on them.
    return json.loads(content)

# Step 4: Apply the mapping and merge data
def merge_files_with_mapping(file_paths):
    """Merge several CSVs into one frame using an LLM-suggested column mapping.

    Writes the merged result to merged_data.csv and returns the DataFrame.
    """
    metadata = [(path, *read_file_metadata(path)) for path in file_paths]
    rename_map = get_column_mapping(metadata)

    # Re-read each file and rename its columns to the unified names.
    frames = []
    for path in file_paths:
        frames.append(pd.read_csv(path).rename(columns=rename_map))

    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv("merged_data.csv", index=False)
    return merged

# Step 5: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Concatenate the extractable text from every page of a PDF."""
    reader = PdfReader(pdf_path)
    # extract_text() may return None for image-only pages; treat as "".
    return "".join(page.extract_text() or "" for page in reader.pages)

# Step 6: Call the LLM for PDF data mapping
def map_pdf_to_csv_structure(pdf_path, csv_df):
    """Extract a PDF's text and map it onto the merged CSV's schema.

    Uses the CSV's headers and first row as a template, asks the LLM to fill
    the same keys from the PDF text, and returns the result as a one-row
    DataFrame. Raises json.JSONDecodeError if the response is not valid JSON.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    column_headers = list(csv_df.columns)
    first_row_data = csv_df.iloc[0].to_dict()

    prompt = f"""
    Based on the following document text extracted from a government project in Thailand:
    {pdf_text}

    Please map the information to JSON format using the following structure:
    Column Headers: {column_headers}
    Example Data (from the first row of the CSV): {first_row_data}

    Use the column headers as keys and fill in values based on the information from the document.
    If a key is not applicable or data is missing, leave the value as an empty string.

    Return only JSON with no additional explanations or modifications.
    """
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
        response_format={"type": "json_object"},
    )
    # FIX: parse with json.loads, not ast.literal_eval — the JSON-mode
    # response may contain true/false/null, which are not Python literals.
    result_dict = json.loads(completion.choices[0].message.content)
    new_data_df = pd.DataFrame([result_dict])
    return new_data_df

# Step 7: Combine all data and save as final merged CSV
def combine_all_data(csv_files, pdf_file):
    """Merge the CSVs, append the PDF-derived row, persist and return it."""
    csv_frame = merge_files_with_mapping(csv_files)
    pdf_frame = map_pdf_to_csv_structure(pdf_file, csv_frame)
    combined = pd.concat([csv_frame, pdf_frame], ignore_index=True)
    combined.to_csv("merged_all_data.csv", index=False)
    return combined

# Gradio interface
def process_data(csv_files, pdf_file):
    """Gradio callback: merge the uploaded CSVs and PDF into one DataFrame."""
    return combine_all_data(csv_files, pdf_file)
# Convert the images to Base64
# Embedding the logo as a data URI avoids serving a separate static file.
# NOTE(review): this runs at import time and raises FileNotFoundError if
# "Frame 1.png" is not in the working directory — confirm deployment layout.
with open("Frame 1.png", "rb") as logo_file:
    base64_logo = base64.b64encode(logo_file.read()).decode("utf-8")

# Gradio app
# theme=basetheme applies the custom orange/blue preset; js=js_func forces
# the dark theme on page load.
with gr.Blocks(title="AI Data Transformation (AI can make mistakes)",theme=basetheme,js=js_func) as demo:
    # Add logo at the top using Base64 HTML
    with gr.Row():
        gr.HTML(
            f"""
                <div style="display: grid; grid-template-columns: 1fr 2fr 1fr; align-items: center;">
                    <div style="justify-self: start;">
                        <img src="data:image/png;base64,{base64_logo}" alt="Logo" style="width: 150px; height: auto;">
                    </div>
                    <div style="justify-self: center;">
                        <h2 style="margin: 0; text-align: center;">AI Data Transformation (AI can make mistakes)</h2>
                    </div>
                    <div></div>
                </div>
            """
        )
    # Gradio UI
    # An Interface nested inside the Blocks renders the upload widgets and
    # wires process_data to them.
    gr.Interface(
        fn=process_data,
        inputs=[
            gr.File(label="Upload CSV files", file_count="multiple"),
            gr.File(label="Upload PDF file")
            
        ],
        outputs=gr.Dataframe(label="Final Merged Data (AI can make mistakes)")
    )
    
demo.launch()