AI-Data-Transformation-v2

Sleeping

App Files Files Community

Betimes commited on Jan 9

Commit

67a9083

verified ·

1 Parent(s): a09303f

Create app.py

Browse files

Files changed (1) hide show

app.py +187 -0

app.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import os
+import pandas as pd
+import json
+import ast
+import gradio as gr
+from openai import AzureOpenAI
+from PyPDF2 import PdfReader
+from gradio.themes.base import Base
+from gradio.themes.utils import colors, fonts, sizes
+import base64
+class BaseTheme(Base):
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.orange,
+        secondary_hue: colors.Color | str = colors.blue,
+        neutral_hue: colors.Color | str = colors.gray,
+        spacing_size: sizes.Size | str = sizes.spacing_md,
+        radius_size: sizes.Size | str = sizes.radius_md,
+        text_size: sizes.Size | str = sizes.text_lg,
+    ):
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            spacing_size=spacing_size,
+            radius_size=radius_size,
+            text_size=text_size,
+        )
+basetheme = BaseTheme()
+js_func = """
+function refresh() {
+    const url = new URL(window.location);
+    if (url.searchParams.get('__theme') !== 'dark') {
+        url.searchParams.set('__theme', 'dark');
+        window.location.href = url.href;
+    }
+}
+"""
+# Azure OpenAI setup
+os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
+os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
+deployment = os.getenv("AZURE_OPENAI_AI_DEPLOYMENT")
+client = AzureOpenAI(
+    api_version="2023-05-15",
+    azure_deployment=deployment,
+)
+# Step 1: Read files and collect column names and first rows
+def read_file_metadata(file_path):
+    df = pd.read_csv(file_path)
+    column_names = list(df.columns)
+    first_row = df.iloc[0].to_dict()  # Convert first row to a dictionary
+    return column_names, first_row
+# Step 2: Create the prompt for column mapping
+def create_column_mapping_prompt(metadata):
+    prompt = (
+        "You are given CSV data from different sources, where column names for similar data vary slightly. "
+        "Your task is to suggest mappings to unify columns with similar content under a single name.\n\n"
+    )
+    for i, (file_path, column_names, first_row) in enumerate(metadata):
+        prompt += f"Data from {file_path}:\n"
+        prompt += f"Column names: {column_names}\n"
+        prompt += f"Example row: {first_row}\n\n"
+    prompt += "Suggest mappings to standardize the columns across these files. Please return in JSON format."
+    return prompt
+# Step 3: Call the LLM to get the column mapping
+def get_column_mapping(file_metadata):
+    column_match_prompt = create_column_mapping_prompt(file_metadata)
+    completion = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[{"role": "user", "content": column_match_prompt}],
+        temperature=0.1,
+        response_format={"type": "json_object"},
+    )
+    print(completion.choices[0].message.content)
+    result_dict = ast.literal_eval(completion.choices[0].message.content)
+    return result_dict
+# Step 4: Apply the mapping and merge data
+def merge_files_with_mapping(file_paths):
+    file_metadata = []
+    for file_path in file_paths:
+        column_names, first_row = read_file_metadata(file_path)
+        file_metadata.append((file_path, column_names, first_row))
+    result_dict = get_column_mapping(file_metadata)
+    all_data = []
+    for file_path in file_paths:
+        df = pd.read_csv(file_path)
+        df.rename(columns=result_dict, inplace=True)
+        all_data.append(df)
+    final_df = pd.concat(all_data, ignore_index=True)
+    final_df.to_csv("merged_data.csv", index=False)
+    return final_df
+# Step 5: Extract text from PDF
+def extract_text_from_pdf(pdf_path):
+    reader = PdfReader(pdf_path)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() or ""
+    return text
+# Step 6: Call the LLM for PDF data mapping
+def map_pdf_to_csv_structure(pdf_path, csv_df):
+    pdf_text = extract_text_from_pdf(pdf_path)
+    column_headers = list(csv_df.columns)
+    first_row_data = csv_df.iloc[0].to_dict()
+    prompt = f"""
+    Based on the following document text extracted from a government project in Thailand:
+    {pdf_text}
+    Please map the information to JSON format using the following structure:
+    Column Headers: {column_headers}
+    Example Data (from the first row of the CSV): {first_row_data}
+    Use the column headers as keys and fill in values based on the information from the document.
+    If a key is not applicable or data is missing, leave the value as an empty string.
+    Return only JSON with no additional explanations or modifications.
+    """
+    completion = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.1,
+        response_format={"type": "json_object"},
+    )
+    result_dict = ast.literal_eval(completion.choices[0].message.content)
+    new_data_df = pd.DataFrame([result_dict])
+    return new_data_df
+# Step 7: Combine all data and save as final merged CSV
+def combine_all_data(csv_files, pdf_file):
+    merged_csv_df = merge_files_with_mapping(csv_files)
+    pdf_data_df = map_pdf_to_csv_structure(pdf_file, merged_csv_df)
+    final_df = pd.concat([merged_csv_df, pdf_data_df], ignore_index=True)
+    final_df.to_csv("merged_all_data.csv", index=False)
+    return final_df
+# Gradio interface
+def process_data(csv_files, pdf_file):
+    final_df = combine_all_data(csv_files, pdf_file)
+    return final_df
+# Convert the images to Base64
+with open("Frame 1.png", "rb") as logo_file:
+    base64_logo = base64.b64encode(logo_file.read()).decode("utf-8")
+# Gradio app
+with gr.Blocks(title="AI Data Transformation (AI can make mistakes)",theme=basetheme,js=js_func) as demo:
+    # Add logo at the top using Base64 HTML
+    with gr.Row():
+        gr.HTML(
+            f"""
+                <div style="display: grid; grid-template-columns: 1fr 2fr 1fr; align-items: center;">
+                    <div style="justify-self: start;">
+                        <img src="data:image/png;base64,{base64_logo}" alt="Logo" style="width: 150px; height: auto;">
+                    </div>
+                    <div style="justify-self: center;">
+                        <h2 style="margin: 0; text-align: center;">AI Data Transformation (AI can make mistakes)</h2>
+                    </div>
+                    <div></div>
+                </div>
+            """
+        )
+    # Gradio UI
+    gr.Interface(
+        fn=process_data,
+        inputs=[
+            gr.File(label="Upload CSV files", file_count="multiple"),
+            gr.File(label="Upload PDF file")
+        ],
+        outputs=gr.Dataframe(label="Final Merged Data (AI can make mistakes)")
+    )
+demo.launch()