import os import pandas as pd import json import ast import gradio as gr from openai import AzureOpenAI from PyPDF2 import PdfReader from gradio.themes.base import Base from gradio.themes.utils import colors, fonts, sizes import base64 class BaseTheme(Base): def __init__( self, *, primary_hue: colors.Color | str = colors.orange, secondary_hue: colors.Color | str = colors.blue, neutral_hue: colors.Color | str = colors.gray, spacing_size: sizes.Size | str = sizes.spacing_md, radius_size: sizes.Size | str = sizes.radius_md, text_size: sizes.Size | str = sizes.text_lg, ): super().__init__( primary_hue=primary_hue, secondary_hue=secondary_hue, neutral_hue=neutral_hue, spacing_size=spacing_size, radius_size=radius_size, text_size=text_size, ) basetheme = BaseTheme() js_func = """ function refresh() { const url = new URL(window.location); if (url.searchParams.get('__theme') !== 'dark') { url.searchParams.set('__theme', 'dark'); window.location.href = url.href; } } """ # Azure OpenAI setup os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT") os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY") deployment = os.getenv("AZURE_OPENAI_AI_DEPLOYMENT") client = AzureOpenAI( api_version="2023-05-15", azure_deployment=deployment, ) # Step 1: Read files and collect column names and first rows def read_file_metadata(file_path): df = pd.read_csv(file_path) column_names = list(df.columns) first_row = df.iloc[0].to_dict() # Convert first row to a dictionary return column_names, first_row # Step 2: Create the prompt for column mapping def create_column_mapping_prompt(metadata): prompt = ( "You are given CSV data from different sources, where column names for similar data vary slightly. " "Your task is to suggest mappings to unify columns with similar content under a single name.\n\n" ) for i, (file_path, column_names, first_row) in enumerate(metadata): prompt += f"Data from {file_path}:\n" prompt += f"Column names: {column_names}\n" prompt += f"Example row: {first_row}\n\n" prompt += "Suggest mappings to standardize the columns across these files. Please return in JSON format." return prompt # Step 3: Call the LLM to get the column mapping def get_column_mapping(file_metadata): column_match_prompt = create_column_mapping_prompt(file_metadata) completion = client.chat.completions.create( model="gpt-4o", messages=[{"role": "user", "content": column_match_prompt}], temperature=0.1, response_format={"type": "json_object"}, ) print(completion.choices[0].message.content) result_dict = ast.literal_eval(completion.choices[0].message.content) return result_dict # Step 4: Apply the mapping and merge data def merge_files_with_mapping(file_paths): file_metadata = [] for file_path in file_paths: column_names, first_row = read_file_metadata(file_path) file_metadata.append((file_path, column_names, first_row)) result_dict = get_column_mapping(file_metadata) all_data = [] for file_path in file_paths: df = pd.read_csv(file_path) df.rename(columns=result_dict, inplace=True) all_data.append(df) final_df = pd.concat(all_data, ignore_index=True) final_df.to_csv("merged_data.csv", index=False) return final_df # Step 5: Extract text from PDF def extract_text_from_pdf(pdf_path): reader = PdfReader(pdf_path) text = "" for page in reader.pages: text += page.extract_text() or "" return text # Step 6: Call the LLM for PDF data mapping def map_pdf_to_csv_structure(pdf_path, csv_df): pdf_text = extract_text_from_pdf(pdf_path) column_headers = list(csv_df.columns) first_row_data = csv_df.iloc[0].to_dict() prompt = f""" Based on the following document text extracted from a government project in Thailand: {pdf_text} Please map the information to JSON format using the following structure: Column Headers: {column_headers} Example Data (from the first row of the CSV): {first_row_data} Use the column headers as keys and fill in values based on the information from the document. If a key is not applicable or data is missing, leave the value as an empty string. Return only JSON with no additional explanations or modifications. """ completion = client.chat.completions.create( model="gpt-4o", messages=[{"role": "user", "content": prompt}], temperature=0.1, response_format={"type": "json_object"}, ) result_dict = ast.literal_eval(completion.choices[0].message.content) new_data_df = pd.DataFrame([result_dict]) return new_data_df # Step 7: Combine all data and save as final merged CSV def combine_all_data(csv_files, pdf_file): merged_csv_df = merge_files_with_mapping(csv_files) pdf_data_df = map_pdf_to_csv_structure(pdf_file, merged_csv_df) final_df = pd.concat([merged_csv_df, pdf_data_df], ignore_index=True) final_df.to_csv("merged_all_data.csv", index=False) return final_df # Gradio interface def process_data(csv_files, pdf_file): final_df = combine_all_data(csv_files, pdf_file) return final_df # Convert the images to Base64 with open("Frame 1.png", "rb") as logo_file: base64_logo = base64.b64encode(logo_file.read()).decode("utf-8") # Gradio app with gr.Blocks(title="AI Data Transformation (AI can make mistakes)",theme=basetheme,js=js_func) as demo: # Add logo at the top using Base64 HTML with gr.Row(): gr.HTML( f"""