AI-Data-Transformation-v2

Sleeping

App Files Files Community

AI-Data-Transformation-v2 / app.py

Betimes

Create app.py

67a9083 verified 7 months ago

raw

history blame

6.7 kB

	import os
	import pandas as pd
	import json
	import ast
	import gradio as gr
	from openai import AzureOpenAI
	from PyPDF2 import PdfReader
	from gradio.themes.base import Base
	from gradio.themes.utils import colors, fonts, sizes
	import base64

	class BaseTheme(Base):
	def __init__(
	self,
	*,
	primary_hue: colors.Color \| str = colors.orange,
	secondary_hue: colors.Color \| str = colors.blue,
	neutral_hue: colors.Color \| str = colors.gray,
	spacing_size: sizes.Size \| str = sizes.spacing_md,
	radius_size: sizes.Size \| str = sizes.radius_md,
	text_size: sizes.Size \| str = sizes.text_lg,
	):
	super().__init__(
	primary_hue=primary_hue,
	secondary_hue=secondary_hue,
	neutral_hue=neutral_hue,
	spacing_size=spacing_size,
	radius_size=radius_size,
	text_size=text_size,
	)

	basetheme = BaseTheme()

	js_func = """
	function refresh() {
	const url = new URL(window.location);

	if (url.searchParams.get('__theme') !== 'dark') {
	url.searchParams.set('__theme', 'dark');
	window.location.href = url.href;
	}
	}
	"""

	# Azure OpenAI setup
	os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
	os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
	deployment = os.getenv("AZURE_OPENAI_AI_DEPLOYMENT")

	client = AzureOpenAI(
	api_version="2023-05-15",
	azure_deployment=deployment,
	)
	# Step 1: Read files and collect column names and first rows
	def read_file_metadata(file_path):
	df = pd.read_csv(file_path)
	column_names = list(df.columns)
	first_row = df.iloc[0].to_dict() # Convert first row to a dictionary
	return column_names, first_row

	# Step 2: Create the prompt for column mapping
	def create_column_mapping_prompt(metadata):
	prompt = (
	"You are given CSV data from different sources, where column names for similar data vary slightly. "
	"Your task is to suggest mappings to unify columns with similar content under a single name.\n\n"
	)
	for i, (file_path, column_names, first_row) in enumerate(metadata):
	prompt += f"Data from {file_path}:\n"
	prompt += f"Column names: {column_names}\n"
	prompt += f"Example row: {first_row}\n\n"
	prompt += "Suggest mappings to standardize the columns across these files. Please return in JSON format."
	return prompt

	# Step 3: Call the LLM to get the column mapping
	def get_column_mapping(file_metadata):
	column_match_prompt = create_column_mapping_prompt(file_metadata)
	completion = client.chat.completions.create(
	model="gpt-4o",
	messages=[{"role": "user", "content": column_match_prompt}],
	temperature=0.1,
	response_format={"type": "json_object"},
	)
	print(completion.choices[0].message.content)
	result_dict = ast.literal_eval(completion.choices[0].message.content)
	return result_dict

	# Step 4: Apply the mapping and merge data
	def merge_files_with_mapping(file_paths):
	file_metadata = []
	for file_path in file_paths:
	column_names, first_row = read_file_metadata(file_path)
	file_metadata.append((file_path, column_names, first_row))

	result_dict = get_column_mapping(file_metadata)

	all_data = []
	for file_path in file_paths:
	df = pd.read_csv(file_path)
	df.rename(columns=result_dict, inplace=True)
	all_data.append(df)

	final_df = pd.concat(all_data, ignore_index=True)
	final_df.to_csv("merged_data.csv", index=False)
	return final_df

	# Step 5: Extract text from PDF
	def extract_text_from_pdf(pdf_path):
	reader = PdfReader(pdf_path)
	text = ""
	for page in reader.pages:
	text += page.extract_text() or ""
	return text

	# Step 6: Call the LLM for PDF data mapping
	def map_pdf_to_csv_structure(pdf_path, csv_df):
	pdf_text = extract_text_from_pdf(pdf_path)
	column_headers = list(csv_df.columns)
	first_row_data = csv_df.iloc[0].to_dict()

	prompt = f"""
	Based on the following document text extracted from a government project in Thailand:
	{pdf_text}

	Please map the information to JSON format using the following structure:
	Column Headers: {column_headers}
	Example Data (from the first row of the CSV): {first_row_data}

	Use the column headers as keys and fill in values based on the information from the document.
	If a key is not applicable or data is missing, leave the value as an empty string.

	Return only JSON with no additional explanations or modifications.
	"""
	completion = client.chat.completions.create(
	model="gpt-4o",
	messages=[{"role": "user", "content": prompt}],
	temperature=0.1,
	response_format={"type": "json_object"},
	)
	result_dict = ast.literal_eval(completion.choices[0].message.content)
	new_data_df = pd.DataFrame([result_dict])
	return new_data_df

	# Step 7: Combine all data and save as final merged CSV
	def combine_all_data(csv_files, pdf_file):
	merged_csv_df = merge_files_with_mapping(csv_files)
	pdf_data_df = map_pdf_to_csv_structure(pdf_file, merged_csv_df)
	final_df = pd.concat([merged_csv_df, pdf_data_df], ignore_index=True)
	final_df.to_csv("merged_all_data.csv", index=False)
	return final_df

	# Gradio interface
	def process_data(csv_files, pdf_file):
	final_df = combine_all_data(csv_files, pdf_file)
	return final_df
	# Convert the images to Base64
	with open("Frame 1.png", "rb") as logo_file:
	base64_logo = base64.b64encode(logo_file.read()).decode("utf-8")

	# Gradio app
	with gr.Blocks(title="AI Data Transformation (AI can make mistakes)",theme=basetheme,js=js_func) as demo:
	# Add logo at the top using Base64 HTML
	with gr.Row():
	gr.HTML(
	f"""
	<div style="display: grid; grid-template-columns: 1fr 2fr 1fr; align-items: center;">
	<div style="justify-self: start;">
	<img src="data:image/png;base64,{base64_logo}" alt="Logo" style="width: 150px; height: auto;">
	</div>
	<div style="justify-self: center;">
	<h2 style="margin: 0; text-align: center;">AI Data Transformation (AI can make mistakes)</h2>
	</div>
	<div></div>
	</div>
	"""
	)
	# Gradio UI
	gr.Interface(
	fn=process_data,
	inputs=[
	gr.File(label="Upload CSV files", file_count="multiple"),
	gr.File(label="Upload PDF file")

	],
	outputs=gr.Dataframe(label="Final Merged Data (AI can make mistakes)")
	)

	demo.launch()