Update app.py
app.py CHANGED
@@ -4,446 +4,387 @@ import streamlit as st
 import pdfplumber
 import pandas as pd
 import sqlalchemy
-…
 from typing import Any, Dict, List

-# Provider clients
-…

 HF_API_URL = "https://api-inference.huggingface.co/models/"
 DEFAULT_TEMPERATURE = 0.1
-GROQ_MODEL = "mixtral-8x7b-32768"
-API_HEADERS_HEIGHT = 70  # Height for the API headers text area


-class …:
     """
-    …
     """
     def __init__(self) -> None:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()

     def _setup_providers(self) -> None:
-        """Configure available LLM providers and their …"""
         self.providers: Dict[str, Dict[str, Any]] = {
             "Deepseek": {
-                "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
                 "models": ["deepseek-chat"],
             },
             "OpenAI": {
-                "client": lambda key: OpenAI(api_key=key),
-                "models": ["gpt-4-turbo"],
             },
             "Groq": {
-                "client": lambda key: groq.Groq(api_key=key),
                 "models": [GROQ_MODEL],
             },
             "HuggingFace": {
                 "client": lambda key: {"headers": {"Authorization": f"Bearer {key}"}},
                 "models": ["gpt2", "llama-2"],
             },
-            "Google": {
-                "client": lambda key: self._configure_google_genai(key),
-                "models": ["gemini-pro"],
-            },
         }

     def _setup_input_handlers(self) -> None:
-        """…"""
         self.input_handlers: Dict[str, Any] = {
-            "pdf": self.handle_pdf,
             "text": self.handle_text,
             "csv": self.handle_csv,
             "api": self.handle_api,
             "db": self.handle_db,
         }

     def _initialize_session_state(self) -> None:
-        """Initialize Streamlit session state with default …"""
-        … = {
-            "inputs": [],
-            "qa_data": [],
-            "processing": {"stage": "idle", "progress": 0, "errors": []},
             "config": {
-                "provider": "…",
-                "model": …,
                 "temperature": DEFAULT_TEMPERATURE,
             },
-            "api_key": "",
         }
-        for key, value in …:
             if key not in st.session_state:
                 st.session_state[key] = value

-    def _configure_google_genai(self, key):
-        """…"""
-        …
-            return genai.GenerativeModel…
-        except Exception as e:
-            st.error(f"Error configuring Google GenAI: {e}")
-            return None

-    # …
-    def handle_pdf(self, file) -> List[Dict[str, Any]]:
-        """
-        Extract text and images from a PDF file.

-        …
-            A list of dictionaries containing text, images, and metadata.
-        """
         try:
             with pdfplumber.open(file) as pdf:
-                …
-                for i, page in enumerate(pdf.pages):
                     page_text = page.extract_text() or ""
-                    …
-                        "text": page_text,
-                        "images": page_images,
-                        "meta": {"type": "pdf", "page": i + 1},
-                    })
-            return extracted_data
         except Exception as e:
-            self.…
-            return …

-    def handle_text(self, text: str) -> List[Dict[str, Any]]:
-        """Handle manual text input."""
-        return [{"text": text, "meta": {"type": "domain", "source": "manual"}}]

-    def handle_csv(self, file) -> List[Dict[str, Any]]:
-        """Process a CSV file and format the data for Q&A generation."""
         try:
             df = pd.read_csv(file)
-            …
-                    "text": "\n".join([f"{col}: {row[col]}" for col in df.columns]),
-                    "meta": {"type": "csv", "columns": list(df.columns)},
-                }
-                for _, row in df.iterrows()
-            ]
         except Exception as e:
-            self.…
-            return …

-    def handle_api(self, config: Dict[str, …
-        """Fetch data from an API endpoint and format it for processing."""
         try:
-            response = requests.get(config["url"], headers=config…
             response.raise_for_status()
-            return …
-            …
-            }
-            …
-            return []

-    def handle_db(self, config: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Connect to a database, execute a query, and format the results."""
         try:
             engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
                 result = conn.execute(sqlalchemy.text(config["query"]))
-                …
-                        "text": "\n".join([f"{col}: {val}" for col, val in row._asdict().items()]),
-                        "meta": {"type": "db", "table": config.get("table", "")},
-                    }
-                    for row in result
-                ]
         except Exception as e:
-            self.…
-            return …

-    def …
-        """…
-        …
-        return images


-    def generate(self, api_key: str) -> bool:
         """
-        Generate …

-        Iterates over all the input data, calls the appropriate inference method,
-        and aggregates the generated Q&A pairs into session state.
         """
         if not api_key:
-            …
             return False

-        …
-        # Initialize the client
-        if provider_name == "Google":
-            client = client_initializer(api_key)
-            if not client:
-                return False
-        else:
-            client = client_initializer(api_key)

-        for i, input_data in enumerate(st.session_state.inputs):
-            st.session_state.processing["progress"] = (i + 1) / len(st.session_state.inputs)
-            st.write("--- Input Data ---")
-            st.write(input_data["text"])

-            …
-            response = self._standard_inference(client, input_data)
-            …

             return True
         except Exception as e:
-            self.…
             return False

-    def _standard_inference(self, client: Any, …
-        """…
         try:
-            …
-                model=…,
-                messages=[{"role": "user", "content": …
-                temperature=…,
             )
         except Exception as e:
-            self.…
             return None

-    def _huggingface_inference(self, client: Dict[str, Any], …
-        """…
         try:
             response = requests.post(
-                HF_API_URL + …,
                 headers=client["headers"],
-                json={"inputs": …
             )
             response.raise_for_status()
             return response.json()
-        except requests.exceptions.RequestException as e:
-            self._log_error(f"Hugging Face Inference Error: {e}")
-            return None

-    def _google_inference(self, client: Any, input_data: Dict[str, Any]) -> Any:
-        """Perform inference using the Google Generative AI API."""
-        try:
-            model = client(st.session_state.config["model"])
-            response = model.generate_content(
-                self._build_prompt(input_data),
-                generation_config=genai.types.GenerationConfig(
-                    temperature=st.session_state.config["temperature"]
-                ),
-            )
-            return response
         except Exception as e:
-            self.…
             return None

-    def _build_prompt(self, input_data: Dict[str, Any]) -> str:
-        """
-        Build the prompt for the LLM based on the input data.

-        The prompt instructs the LLM to extract 3 Q&A pairs in JSON format.
-        """
-        base_prompt = (
-            "You are an expert in extracting question and answer pairs from documents. "
-            "Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries.\n"
-            "Each dictionary must have the keys 'question' and 'answer'.\n"
-            "The 'question' should be clear and concise, and the 'answer' should directly answer the question "
-            "using only information from the provided data. Do not hallucinate or invent information.\n"
-            "Answer using the exact information from the document, not external knowledge.\n"
-            "Example JSON Output:\n"
-            '[{"question": "What is the capital of France?", "answer": "The capital of France is Paris."}, '
-            '{"question": "What is the highest mountain in the world?", "answer": "The highest mountain in the world is Mount Everest."}, '
-            '{"question": "What is the chemical symbol for gold?", "answer": "The chemical symbol for gold is Au."}]\n'
-            "Now, generate 3 Q&A pairs from this data:\n"
-        )
-        data_type = input_data["meta"].get("type", "text")
-        if data_type == "csv":
-            return base_prompt + "Data:\n" + input_data["text"]
-        elif data_type == "api":
-            return base_prompt + "API response:\n" + input_data["text"]
-        return base_prompt + input_data["text"]

-    # --- RESPONSE PARSING ---
-    def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
         """
-        Parse the LLM response into a …

-        Expects the response to be a JSON formatted string.
         """
         try:
-            response_text = ""
             if provider == "HuggingFace":
-                …
-                self._log_error("Empty or malformed response from LLM.")
-                return []
-            response_text = response.choices[0].message.content

-            try:
-                json_output = json.loads(response_text)
-            except json.JSONDecodeError as e:
-                self._log_error(f"JSON Parse Error: {e}. Raw Response: {response_text}")
-                return []

-            if isinstance(json_output, list):
-                qa_pairs = json_output
-            elif isinstance(json_output, dict) and "questionList" in json_output:
-                qa_pairs = json_output["questionList"]
             else:
-                …
-                return []

-            for pair in qa_pairs:
-                if not isinstance(pair, dict) or "question" not in pair or "answer" not in pair:
-                    self._log_error(f"Invalid QA pair structure: {pair}")
-                    return []

-            return qa_pairs

         except Exception as e:
-            self.…
-            return …

-    def _log_error(self, message: str) -> None:
-        """Log an error message to the session state and display it."""
-        st.session_state.processing["errors"].append(message)
-        st.error(message)


-def …
-    """Create the input sidebar in the Streamlit UI."""
     with st.sidebar:
-        st.header("…
-        provider = st.selectbox("Provider", list(generator.providers.keys()))
-        st.session_state.config["provider"] = provider
         provider_cfg = generator.providers[provider]

-        …
-        st.session_state["api_key"] = api_key

-        model = st.selectbox("Model", provider_cfg["models"])
         st.session_state.config["model"] = model

         temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
         st.session_state.config["temperature"] = temperature

-        st.…
-        input_type = st.selectbox("Input Type", list(generator.input_handlers.keys()))

-        …
         headers = {}
-        …
         headers = json.loads(api_headers)
-        …
-        st.download_button("Export CSV", df.to_csv(index=False), "synthetic_data.csv")


 def main() -> None:
-    "…
-    generator = …
-    …

 if __name__ == "__main__":
     main()
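For reference, the removed pipeline above asked the model for exactly three Q&A pairs as a JSON list and validated the result in `_parse_response`. A minimal standalone sketch of that contract (the sample response is taken from the prompt's own example; the validation mirrors the removed parser):

import json

# A response in the exact shape the removed prompt requested.
response_text = '[{"question": "What is the capital of France?", "answer": "The capital of France is Paris."}]'

qa_pairs = json.loads(response_text)
# The removed parser rejected anything that was not a list of dicts
# carrying both a "question" and an "answer" key.
for pair in qa_pairs:
    assert isinstance(pair, dict) and "question" in pair and "answer" in pair
print(qa_pairs[0]["answer"])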
 import pdfplumber
 import pandas as pd
 import sqlalchemy
+import time
+import concurrent.futures
 from typing import Any, Dict, List

+# Provider clients (make sure you have these installed)
+try:
+    from openai import OpenAI
+except ImportError:
+    OpenAI = None

+try:
+    import groq
+except ImportError:
+    groq = None
+
+# Hugging Face inference URL
 HF_API_URL = "https://api-inference.huggingface.co/models/"
 DEFAULT_TEMPERATURE = 0.1
+GROQ_MODEL = "mixtral-8x7b-32768"


+class AdvancedSyntheticDataGenerator:
     """
+    Advanced Synthetic Data Generator
+
+    This class handles multiple input sources, advanced prompt engineering, and
+    supports multiple LLM providers to generate synthetic data.
     """
     def __init__(self) -> None:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
+        # A customizable prompt template (you can modify it via the UI)
+        self.custom_prompt_template = (
+            "You are an expert synthetic data generator. "
+            "Given the data below and following the instructions provided, generate high-quality, diverse synthetic data. "
+            "Ensure the output adheres to the specified format.\n\n"
+            "-------------------------\n"
+            "Data:\n{data}\n\n"
+            "Instructions:\n{instructions}\n\n"
+            "Output Format: {format}\n"
+            "-------------------------\n"
+        )

     def _setup_providers(self) -> None:
+        """Configure available LLM providers and their initialization routines."""
         self.providers: Dict[str, Dict[str, Any]] = {
             "Deepseek": {
+                "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key) if OpenAI else None,
                 "models": ["deepseek-chat"],
             },
             "OpenAI": {
+                "client": lambda key: OpenAI(api_key=key) if OpenAI else None,
+                "models": ["gpt-4-turbo", "gpt-3.5-turbo"],
             },
             "Groq": {
+                "client": lambda key: groq.Groq(api_key=key) if groq else None,
                 "models": [GROQ_MODEL],
             },
             "HuggingFace": {
                 "client": lambda key: {"headers": {"Authorization": f"Bearer {key}"}},
                 "models": ["gpt2", "llama-2"],
             },
         }

     def _setup_input_handlers(self) -> None:
+        """Register handlers for different input data types."""
         self.input_handlers: Dict[str, Any] = {
             "text": self.handle_text,
+            "pdf": self.handle_pdf,
             "csv": self.handle_csv,
             "api": self.handle_api,
             "db": self.handle_db,
         }

     def _initialize_session_state(self) -> None:
+        """Initialize Streamlit session state with default configuration."""
+        defaults = {
             "config": {
+                "provider": "OpenAI",
+                "model": "gpt-4-turbo",
                 "temperature": DEFAULT_TEMPERATURE,
+                "output_format": "plain_text",  # Options: plain_text, json, csv
             },
+            "api_key": "",
+            "inputs": [],  # A list to store input sources
+            "instructions": "",  # Custom instructions for data generation
+            "synthetic_data": "",  # The generated output
+            "error_logs": [],  # Any errors that occur during processing
         }
+        for key, value in defaults.items():
             if key not in st.session_state:
                 st.session_state[key] = value

+    def log_error(self, message: str) -> None:
+        """Log an error message both to the session state and in the UI."""
+        st.session_state.error_logs.append(message)
+        st.error(message)

+    # ===== INPUT HANDLERS =====
+    def handle_text(self, text: str) -> Dict[str, Any]:
+        return {"data": text, "source": "text"}

+    def handle_pdf(self, file) -> Dict[str, Any]:
         try:
             with pdfplumber.open(file) as pdf:
+                full_text = ""
+                for page in pdf.pages:
                     page_text = page.extract_text() or ""
+                    full_text += page_text + "\n"
+                return {"data": full_text, "source": "pdf"}
         except Exception as e:
+            self.log_error(f"PDF Processing Error: {e}")
+            return {"data": "", "source": "pdf"}

+    def handle_csv(self, file) -> Dict[str, Any]:
         try:
             df = pd.read_csv(file)
+            # For simplicity, we convert the dataframe to JSON.
+            return {"data": df.to_json(orient="records"), "source": "csv"}
         except Exception as e:
+            self.log_error(f"CSV Processing Error: {e}")
+            return {"data": "", "source": "csv"}

+    def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
         try:
+            response = requests.get(config["url"], headers=config.get("headers", {}), timeout=10)
             response.raise_for_status()
+            return {"data": json.dumps(response.json()), "source": "api"}
+        except Exception as e:
+            self.log_error(f"API Processing Error: {e}")
+            return {"data": "", "source": "api"}
+
+    def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
         try:
             engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
                 result = conn.execute(sqlalchemy.text(config["query"]))
+                # Row objects are tuple-like in SQLAlchemy 1.4+; go through
+                # ._mapping to get a dict per row. default=str keeps dates and
+                # decimals from breaking JSON serialization.
+                rows = [dict(row._mapping) for row in result]
+                return {"data": json.dumps(rows, default=str), "source": "db"}
         except Exception as e:
+            self.log_error(f"Database Processing Error: {e}")
+            return {"data": "", "source": "db"}
+
+    def aggregate_inputs(self) -> str:
+        """Combine all input sources into a single data string."""
+        aggregated_data = ""
+        for item in st.session_state.inputs:
+            aggregated_data += f"Source: {item.get('source', 'unknown')}\n"
+            aggregated_data += item.get("data", "") + "\n\n"
+        return aggregated_data.strip()
+
+    def build_prompt(self) -> str:
+        """
+        Build the complete prompt by combining the aggregated input data with
+        custom instructions and the desired output format.
+        """
+        aggregated_data = self.aggregate_inputs()
+        instructions = st.session_state.instructions or "Generate diverse, coherent synthetic data."
+        output_format = st.session_state.config.get("output_format", "plain_text")
+        return self.custom_prompt_template.format(
+            data=aggregated_data, instructions=instructions, format=output_format
+        )

+    def generate_synthetic_data(self) -> bool:
         """
+        Generate synthetic data by sending the built prompt to the selected LLM provider.
+        Returns True if generation succeeds.
         """
+        api_key = st.session_state.api_key
         if not api_key:
+            self.log_error("API key is missing!")
             return False

+        provider_name = st.session_state.config["provider"]
+        provider_cfg = self.providers.get(provider_name)
+        if not provider_cfg:
+            self.log_error(f"Provider {provider_name} is not configured.")
+            return False

+        client_initializer = provider_cfg["client"]
+        client = client_initializer(api_key)
+        if client is None:
+            # The provider lambdas return None when the client library is missing.
+            self.log_error(f"Client for {provider_name} could not be initialized (is the library installed?).")
+            return False
+        model = st.session_state.config["model"]
+        temperature = st.session_state.config["temperature"]
+        prompt = self.build_prompt()

+        st.info(f"Using provider {provider_name} with model {model} at temperature {temperature:.2f}")
+        # (Optionally) simulate asynchronous processing with a thread pool if needed.
+        try:
+            if provider_name == "HuggingFace":
+                response = self._huggingface_inference(client, prompt, model)
+            else:
+                response = self._standard_inference(client, prompt, model, temperature)

+            synthetic_data = self._parse_response(response, provider_name)
+            st.session_state.synthetic_data = synthetic_data
             return True
         except Exception as e:
+            self.log_error(f"Generation failed: {e}")
             return False

+    def _standard_inference(self, client: Any, prompt: str, model: str, temperature: float) -> Any:
+        """
+        Inference method for providers using an OpenAI-compatible API.
+        """
         try:
+            result = client.chat.completions.create(
+                model=model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=temperature,
             )
+            return result
         except Exception as e:
+            self.log_error(f"Standard Inference Error: {e}")
             return None

+    def _huggingface_inference(self, client: Dict[str, Any], prompt: str, model: str) -> Any:
+        """
+        Inference method for the Hugging Face Inference API.
+        """
         try:
             response = requests.post(
+                HF_API_URL + model,
                 headers=client["headers"],
+                json={"inputs": prompt},
+                timeout=30,
             )
             response.raise_for_status()
             return response.json()
         except Exception as e:
+            self.log_error(f"HuggingFace Inference Error: {e}")
             return None

+    def _parse_response(self, response: Any, provider: str) -> str:
         """
+        Parse the LLM response into a synthetic data string.
         """
         try:
             if provider == "HuggingFace":
+                # Guard against an empty list before indexing into the response.
+                if isinstance(response, list) and response and "generated_text" in response[0]:
+                    return response[0]["generated_text"]
+                else:
+                    self.log_error("Unexpected HuggingFace response format.")
+                    return ""
             else:
+                if response and hasattr(response, "choices") and response.choices:
+                    return response.choices[0].message.content
+                else:
+                    self.log_error("Unexpected response format.")
+                    return ""
         except Exception as e:
+            self.log_error(f"Response Parsing Error: {e}")
+            return ""

+# ===== ADVANCED UI COMPONENTS =====

+def advanced_config_ui(generator: AdvancedSyntheticDataGenerator):
+    """Advanced configuration options in the sidebar."""
     with st.sidebar:
+        st.header("Advanced Configuration")
+        provider = st.selectbox("Select Provider", list(generator.providers.keys()))
+        st.session_state.config["provider"] = provider
         provider_cfg = generator.providers[provider]

+        model = st.selectbox("Select Model", provider_cfg["models"])
         st.session_state.config["model"] = model

         temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
         st.session_state.config["temperature"] = temperature

+        output_format = st.radio("Output Format", ["plain_text", "json", "csv"])
+        st.session_state.config["output_format"] = output_format

+        api_key = st.text_input(f"{provider} API Key", type="password")
+        st.session_state.api_key = api_key
+
+        instructions = st.text_area("Custom Instructions",
+                                    "Generate diverse, coherent synthetic data based on the input sources.",
+                                    height=100)
+        st.session_state.instructions = instructions
+
+def advanced_input_ui(generator: AdvancedSyntheticDataGenerator):
+    """UI for adding input sources using tabs."""
+    st.header("Input Data Sources")
+    tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])
+
+    with tabs[0]:
+        text_input = st.text_area("Enter text input", height=150)
+        if st.button("Add Text Input", key="text_input"):
+            if text_input.strip():
+                st.session_state.inputs.append(generator.handle_text(text_input))
+                st.success("Text input added!")
+
+    with tabs[1]:
+        pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
+        # Gate on a button so the file is not re-added on every Streamlit rerun.
+        if pdf_file is not None and st.button("Add PDF Input", key="pdf_input"):
+            st.session_state.inputs.append(generator.handle_pdf(pdf_file))
+            st.success("PDF input added!")
+
+    with tabs[2]:
+        csv_file = st.file_uploader("Upload CSV", type=["csv"])
+        if csv_file is not None and st.button("Add CSV Input", key="csv_input"):
+            st.session_state.inputs.append(generator.handle_csv(csv_file))
+            st.success("CSV input added!")
+
+    with tabs[3]:
+        api_url = st.text_input("API Endpoint URL")
+        api_headers = st.text_area("API Headers (JSON format, optional)", height=100)
+        if st.button("Add API Input", key="api_input"):
             headers = {}
+            try:
+                if api_headers:
                     headers = json.loads(api_headers)
+            except Exception as e:
+                generator.log_error(f"Invalid JSON for API Headers: {e}")
+            st.session_state.inputs.append(generator.handle_api({"url": api_url, "headers": headers}))
+            st.success("API input added!")
+
+    with tabs[4]:
+        db_conn = st.text_input("Database Connection String")
+        db_query = st.text_area("Database Query", height=100)
+        if st.button("Add Database Input", key="db_input"):
+            st.session_state.inputs.append(generator.handle_db({"connection": db_conn, "query": db_query}))
+            st.success("Database input added!")
+
+
def advanced_output_ui(generator: AdvancedSyntheticDataGenerator):
|
334 |
+
"""Display the generated synthetic data with various output options."""
|
335 |
+
st.header("Synthetic Data Output")
|
336 |
+
if st.session_state.synthetic_data:
|
337 |
+
output_format = st.session_state.config.get("output_format", "plain_text")
|
338 |
+
if output_format == "json":
|
339 |
+
try:
|
340 |
+
json_output = json.loads(st.session_state.synthetic_data)
|
341 |
+
st.json(json_output)
|
342 |
+
except Exception:
|
343 |
+
st.text_area("Output", st.session_state.synthetic_data, height=300)
|
344 |
+
else:
|
345 |
+
st.text_area("Output", st.session_state.synthetic_data, height=300)
|
346 |
+
st.download_button("Download Output", st.session_state.synthetic_data,
|
347 |
+
file_name="synthetic_data.txt", mime="text/plain")
|
348 |
+
else:
|
349 |
+
st.info("No synthetic data generated yet.")
|
350 |
+
|
351 |
+
def advanced_logs_ui():
|
352 |
+
"""Display error logs and debug information in an expandable section."""
|
353 |
+
with st.expander("Error Logs & Debug Info", expanded=False):
|
354 |
+
if st.session_state.error_logs:
|
355 |
+
for log in st.session_state.error_logs:
|
356 |
+
st.write(log)
|
357 |
+
else:
|
358 |
+
st.write("No logs yet.")
|
359 |
+
|
360 |
+
|
361 |
+
# ===== MAIN APPLICATION =====
|
|
|
|
|

 def main() -> None:
+    st.set_page_config(page_title="Advanced Synthetic Data Generator", layout="wide")
+    generator = AdvancedSyntheticDataGenerator()
+    advanced_config_ui(generator)
+
+    # Create main tabs for Input, Output, and Logs
+    main_tabs = st.tabs(["Input", "Output", "Logs"])
+
+    with main_tabs[0]:
+        advanced_input_ui(generator)
+        if st.button("Clear Inputs"):
+            st.session_state.inputs = []
+            st.success("Inputs cleared!")
+
+    with main_tabs[1]:
+        if st.button("Generate Synthetic Data"):
+            with st.spinner("Generating synthetic data..."):
+                if generator.generate_synthetic_data():
+                    st.success("Data generated successfully!")
+                else:
+                    st.error("Data generation failed. Check logs for details.")
+        advanced_output_ui(generator)
+
+    with main_tabs[2]:
+        advanced_logs_ui()

 if __name__ == "__main__":
     main()
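The new pipeline is easiest to see end to end through the prompt it builds. A standalone sketch using the template text from `__init__` above; the sample data and instructions are illustrative, not part of the app:

# Demo of the prompt that generate_synthetic_data() sends to a provider.
template = (
    "You are an expert synthetic data generator. "
    "Given the data below and following the instructions provided, generate high-quality, diverse synthetic data. "
    "Ensure the output adheres to the specified format.\n\n"
    "-------------------------\n"
    "Data:\n{data}\n\n"
    "Instructions:\n{instructions}\n\n"
    "Output Format: {format}\n"
    "-------------------------\n"
)

prompt = template.format(
    data='Source: csv\n[{"name": "Alice", "age": 31}]',  # illustrative input record
    instructions="Generate 5 similar records.",
    format="json",
)
print(prompt)

Run the app itself with `streamlit run app.py` (after installing streamlit, pdfplumber, pandas, sqlalchemy, and requests, plus openai or groq for those providers); `_parse_response` then returns the raw `generated_text` field for HuggingFace, or `choices[0].message.content` for the OpenAI-compatible clients.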