Spaces:

mgbam
/

sythenticdata

Sleeping

App Files Community

mgbam committed on Feb 9

Commit

e9a68df

verified ·

1 Parent(s): 7b16658

Update app.py

Browse files

Files changed (1) hide show

app.py +199 -223

app.py CHANGED Viewed

@@ -1,156 +1,145 @@
 import streamlit as st
 import pdfplumber
-import pytesseract
 import pandas as pd
 import requests
 import json
 from PIL import Image
-from io import BytesIO
 from openai import OpenAI
-import google.generativeai as genai # Added Google GenAI
 import groq
 import sqlalchemy
 from typing import Dict, Any
-# Constants for Default Values and API URLs
 HF_API_URL = "https://api-inference.huggingface.co/models/"
-DEFAULT_TEMPERATURE = 0.1 # Lower Temperature
-MODEL = "mixtral-8x7b-32768" #constant string
 class SyntheticDataGenerator:
-    """
-    A class to generate synthetic Q&A data from various input sources using different LLM providers.
-    """
     def __init__(self):
-        """Initializes the SyntheticDataGenerator with supported providers, input handlers, and session state."""
         self.providers = {
             "Deepseek": {
                 "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
-                "models": ["deepseek-chat"]
             },
             "OpenAI": {
                 "client": lambda key: OpenAI(api_key=key),
-                "models": ["gpt-4-turbo"]
             },
             "Groq": {
                 "client": lambda key: groq.Groq(api_key=key),
-                "models": [MODEL]
             },
             "HuggingFace": {
                 "client": lambda key: {"headers": {"Authorization": f"Bearer {key}"}},
-                "models": ["gpt2", "llama-2"]
             },
-             "Google": {
-                "client": lambda key: self._configure_google_genai(key), # Using a custom configure function
-                "models": ["gemini-pro"]  # Use gemini-pro. Consider adding "gemini-pro" when released.
             },
         }
         self.input_handlers = {
             "pdf": self.handle_pdf,
             "text": self.handle_text,
             "csv": self.handle_csv,
             "api": self.handle_api,
-            "db": self.handle_db
         }
-        self.init_session()
     def _configure_google_genai(self, api_key: str):
         """Configures the Google Generative AI client."""
         try:
             genai.configure(api_key=api_key)
-            return genai.GenerativeModel # return the model class, not an instantiation
         except Exception as e:
             st.error(f"Error configuring Google GenAI: {e}")
-            return None # Important: Handle the case where configuration fails
-    def init_session(self):
-        """Initializes the Streamlit session state with default values."""
-        session_defaults = {
-            'inputs': [],
-            'qa_data': [],
-            'processing': {
-                'stage': 'idle',
-                'progress': 0,
-                'errors': []
-            },
-            'config': {
-                'provider': "Groq",
-                'model': MODEL,
-                'temperature': DEFAULT_TEMPERATURE
-            }
-        }
-        for key, val in session_defaults.items():
-            if key not in st.session_state:
-                st.session_state[key] = val
-    # Input Processors
     def handle_pdf(self, file):
-       """Extracts text and images from a PDF file."""
-       try:
             with pdfplumber.open(file) as pdf:
                 extracted_data = []
                 for i, page in enumerate(pdf.pages):
                     page_text = page.extract_text() or ""
                     page_images = self.process_images(page)
-                    extracted_data.append({
-                        "text": page_text,
-                        "images": page_images,
-                        "meta": {"type": "pdf", "page": i + 1}
-                    })
                 return extracted_data
-       except Exception as e:
-           self.log_error(f"PDF Error: {str(e)}")
-           return []
     def handle_text(self, text):
         """Handles manual text input."""
-        return [{
-            "text": text,
-            "meta": {"type": "domain", "source": "manual"}
-        }]
     def handle_csv(self, file):
         """Reads a CSV file and prepares data for Q&A generation."""
         try:
             df = pd.read_csv(file)
-            return [{
-                "text": "\n".join([f"{col}: {row[col]}" for col in df.columns]),
-                "meta": {"type": "csv", "columns": list(df.columns)}
-            } for _, row in df.iterrows()]
         except Exception as e:
-            self.log_error(f"CSV Error: {str(e)}")
             return []
     def handle_api(self, config):
         """Fetches data from an API endpoint."""
         try:
-            response = requests.get(config['url'], headers=config['headers'])
-            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
-            return [{
-                "text": json.dumps(response.json()),
-                "meta": {"type": "api", "endpoint": config['url']}
-            }]
         except requests.exceptions.RequestException as e:
-            self.log_error(f"API Error: {str(e)}")
             return []
     def handle_db(self, config):
         """Connects to a database and executes a query."""
         try:
-            engine = sqlalchemy.create_engine(config['connection'])
             with engine.connect() as conn:
-                result = conn.execute(sqlalchemy.text(config['query']))
-                return [{
-                    "text": "\n".join([f"{col}: {val}" for col, val in row._asdict().items()]),
-                    "meta": {"type": "db", "table": config.get('table', '')}
-                } for row in result]
         except Exception as e:
-            self.log_error(f"DB Error: {str(e)}")
             return []
     def process_images(self, page):
@@ -158,211 +147,206 @@ class SyntheticDataGenerator:
         images = []
         for img in page.images:
             try:
-                stream = img['stream']
-                width = int(stream.get('Width', 0))
-                height = int(stream.get('Height', 0))
-                image_data = stream.get_data()  # Get the image data
-                if width > 0 and height > 0 and image_data: #CHECK image_data
                     try:
                         image = Image.frombytes("RGB", (width, height), image_data)
-                        images.append({
-                            "data": image,
-                            "meta": {"dims": (width, height)}
-                        })
                     except Exception as e:
-                        self.log_error(f"Image Creation Error: {str(e)}") # Log specific image creation errors.
                 else:
-                    self.log_error(f"Image Error: Insufficient image data or invalid dimensions (width={width}, height={height})")
             except Exception as e:
-                self.log_error(f"Image Extraction Error: {str(e)}") # More general extraction error
         return images
-    # Core Generation Engine
     def generate(self, api_key: str) -> bool:
-        """
-        Generates Q&A pairs using the selected LLM provider.
-        Args:
-            api_key (str): The API key for the selected LLM provider.
-        Returns:
-            bool: True if generation was successful, False otherwise.
-        """
         try:
-            provider_cfg = self.providers[st.session_state.config['provider']]
-            client_initializer = provider_cfg["client"] #Get the client init function.
-            # Check that the key is not an empty string
             if not api_key:
                 st.error("API Key cannot be empty.")
                 return False
-            # Initialize the client
-            if st.session_state.config['provider'] == "Google":
-                client = client_initializer(api_key) # Client is the class
                 if not client:
                     return False  # Google config failed
             else:
                 client = client_initializer(api_key)
             for i, input_data in enumerate(st.session_state.inputs):
-                st.session_state.processing['progress'] = (i+1)/len(st.session_state.inputs)
-                if st.session_state.config['provider'] == "HuggingFace":
                     response = self._huggingface_inference(client, input_data)
-                elif st.session_state.config['provider'] == "Google":
-                   response = self._google_inference(client, input_data)
                 else:
                     response = self._standard_inference(client, input_data)
                 if response:
-                    # Check if the parsing function needs access to the provider
-                    st.session_state.qa_data.extend(self._parse_response(response, st.session_state.config['provider']))
             return True
         except Exception as e:
-            self.log_error(f"Generation Error: {str(e)}")
             return False
     def _standard_inference(self, client, input_data):
-         """Performs inference using standard OpenAI-compatible API."""
-         try:
-            #st.write(input_data['text']) # debugging data
             return client.chat.completions.create(
-                model=st.session_state.config['model'],
-                messages=[{
-                    "role": "user",
-                    "content": self._build_prompt(input_data)
-                }],
-                temperature=st.session_state.config['temperature'],
-                response_format={"type": "json_object"} #Request json
             )
-         except Exception as e:
-             self.log_error(f"OpenAI Inference Error: {e}")
-             return None
     def _huggingface_inference(self, client, input_data):
         """Performs inference using Hugging Face Inference API."""
         try:
             response = requests.post(
-                HF_API_URL + st.session_state.config['model'],
                 headers=client["headers"],
-                json={"inputs": self._build_prompt(input_data)}
             )
-            response.raise_for_status() #Check for HTTP errors
             return response.json()
         except requests.exceptions.RequestException as e:
-            self.log_error(f"Hugging Face Inference Error: {e}")
             return None
     def _google_inference(self, client, input_data):
         """Performs inference using Google Generative AI API."""
         try:
-            model = client(st.session_state.config['model'])  # Instantiate the model with the selected model name
             response = model.generate_content(
                 self._build_prompt(input_data),
-                generation_config = genai.types.GenerationConfig(temperature=st.session_state.config['temperature'])
             )
-            st.write("Google API Response:")  # Debugging: Print the raw response
-            st.write(response.text)
             return response
         except Exception as e:
-            self.log_error(f"Google GenAI Inference Error: {e}")
             return None
     def _build_prompt(self, input_data):
         """Builds the prompt for the LLM based on the input data type."""
-        base = "Generate a JSON list of 3 dictionaries like this: \n"
-        base+= '[{"question":"Example Question", "answer":"Example Answer"},'
-        base+= '{"question":"Example Question", "answer":"Example Answer"},'
-        base+= '{"question":"Example Question", "answer":"Example Answer"}]'
-        base+= 'Here is the data:\n'
-        if input_data['meta']['type'] == 'csv':
-            return base + "Data:\n" + input_data['text']
-        elif input_data['meta']['type'] == 'api':
-            return base + "API response:\n" + input_data['text']
-        return base + input_data['text']
-    def _parse_response(self, response, provider):
-        """Parses the response from the LLM into a list of Q&A pairs."""
         try:
             if provider == "HuggingFace":
-                return response[0]['generated_text']
             elif provider == "Google":
-                # Expecting a text response from Gemini
-                try:
-                    json_string = response.text.strip()  # Removes surrounding whitespace that can cause errors
-                    qa_pairs = json.loads(json_string).get("qa_pairs", []) # Extract the qa_pairs
-                    # Validate the structure of qa_pairs
-                    if not isinstance(qa_pairs, list):
-                        raise ValueError("Expected a list of QA pairs.")
-                    for pair in qa_pairs:
-                        if not isinstance(pair, dict) or "question" not in pair or "answer" not in pair:
-                            raise ValueError("Each item in the list must be a dictionary with 'question' and 'answer' keys.")
-                    return qa_pairs  # Return the extracted and validated list
-                except (json.JSONDecodeError, ValueError) as e:
-                     self.log_error(f"Google JSON Parse Error: {e}.  Raw Response: {response.text}")
-                     return [] # Return empty in case of parsing failure
-            else:
-                # Assuming JSON response from other providers (OpenAI, Deepseek, Groq)
                 if not response or not response.choices or not response.choices[0].message.content:
-                    self.log_error("Empty or malformed response from LLM.")
                     return []
-                try:
-                    json_output = json.loads(response.choices[0].message.content) # load the JSON data
-                    return json_output.get("qa_pairs", []) # Return the qa_pairs
-                except json.JSONDecodeError as e:
-                    self.log_error(f"JSON Parse Error: {e}.  Raw Response: {response.choices[0].message.content}")
                     return []
         except Exception as e:
-            self.log_error(f"Parse Error: {e}. Raw Response: {response}")
             return []
-    def log_error(self, message):
-        """Logs an error message to the Streamlit session state and displays it in the UI."""
-        st.session_state.processing['errors'].append(message)
         st.error(message)
-# Streamlit UI Components
-def input_sidebar(gen: SyntheticDataGenerator):
-    """
-    Creates the input sidebar in the Streamlit UI.
-    Args:
-        gen (SyntheticDataGenerator): The SyntheticDataGenerator instance.
-    Returns:
-        str: The API key entered by the user.
-    """
     with st.sidebar:
         st.header("⚙️ Configuration")
-        # AI Provider Settings
         provider = st.selectbox("Provider", list(gen.providers.keys()))
         provider_cfg = gen.providers[provider]
         api_key = st.text_input(f"{provider} API Key", type="password")
-        st.session_state['api_key'] = api_key  #Store API Key
         model = st.selectbox("Model", provider_cfg["models"])
-        temp = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE) #Lower
-        # Update session config
-        st.session_state.config.update({
-            "provider": provider,
-            "model": model,
-            "temperature": temp
-        })
         # Input Source Selection
         st.header("🔗 Data Sources")
@@ -376,11 +360,11 @@ def input_sidebar(gen: SyntheticDataGenerator):
         elif input_type == "csv":
             csv_file = st.file_uploader("Upload CSV", type=["csv"])
             if csv_file:
-                 st.session_state.inputs.extend(gen.input_handlers["csv"](csv_file))
         elif input_type == "api":
             api_url = st.text_input("API Endpoint")
-            api_headers = st.text_area("API Headers (JSON format, optional)", height=50)
             headers = {}
             try:
                 if api_headers:
@@ -395,47 +379,38 @@ def input_sidebar(gen: SyntheticDataGenerator):
             db_query = st.text_area("Database Query")
             db_table = st.text_input("Table Name (optional)")
             if st.button("Add DB Input"):
-                 st.session_state.inputs.extend(gen.input_handlers["db"]({"connection": db_connection, "query": db_query, "table": db_table}))
         return api_key
-def main_display(gen: SyntheticDataGenerator):
-    """
-    Creates the main display area in the Streamlit UI.
-    Args:
-        gen (SyntheticDataGenerator): The SyntheticDataGenerator instance.
-    """
     st.title("🚀 Enterprise Synthetic Data Factory")
-    # Input Processing
     col1, col2 = st.columns([3, 1])
     with col1:
         pdf_file = st.file_uploader("Upload Document", type=["pdf"])
         if pdf_file:
-           st.session_state.inputs.extend(gen.input_handlers["pdf"](pdf_file))
-    # Generation Controls
     with col2:
         if st.button("Start Generation"):
             with st.status("Processing..."):
-                if not st.session_state.get('api_key'):
-                     st.error("Please provide an API Key.")
                 else:
-                    gen.generate(st.session_state.get('api_key'))
-    # Results Display
     if st.session_state.qa_data:
         st.header("Generated Data")
         df = pd.DataFrame(st.session_state.qa_data)
         st.dataframe(df)
-        # Export Options
-        st.download_button(
-            "Export CSV",
-            df.to_csv(index=False),
-            "synthetic_data.csv"
-        )
 def main():
     """Main function to run the Streamlit application."""
@@ -443,5 +418,6 @@ def main():
     api_key = input_sidebar(gen)
     main_display(gen)
 if __name__ == "__main__":
     main()

 import streamlit as st
 import pdfplumber
 import pandas as pd
 import requests
 import json
 from PIL import Image
 from openai import OpenAI
+import google.generative_ai as genai
 import groq
 import sqlalchemy
 from typing import Dict, Any
+# --- CONSTANTS ---
 HF_API_URL = "https://api-inference.huggingface.co/models/"
+DEFAULT_TEMPERATURE = 0.1
+MODEL = "mixtral-8x7b-32768"  # Groq model
+API_HEADERS_HEIGHT = 70  # Minimum height for st.text_area
 class SyntheticDataGenerator:
+    """Generates synthetic Q&A data from various input sources using LLMs."""
     def __init__(self):
+        self._setup_providers()
+        self._setup_input_handlers()
+        self._initialize_session_state()
+    def _setup_providers(self):
+        """Defines the available LLM providers and their configurations."""
         self.providers = {
             "Deepseek": {
                 "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
+                "models": ["deepseek-chat"],
             },
             "OpenAI": {
                 "client": lambda key: OpenAI(api_key=key),
+                "models": ["gpt-4-turbo"],
             },
             "Groq": {
                 "client": lambda key: groq.Groq(api_key=key),
+                "models": [MODEL],
             },
             "HuggingFace": {
                 "client": lambda key: {"headers": {"Authorization": f"Bearer {key}"}},
+                "models": ["gpt2", "llama-2"],
             },
+            "Google": {
+                "client": lambda key: self._configure_google_genai(key),
+                "models": ["gemini-pro"],
             },
         }
+    def _setup_input_handlers(self):
+        """Defines handlers for different input data types."""
         self.input_handlers = {
             "pdf": self.handle_pdf,
             "text": self.handle_text,
             "csv": self.handle_csv,
             "api": self.handle_api,
+            "db": self.handle_db,
         }
+    def _initialize_session_state(self):
+        """Initializes Streamlit session state variables."""
+        session_defaults = {
+            "inputs": [],
+            "qa_data": [],
+            "processing": {"stage": "idle", "progress": 0, "errors": []},
+            "config": {"provider": "Groq", "model": MODEL, "temperature": DEFAULT_TEMPERATURE},
+            "api_key": "",  # Explicitly initialize api_key in session state
+        }
+        for key, value in session_defaults.items():
+            if key not in st.session_state:
+                st.session_state[key] = value
     def _configure_google_genai(self, api_key: str):
         """Configures the Google Generative AI client."""
         try:
             genai.configure(api_key=api_key)
+            return genai.GenerativeModel
         except Exception as e:
             st.error(f"Error configuring Google GenAI: {e}")
+            return None
+    # --- INPUT HANDLERS ---
     def handle_pdf(self, file):
+        """Extracts text and images from a PDF file."""
+        try:
             with pdfplumber.open(file) as pdf:
                 extracted_data = []
                 for i, page in enumerate(pdf.pages):
                     page_text = page.extract_text() or ""
                     page_images = self.process_images(page)
+                    extracted_data.append(
+                        {"text": page_text, "images": page_images, "meta": {"type": "pdf", "page": i + 1}}
+                    )
                 return extracted_data
+        except Exception as e:
+            self._log_error(f"PDF Error: {str(e)}")
+            return []
     def handle_text(self, text):
         """Handles manual text input."""
+        return [{"text": text, "meta": {"type": "domain", "source": "manual"}}]
     def handle_csv(self, file):
         """Reads a CSV file and prepares data for Q&A generation."""
         try:
             df = pd.read_csv(file)
+            return [
+                {"text": "\n".join([f"{col}: {row[col]}" for col in df.columns]), "meta": {"type": "csv", "columns": list(df.columns)}}
+                for _, row in df.iterrows()
+            ]
         except Exception as e:
+            self._log_error(f"CSV Error: {str(e)}")
             return []
     def handle_api(self, config):
         """Fetches data from an API endpoint."""
         try:
+            response = requests.get(config["url"], headers=config["headers"], timeout=10)  # Add timeout
+            response.raise_for_status()  # Raise HTTPError for bad responses
+            return [{"text": json.dumps(response.json()), "meta": {"type": "api", "endpoint": config["url"]}}]
         except requests.exceptions.RequestException as e:
+            self._log_error(f"API Error: {str(e)}")
             return []
     def handle_db(self, config):
         """Connects to a database and executes a query."""
         try:
+            engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
+                result = conn.execute(sqlalchemy.text(config["query"]))
+                return [
+                    {
+                        "text": "\n".join([f"{col}: {val}" for col, val in row._asdict().items()]),
+                        "meta": {"type": "db", "table": config.get("table", "")},
+                    }
+                    for row in result
+                ]
         except Exception as e:
+            self._log_error(f"DB Error: {str(e)}")
             return []
     def process_images(self, page):
         images = []
         for img in page.images:
             try:
+                stream = img["stream"]
+                width = int(stream.get("Width", 0))
+                height = int(stream.get("Height", 0))
+                image_data = stream.get_data()
+                if width > 0 and height > 0 and image_data:
                     try:
                         image = Image.frombytes("RGB", (width, height), image_data)
+                        images.append({"data": image, "meta": {"dims": (width, height)}})
                     except Exception as e:
+                        self._log_error(f"Image Creation Error: {str(e)}. Width: {width}, Height: {height}")
                 else:
+                    self._log_error(
+                        f"Image Error: Insufficient data or invalid dimensions (w={width}, h={height})"
+                    )
             except Exception as e:
+                self._log_error(f"Image Extraction Error: {str(e)}")
         return images
+    # --- LLM INFERENCE ---
     def generate(self, api_key: str) -> bool:
+        """Generates Q&A pairs using the selected LLM provider."""
         try:
             if not api_key:
                 st.error("API Key cannot be empty.")
                 return False
+            provider_cfg = self.providers[st.session_state.config["provider"]]
+            client_initializer = provider_cfg["client"]
+            if st.session_state.config["provider"] == "Google":
+                client = client_initializer(api_key)
                 if not client:
                     return False  # Google config failed
             else:
                 client = client_initializer(api_key)
             for i, input_data in enumerate(st.session_state.inputs):
+                st.session_state.processing["progress"] = (i + 1) / len(st.session_state.inputs)
+                # Debugging: Display input data
+                st.write("--- Input Data ---")
+                st.write(input_data["text"])
+                if st.session_state.config["provider"] == "HuggingFace":
                     response = self._huggingface_inference(client, input_data)
+                elif st.session_state.config["provider"] == "Google":
+                    response = self._google_inference(client, input_data)
                 else:
                     response = self._standard_inference(client, input_data)
                 if response:
+                    # Debugging: Display raw response
+                    st.write("--- Raw Response ---")
+                    st.write(response)
+                    st.session_state.qa_data.extend(self._parse_response(response, st.session_state.config["provider"]))
             return True
         except Exception as e:
+            self._log_error(f"Generation Error: {str(e)}")
             return False
     def _standard_inference(self, client, input_data):
+        """Performs inference using OpenAI-compatible API."""
+        try:
             return client.chat.completions.create(
+                model=st.session_state.config["model"],
+                messages=[{"role": "user", "content": self._build_prompt(input_data)}],
+                temperature=st.session_state.config["temperature"],
             )
+        except Exception as e:
+            self._log_error(f"OpenAI Inference Error: {e}")
+            return None
     def _huggingface_inference(self, client, input_data):
         """Performs inference using Hugging Face Inference API."""
         try:
             response = requests.post(
+                HF_API_URL + st.session_state.config["model"],
                 headers=client["headers"],
+                json={"inputs": self._build_prompt(input_data)},
             )
+            response.raise_for_status()
             return response.json()
         except requests.exceptions.RequestException as e:
+            self._log_error(f"Hugging Face Inference Error: {e}")
             return None
     def _google_inference(self, client, input_data):
         """Performs inference using Google Generative AI API."""
         try:
+            model = client(st.session_state.config["model"])
             response = model.generate_content(
                 self._build_prompt(input_data),
+                generation_config=genai.types.GenerationConfig(temperature=st.session_state.config["temperature"]),
             )
             return response
         except Exception as e:
+            self._log_error(f"Google GenAI Inference Error: {e}")
             return None
+    # --- PROMPT ENGINEERING ---
     def _build_prompt(self, input_data):
         """Builds the prompt for the LLM based on the input data type."""
+        base = (
+            "You are an expert in extracting question and answer pairs from documents. "
+            "Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries.\n"
+            "Each dictionary must have the keys 'question' and 'answer'.\n"
+            "The 'question' should be clear and concise, and the 'answer' should directly answer the question using only "
+            "information from the data. Do not hallucinate or invent information.\n"
+            "Answer from the exact same document, not outside from the document\n"
+            "Example JSON Output:\n"
+            '[{"question": "What is the capital of France?", "answer": "The capital of France is Paris."}, '
+            '{"question": "What is the highest mountain in the world?", "answer": "The highest mountain in the world is Mount Everest."}, '
+            '{"question": "What is the chemical symbol for gold?", "answer": "The chemical symbol for gold is Au."}]\n'
+            "Now, generate 3 Q&A pairs from this data:\n"
+        )
+        if input_data["meta"]["type"] == "csv":
+            return base + "Data:\n" + input_data["text"]
+        elif input_data["meta"]["type"] == "api":
+            return base + "API response:\n" + input_data["text"]
+        return base + input_data["text"]
+    # --- RESPONSE PARSING ---
+    def _parse_response(self, response: Any, provider: str) -> list[dict[str, str]]:
+        """Parses the LLM response into a list of Q&A pairs."""
         try:
+            response_text = ""
             if provider == "HuggingFace":
+                response_text = response[0]["generated_text"]
+                return response_text
             elif provider == "Google":
+                response_text = response.text.strip()
+            else:  # OpenAI, Deepseek, Groq
                 if not response or not response.choices or not response.choices[0].message.content:
+                    self._log_error("Empty or malformed response from LLM.")
                     return []
+                response_text = response.choices[0].message.content
+            try:
+                json_output = json.loads(response_text)
+                if isinstance(json_output, list):
+                    qa_pairs = json_output
+                elif isinstance(json_output, dict) and "questionList" in json_output:
+                    qa_pairs = json_output["questionList"]
+                else:
+                    self._log_error(f"Unexpected JSON structure: {response_text}")
+                    return []
+                if not isinstance(qa_pairs, list):
+                    self._log_error(f"Expected a list of QA pairs, but got: {type(qa_pairs)}")
                     return []
+                for pair in qa_pairs:
+                    if not isinstance(pair, dict) or "question" not in pair or "answer" not in pair:
+                        self._log_error(f"Invalid QA pair structure: {pair}")
+                        return []
+                return qa_pairs
+            except json.JSONDecodeError as e:
+                self._log_error(f"JSON Parse Error: {e}. Raw Response: {response_text}")
+                return []
         except Exception as e:
+            self._log_error(f"Parse Error: {e}. Raw Response: {response}")
             return []
+    def _log_error(self, message):
+        """Logs an error message to Streamlit session state and displays it."""
+        st.session_state.processing["errors"].append(message)
         st.error(message)
+# --- STREAMLIT UI COMPONENTS ---
+def input_sidebar(gen: SyntheticDataGenerator) -> str:
+    """Creates the input sidebar in the Streamlit UI."""
     with st.sidebar:
         st.header("⚙️ Configuration")
         provider = st.selectbox("Provider", list(gen.providers.keys()))
+        st.session_state.config["provider"] = provider  # Update session state immediately
         provider_cfg = gen.providers[provider]
         api_key = st.text_input(f"{provider} API Key", type="password")
+        st.session_state["api_key"] = api_key
         model = st.selectbox("Model", provider_cfg["models"])
+        st.session_state.config["model"] = model  # Update model selection
+        temp = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
+        st.session_state.config["temperature"] = temp  # Update temperature
         # Input Source Selection
         st.header("🔗 Data Sources")
         elif input_type == "csv":
             csv_file = st.file_uploader("Upload CSV", type=["csv"])
             if csv_file:
+                st.session_state.inputs.extend(gen.input_handlers["csv"](csv_file))
         elif input_type == "api":
             api_url = st.text_input("API Endpoint")
+            api_headers = st.text_area("API Headers (JSON format, optional)", height=API_HEADERS_HEIGHT)
             headers = {}
             try:
                 if api_headers:
             db_query = st.text_area("Database Query")
             db_table = st.text_input("Table Name (optional)")
             if st.button("Add DB Input"):
+                st.session_state.inputs.extend(
+                    gen.input_handlers["db"]({"connection": db_connection, "query": db_query, "table": db_table})
+                )
         return api_key
+def main_display(gen: SyntheticDataGenerator):
+    """Creates the main display area in the Streamlit UI."""
     st.title("🚀 Enterprise Synthetic Data Factory")
     col1, col2 = st.columns([3, 1])
     with col1:
         pdf_file = st.file_uploader("Upload Document", type=["pdf"])
         if pdf_file:
+            st.session_state.inputs.extend(gen.input_handlers["pdf"](pdf_file))
     with col2:
         if st.button("Start Generation"):
             with st.status("Processing..."):
+                if not st.session_state["api_key"]:
+                    st.error("Please provide an API Key.")
                 else:
+                    gen.generate(st.session_state["api_key"])
     if st.session_state.qa_data:
         st.header("Generated Data")
         df = pd.DataFrame(st.session_state.qa_data)
         st.dataframe(df)
+        st.download_button("Export CSV", df.to_csv(index=False), "synthetic_data.csv")
 def main():
     """Main function to run the Streamlit application."""
     api_key = input_sidebar(gen)
     main_display(gen)
 if __name__ == "__main__":
     main()