Spaces:

mgbam
/

sythenticdata

Sleeping

App Files Files Community

sythenticdata / app.py

mgbam

Update app.py

ee72f5e verified 7 months ago

raw

history blame

16.2 kB

	import json
	import requests
	import streamlit as st
	import pdfplumber
	import pandas as pd
	import sqlalchemy
	from typing import Any, Dict, List

	# Provider clients – ensure these libraries are installed
	try:
	from openai import OpenAI
	except ImportError:
	OpenAI = None

	try:
	import groq
	except ImportError:
	groq = None

	# Base URL of the Hugging Face Inference API; the selected model name is
	# appended to it when a request is sent.
	HF_API_URL = "https://api-inference.huggingface.co/models/"
	# Default sampling temperature, used for the initial session config and as
	# the starting value of the sidebar slider.
	DEFAULT_TEMPERATURE = 0.1
	# The only model offered for the Groq provider.
	GROQ_MODEL = "mixtral-8x7b-32768"


	class QADataGenerator:
	"""
	A Q&A Synthetic Generator that extracts and generates question-answer pairs
	from various input sources using an LLM provider.
	"""
	def __init__(self) -> None:
	self._setup_providers()
	self._setup_input_handlers()
	self._initialize_session_state()
	# This prompt instructs the LLM to generate three Q&A pairs.
	self.custom_prompt_template = (
	"You are an expert in extracting question and answer pairs from documents. "
	"Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries. "
	"Each dictionary must have keys 'question' and 'answer'. "
	"The questions should be clear and concise, and the answers must be based solely on the provided data with no external information. "
	"Do not hallucinate. \n\n"
	"Example JSON Output:\n"
	"[{'question': 'What is the capital of France?', 'answer': 'Paris'}, "
	"{'question': 'What is the highest mountain in the world?', 'answer': 'Mount Everest'}, "
	"{'question': 'What is the chemical symbol for gold?', 'answer': 'Au'}]\n\n"
	"Now, generate 3 Q&A pairs from this data:\n{data}"
	)

	def _setup_providers(self) -> None:
	"""Configure available LLM providers and their client initialization routines."""
	self.providers: Dict[str, Dict[str, Any]] = {
	"Deepseek": {
	"client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key) if OpenAI else None,
	"models": ["deepseek-chat"],
	},
	"OpenAI": {
	"client": lambda key: OpenAI(api_key=key) if OpenAI else None,
	"models": ["gpt-4-turbo", "gpt-3.5-turbo"],
	},
	"Groq": {
	"client": lambda key: groq.Groq(api_key=key) if groq else None,
	"models": [GROQ_MODEL],
	},
	"HuggingFace": {
	"client": lambda key: {"headers": {"Authorization": f"Bearer {key}"}},
	"models": ["gpt2", "llama-2"],
	},
	}

	def _setup_input_handlers(self) -> None:
	"""Register handlers for different input data types."""
	self.input_handlers: Dict[str, Any] = {
	"text": self.handle_text,
	"pdf": self.handle_pdf,
	"csv": self.handle_csv,
	"api": self.handle_api,
	"db": self.handle_db,
	}

	def _initialize_session_state(self) -> None:
	"""Initialize Streamlit session state with default configuration."""
	defaults = {
	"config": {
	"provider": "OpenAI",
	"model": "gpt-4-turbo",
	"temperature": DEFAULT_TEMPERATURE,
	},
	"api_key": "",
	"inputs": [], # List to store input sources
	"qa_pairs": "", # Generated Q&A pairs output
	"error_logs": [], # To store any error messages
	}
	for key, value in defaults.items():
	if key not in st.session_state:
	st.session_state[key] = value

	def log_error(self, message: str) -> None:
	"""Log an error message to session state and display it."""
	st.session_state.error_logs.append(message)
	st.error(message)

	# ----- Input Handlers -----
	def handle_text(self, text: str) -> Dict[str, Any]:
	return {"data": text, "source": "text"}

	def handle_pdf(self, file) -> Dict[str, Any]:
	try:
	with pdfplumber.open(file) as pdf:
	full_text = ""
	for page in pdf.pages:
	page_text = page.extract_text() or ""
	full_text += page_text + "\n"
	return {"data": full_text, "source": "pdf"}
	except Exception as e:
	self.log_error(f"PDF Processing Error: {e}")
	return {"data": "", "source": "pdf"}

	def handle_csv(self, file) -> Dict[str, Any]:
	try:
	df = pd.read_csv(file)
	# Convert the DataFrame to a JSON string
	return {"data": df.to_json(orient="records"), "source": "csv"}
	except Exception as e:
	self.log_error(f"CSV Processing Error: {e}")
	return {"data": "", "source": "csv"}

	def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
	try:
	response = requests.get(config["url"], headers=config.get("headers", {}), timeout=10)
	response.raise_for_status()
	return {"data": json.dumps(response.json()), "source": "api"}
	except Exception as e:
	self.log_error(f"API Processing Error: {e}")
	return {"data": "", "source": "api"}

	def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
	try:
	engine = sqlalchemy.create_engine(config["connection"])
	with engine.connect() as conn:
	result = conn.execute(sqlalchemy.text(config["query"]))
	rows = [dict(row) for row in result]
	return {"data": json.dumps(rows), "source": "db"}
	except Exception as e:
	self.log_error(f"Database Processing Error: {e}")
	return {"data": "", "source": "db"}

	def aggregate_inputs(self) -> str:
	"""Combine all input sources into a single aggregated string."""
	aggregated_data = ""
	for item in st.session_state.inputs:
	aggregated_data += f"Source: {item.get('source', 'unknown')}\n"
	aggregated_data += item.get("data", "") + "\n\n"
	return aggregated_data.strip()

	def build_prompt(self) -> str:
	"""
	Build the complete prompt using the custom template and aggregated inputs.
	"""
	data = self.aggregate_inputs()
	prompt = self.custom_prompt_template.format(data=data)
	st.write("### Built Prompt")
	st.write(prompt)
	return prompt

	def generate_qa_pairs(self) -> bool:
	"""
	Generate Q&A pairs by sending the built prompt to the selected LLM provider.
	"""
	api_key = st.session_state.api_key
	if not api_key:
	self.log_error("API key is missing!")
	return False

	provider_name = st.session_state.config["provider"]
	provider_cfg = self.providers.get(provider_name)
	if not provider_cfg:
	self.log_error(f"Provider {provider_name} is not configured.")
	return False

	client_initializer = provider_cfg["client"]
	client = client_initializer(api_key)
	model = st.session_state.config["model"]
	temperature = st.session_state.config["temperature"]
	prompt = self.build_prompt()

	st.info(f"Using {provider_name} with model {model} at temperature {temperature:.2f}")
	try:
	if provider_name == "HuggingFace":
	response = self._huggingface_inference(client, prompt, model)
	else:
	response = self._standard_inference(client, prompt, model, temperature)

	st.write("### Raw API Response")
	st.write(response)

	qa_pairs = self._parse_response(response, provider_name)
	st.write("### Parsed Q&A Pairs")
	st.write(qa_pairs)

	st.session_state.qa_pairs = qa_pairs
	return True
	except Exception as e:
	self.log_error(f"Generation failed: {e}")
	return False

	def _standard_inference(self, client: Any, prompt: str, model: str, temperature: float) -> Any:
	"""Inference method for providers using an OpenAI-compatible API."""
	try:
	st.write("Sending prompt via standard inference...")
	result = client.chat.completions.create(
	model=model,
	messages=[{"role": "user", "content": prompt}],
	temperature=temperature,
	)
	st.write("Standard inference result received.")
	return result
	except Exception as e:
	self.log_error(f"Standard Inference Error: {e}")
	return None

	def _huggingface_inference(self, client: Dict[str, Any], prompt: str, model: str) -> Any:
	"""Inference method for the Hugging Face Inference API."""
	try:
	st.write("Sending prompt to HuggingFace API...")
	response = requests.post(
	HF_API_URL + model,
	headers=client["headers"],
	json={"inputs": prompt},
	timeout=30,
	)
	response.raise_for_status()
	st.write("HuggingFace API response received.")
	return response.json()
	except Exception as e:
	self.log_error(f"HuggingFace Inference Error: {e}")
	return None

	def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
	"""
	Parse the LLM response and return a list of Q&A pairs.
	Expects the response to be JSON formatted.
	"""
	st.write("Parsing response for provider:", provider)
	try:
	if provider == "HuggingFace":
	# For HuggingFace, assume the generated text is under "generated_text"
	if isinstance(response, list) and response and "generated_text" in response[0]:
	raw_text = response[0]["generated_text"]
	else:
	self.log_error("Unexpected HuggingFace response format.")
	return []
	else:
	# For OpenAI (and similar providers) assume the response is similar to:
	# response.choices[0].message.content
	if response and hasattr(response, "choices") and response.choices:
	raw_text = response.choices[0].message.content
	else:
	self.log_error("Unexpected response format from provider.")
	return []

	# Try parsing the raw text as JSON
	try:
	qa_list = json.loads(raw_text)
	if isinstance(qa_list, list):
	return qa_list
	else:
	self.log_error("Parsed output is not a list.")
	return []
	except json.JSONDecodeError as e:
	self.log_error(f"JSON Parsing Error: {e}. Raw output: {raw_text}")
	return []
	except Exception as e:
	self.log_error(f"Response Parsing Error: {e}")
	return []


	# ============ UI Components ============

	def config_ui(generator: QADataGenerator):
	    """Render the sidebar widgets and copy their values into the session state.

	    The provider, model and temperature go into ``st.session_state.config``;
	    the API key goes into ``st.session_state.api_key``.
	    """
	    settings = st.session_state.config
	    with st.sidebar:
	        st.header("Configuration")

	        provider_names = list(generator.providers.keys())
	        chosen_provider = st.selectbox("Select Provider", provider_names)
	        settings["provider"] = chosen_provider

	        # The model list depends on the provider picked above.
	        available_models = generator.providers[chosen_provider]["models"]
	        settings["model"] = st.selectbox("Select Model", available_models)

	        settings["temperature"] = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)

	        st.session_state.api_key = st.text_input(f"{chosen_provider} API Key", type="password")

	def _store_input(entry: Dict[str, Any], label: str) -> bool:
	    """Append *entry* to the session inputs if it carries data, and report the outcome."""
	    if not str(entry.get("data", "")).strip():
	        # Handlers return empty data on failure; do not pollute the input list.
	        st.warning(f"{label} input produced no data and was not added.")
	        return False
	    st.session_state.inputs.append(entry)
	    st.success(f"{label} input added!")
	    return True


	def input_ui(generator: QADataGenerator):
	    """
	    Display input data source options using tabs.

	    Every source is added only when its button is clicked. Streamlit reruns
	    the script on each interaction, so adding an uploaded file without a
	    button would append the same file again on every rerun.
	    """
	    st.subheader("Input Data Sources")
	    tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])

	    with tabs[0]:
	        text_input = st.text_area("Enter text input", height=150)
	        if st.button("Add Text Input", key="text_input"):
	            if text_input.strip():
	                _store_input(generator.handle_text(text_input), "Text")
	            else:
	                st.warning("Empty text input.")

	    with tabs[1]:
	        pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
	        if pdf_file is not None and st.button("Add PDF Input", key="pdf_input"):
	            _store_input(generator.handle_pdf(pdf_file), "PDF")

	    with tabs[2]:
	        csv_file = st.file_uploader("Upload CSV", type=["csv"])
	        if csv_file is not None and st.button("Add CSV Input", key="csv_input"):
	            _store_input(generator.handle_csv(csv_file), "CSV")

	    with tabs[3]:
	        api_url = st.text_input("API Endpoint URL")
	        api_headers = st.text_area("API Headers (JSON format, optional)", height=100)
	        if st.button("Add API Input", key="api_input"):
	            headers: Any = {}
	            headers_ok = True
	            if api_headers.strip():
	                try:
	                    headers = json.loads(api_headers)
	                except ValueError as e:
	                    generator.log_error(f"Invalid JSON for API Headers: {e}")
	                    headers_ok = False
	                else:
	                    if not isinstance(headers, dict):
	                        generator.log_error("Invalid JSON for API Headers: expected a JSON object.")
	                        headers_ok = False
	            if not api_url.strip():
	                st.warning("Empty API endpoint URL.")
	            elif headers_ok:
	                # Only send the request when the headers were usable.
	                _store_input(generator.handle_api({"url": api_url, "headers": headers}), "API")

	    with tabs[4]:
	        db_conn = st.text_input("Database Connection String")
	        db_query = st.text_area("Database Query", height=100)
	        if st.button("Add Database Input", key="db_input"):
	            if db_conn.strip() and db_query.strip():
	                _store_input(generator.handle_db({"connection": db_conn, "query": db_query}), "Database")
	            else:
	                st.warning("Database connection string and query are both required.")

	def output_ui(generator: QADataGenerator):
	    """Show the Q&A pairs held in the session state and offer them as a JSON download."""
	    st.subheader("Q&A Pairs Output")

	    pairs = st.session_state.qa_pairs
	    if not pairs:
	        st.info("No Q&A pairs generated yet.")
	        return

	    st.write("### Generated Q&A Pairs")
	    st.write(pairs)

	    serialized = json.dumps(pairs, indent=2)
	    st.download_button(
	        "Download Output",
	        serialized,
	        file_name="qa_pairs.json",
	        mime="application/json",
	    )

	def logs_ui():
	    """List the error messages collected in the session state inside a collapsed expander."""
	    with st.expander("Error Logs & Debug Info", expanded=False):
	        entries = st.session_state.error_logs
	        if not entries:
	            st.write("No logs yet.")
	            return
	        for entry in entries:
	            st.write(entry)


	def main():
	    """Lay out the page: intro, sidebar config, inputs, generation, output and logs."""
	    app_title = "Advanced Q&A Synthetic Generator"
	    intro = """
	Welcome to the Advanced Q&A Synthetic Generator. This tool extracts and generates question-answer pairs
	from various input sources. Configure your provider in the sidebar, add input data, and click the button below to generate Q&A pairs.
	"""

	    # Page config is issued before any other Streamlit call.
	    st.set_page_config(page_title=app_title, layout="wide")
	    st.title(app_title)
	    st.markdown(intro)

	    # One generator per script run; it seeds the session state on construction.
	    generator = QADataGenerator()
	    config_ui(generator)

	    st.header("1. Input Data")
	    input_ui(generator)
	    clear_clicked = st.button("Clear All Inputs")
	    if clear_clicked:
	        st.session_state.inputs = []
	        st.success("All inputs have been cleared!")

	    st.header("2. Generate Q&A Pairs")
	    generate_clicked = st.button("Generate Q&A Pairs", key="generate_qa")
	    if generate_clicked:
	        with st.spinner("Generating Q&A pairs..."):
	            succeeded = generator.generate_qa_pairs()
	            if not succeeded:
	                st.error("Q&A generation failed. Check logs for details.")
	            else:
	                st.success("Q&A pairs generated successfully!")

	    st.header("3. Output")
	    output_ui(generator)

	    st.header("4. Logs & Debug Information")
	    logs_ui()


	# Build the page only when this file is executed as the main module.
	if __name__ == "__main__":
	    main()