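# Spaces: Sleeping -- this appears to be hosting-page status text captured along with the listing; it is not part of the program.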
import streamlit as st | |
import pdfplumber | |
import pytesseract | |
from PIL import Image | |
import json | |
import pandas as pd | |
from io import BytesIO | |
import time | |
from openai import OpenAI | |
import groq | |
class SyntheticDataFactory:
    """Extract page content from a PDF and generate Q&A pairs with an LLM.

    Working state is kept in ``st.session_state``:

    * ``qa_data``    -- list of per-page dicts with keys ``page``, ``text``,
      ``images`` and, after generation, ``qa_pairs``.
    * ``processing`` -- dict with keys ``stage``, ``errors`` and ``progress``.
    """

    # Per-provider settings: "client" builds an API client from a key,
    # "models" lists the selectable model names, "key_label" is the label
    # shown next to the API-key input.
    PROVIDER_CONFIG = {
        "Deepseek": {
            "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
            "models": ["deepseek-chat"],
            "key_label": "Deepseek API Key"
        },
        "OpenAI": {
            "client": lambda key: OpenAI(api_key=key),
            "models": ["gpt-4-turbo"],
            "key_label": "OpenAI API Key"
        },
        "Groq": {
            "client": lambda key: groq.Groq(api_key=key),
            "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
            "key_label": "Groq API Key"
        }
    }

    def __init__(self):
        self.init_session_state()

    def init_session_state(self):
        """Create the session-state entries used by this class if missing."""
        if 'qa_data' not in st.session_state:
            st.session_state.qa_data = []
        if 'processing' not in st.session_state:
            st.session_state.processing = {
                'stage': 'idle',
                'errors': [],
                'progress': 0
            }

    def process_pdf(self, file):
        """Extract text and images from every page of ``file``.

        Results replace ``st.session_state.qa_data`` (they are not appended
        to whatever an earlier run left behind, which used to duplicate
        pages on every re-run).

        Args:
            file: Anything ``pdfplumber.open`` accepts.

        Returns:
            True on success; False if any exception occurred (the error is
            shown with ``st.error`` and ``qa_data`` is left empty).
        """
        # Drop results of any previous run so a failure here cannot leave a
        # stale 'complete' stage pointing at old data.
        st.session_state.qa_data = []
        st.session_state.processing['stage'] = 'extracting'
        st.session_state.processing['progress'] = 0
        try:
            records = []
            with pdfplumber.open(file) as pdf:
                pages = pdf.pages
                total = len(pages)
                for i, page in enumerate(pages):
                    st.session_state.processing['progress'] = (i + 1) / total
                    records.append({
                        "page": i + 1,
                        "text": page.extract_text() or "",
                        "images": self.process_images(page)
                    })
            # Publish only once the whole document was read successfully.
            st.session_state.qa_data = records
            return True
        except Exception as e:
            st.error(f"PDF processing failed: {str(e)}")
            return False

    @staticmethod
    def _raw_image_mode(width, height, size):
        """Return the PIL mode whose raw buffer is exactly ``size`` bytes.

        Returns "RGB", "CMYK", "L" or "1" for an uncompressed raster of
        ``width`` x ``height`` pixels, or None when ``size`` matches none of
        them.
        """
        pixels = width * height
        if size == pixels * 3:
            return "RGB"
        if size == pixels * 4:
            return "CMYK"
        if size == pixels:
            return "L"
        # 1 bit per pixel, each row padded to a whole byte.
        if size == ((width + 7) // 8) * height:
            return "1"
        return None

    def process_images(self, page):
        """Convert the images embedded in ``page`` to PIL images.

        The pixel mode is derived from the amount of data actually present
        rather than from the mere presence of a ``ColorSpace`` entry. Data
        that is not a plain raster is handed to ``Image.open`` so that
        encoded streams can be read.

        Returns:
            List of PIL images; images that cannot be read are skipped with
            an ``st.warning``.
        """
        images = []
        for img in page.images:
            try:
                stream = img['stream']
                # Both the long and the abbreviated dictionary keys are tried.
                width = int(stream.get('Width', stream.get('W', 0)))
                height = int(stream.get('Height', stream.get('H', 0)))
                if width <= 0 or height <= 0:
                    continue
                data = stream.get_data()
                mode = self._raw_image_mode(width, height, len(data))
                if mode is not None:
                    image = Image.frombytes(mode, (width, height), data)
                else:
                    # Presumably still encoded (e.g. a JPEG stream): let
                    # Pillow detect the format. load() forces decoding now so
                    # errors surface inside this try block.
                    image = Image.open(BytesIO(data))
                    image.load()
                images.append(image)
            except Exception as e:
                st.warning(f"Image processing error: {str(e)[:100]}")
        return images

    @staticmethod
    def _build_prompt(text):
        """Build the generation prompt for one page of text.

        The prompt names the top-level ``qa_pairs`` key explicitly because
        that is the key the response parser looks for.
        """
        return (
            f"Generate 3 Q&A pairs from this financial content:\n{text}\n"
            "Output a JSON object with a single key \"qa_pairs\" whose value "
            "is a list of objects with keys: question, answer_1, answer_2"
        )

    @staticmethod
    def _parse_qa_pairs(raw):
        """Parse a model reply into a list of Q&A dicts.

        Accepts ``{"qa_pairs": [...]}``, a bare list of pairs, or a single
        pair object; entries that are not dicts are dropped.

        Raises:
            json.JSONDecodeError: if ``raw`` is empty/None or not valid JSON.
        """
        parsed = json.loads(raw or "")
        if isinstance(parsed, dict):
            if "qa_pairs" in parsed:
                pairs = parsed["qa_pairs"]
            elif "question" in parsed:
                pairs = [parsed]
            else:
                pairs = []
        else:
            pairs = parsed
        if isinstance(pairs, dict):
            pairs = [pairs]
        if not isinstance(pairs, list):
            return []
        return [pair for pair in pairs if isinstance(pair, dict)]

    def generate_qa(self, provider, api_key, model, temp):
        """Generate Q&A pairs for every page in ``st.session_state.qa_data``.

        Each page dict receives a ``qa_pairs`` list. Pages without text are
        given an empty list without calling the API.

        Args:
            provider: Key of ``PROVIDER_CONFIG``.
            api_key: API key passed to the provider's client factory.
            model: Model name sent with each request.
            temp: Sampling temperature sent with each request.

        Returns:
            True when all pages were processed (stage becomes 'complete');
            False if an exception occurred (stage becomes 'error' and the
            error is shown with ``st.error``).
        """
        st.session_state.processing['stage'] = 'generating'
        try:
            client = self.PROVIDER_CONFIG[provider]["client"](api_key)
            for item in st.session_state.qa_data:
                text = (item.get('text') or "").strip()
                if not text:
                    # Nothing to ask about; skip the request entirely.
                    item["qa_pairs"] = []
                    continue
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": self._build_prompt(text)}],
                    temperature=temp,
                    response_format={"type": "json_object"}
                )
                try:
                    item["qa_pairs"] = self._parse_qa_pairs(
                        response.choices[0].message.content
                    )
                except json.JSONDecodeError:
                    # One bad reply should not abort the remaining pages.
                    item["qa_pairs"] = []
                    st.error("Failed to parse AI response")
            st.session_state.processing['stage'] = 'complete'
            return True
        except Exception as e:
            st.session_state.processing['stage'] = 'error'
            st.error(f"Generation failed: {str(e)}")
            return False
def main():
    """Render the Streamlit page: configuration, upload, processing, results.

    The sidebar collects provider, API key, model and temperature. Once a PDF
    and an API key are supplied, a button starts extraction followed by Q&A
    generation. When the processing stage in session state is 'complete', the
    generated pairs are shown as a table with a CSV download.
    """
    st.set_page_config(
        page_title="Enterprise Data Factory",
        # The original glyph was lost to mis-encoding (it arrived as a lone
        # Greek letter, which is not a valid emoji icon); a factory emoji is
        # used in its place.
        page_icon="🏭",
        layout="wide"
    )
    factory = SyntheticDataFactory()

    # Sidebar configuration
    with st.sidebar:
        st.header("⚙️ AI Configuration")
        provider = st.selectbox("Provider", list(factory.PROVIDER_CONFIG.keys()))
        config = factory.PROVIDER_CONFIG[provider]
        api_key = st.text_input(config["key_label"], type="password")
        model = st.selectbox("Model", config["models"])
        temp = st.slider("Temperature", 0.0, 1.0, 0.3)

    # Main interface
    st.title("🏭 Enterprise Synthetic Data Factory")
    uploaded_file = st.file_uploader("Upload Financial PDF", type=["pdf"])

    # The button is only rendered once both a file and a key are present.
    if uploaded_file and api_key and st.button("Start Synthetic Generation"):
        # Start from a clean slate so a failed run cannot display the
        # results of an earlier successful one.
        st.session_state.qa_data = []
        st.session_state.processing['stage'] = 'running'
        with st.status("Processing document...", expanded=True) as status:
            st.write("Extracting text and images...")
            succeeded = factory.process_pdf(uploaded_file)
            if succeeded:
                st.write("Generating synthetic data...")
                succeeded = factory.generate_qa(provider, api_key, model, temp)
            if succeeded:
                status.update(label="Processing complete!", state="complete", expanded=False)
            else:
                # Without this the box would not reflect the failure.
                st.session_state.processing['stage'] = 'error'
                status.update(label="Processing failed", state="error", expanded=True)

    # Display results
    if st.session_state.processing.get('stage') == 'complete':
        st.subheader("Generated Q&A Pairs")

        # Flatten to one row per Q&A pair, tagged with its page number.
        all_qa = []
        for item in st.session_state.qa_data:
            pairs = item.get("qa_pairs") or []
            if not isinstance(pairs, list):
                continue
            for qa in pairs:
                if isinstance(qa, dict):
                    # Copy instead of mutating the dict kept in session state.
                    all_qa.append({**qa, "page": item["page"]})

        if all_qa:
            df = pd.DataFrame(all_qa)
            st.dataframe(df)

            # Export options
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download as CSV",
                data=csv,
                file_name="synthetic_data.csv",
                mime="text/csv"
            )
        else:
            st.warning("No Q&A pairs generated. Check your document content and API settings.")
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()