# Spaces:
# Sleeping
# Sleeping
# app.py
import hashlib
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import numpy as np
import openai
import pandas as pd
import pdfplumber
import pytesseract
import streamlit as st
from PIL import Image
from transformers import pipeline
# Configuration

# Size of the thread pool used when extracting PDF pages.
MAX_THREADS = 4

# Provider label shown in the UI -> model identifier sent to the chat API.
SUPPORTED_MODELS = {
    "Deepseek": "deepseek-chat",
    "Llama-3-70B": "meta-llama/Meta-Llama-3-70B-Instruct",
    "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1",
}
def secure_api_handler():
    """Render the sidebar form that records an API key digest per provider.

    Makes sure ``st.session_state.api_keys`` exists, then shows a provider
    selector, a password-style text input and a "Store Key" button in the
    sidebar.  When the button is pressed with a non-empty key, the SHA-256
    hex digest of the key (not the key itself) is saved under the selected
    provider name; an empty key produces an error message instead.

    NOTE(review): a SHA-256 digest is one-way, so the stored value cannot be
    used to authenticate requests. Confirm whether the raw key was meant to
    be kept.
    """
    if "api_keys" not in st.session_state:
        st.session_state.api_keys = {}

    with st.sidebar:
        st.header("π API Management")
        provider = st.selectbox("Provider", list(SUPPORTED_MODELS.keys()))
        new_key = st.text_input(f"Enter {provider} API Key", type="password")

        # Nothing to do until the user presses the button.
        if not st.button("Store Key"):
            return

        if not new_key:
            st.error("Please enter a valid API key")
            return

        digest = hashlib.sha256(new_key.encode()).hexdigest()
        st.session_state.api_keys[provider] = digest
        st.success("Key stored securely")
def advanced_pdf_processor(uploaded_file):
    """Extract per-page text and embedded images from a PDF into session state.

    Resets ``st.session_state.document_data``, opens ``uploaded_file`` with
    pdfplumber and processes its pages on a pool of ``MAX_THREADS`` worker
    threads.  One ``{"page": int, "text": str, "images": list}`` record is
    appended per successfully processed page, in page order with pages
    numbered from 1.  A Streamlit rerun is triggered at the end.

    Worker threads only collect data and error messages.  Every ``st.*`` call
    is made afterwards on the calling thread, because Streamlit commands
    issued from threads without a script-run context are not rendered.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts; ``main`` passes
            the object returned by the file uploader.
    """
    st.session_state.document_data = []

    def process_page(page_data):
        """Return ``(record_or_None, error_messages)`` for one page.

        Runs on a worker thread, so it must not touch ``st.*``.
        """
        page_num, page = page_data
        problems = []
        try:
            text = page.extract_text() or ""
            images = []
            for img in page.images:
                try:
                    # NOTE(review): pdfplumber also exposes "srcsize"; confirm
                    # that width/height here are pixel dimensions matching the
                    # raw stream, otherwise frombytes() rejects the buffer.
                    width = int(img["width"])
                    height = int(img["height"])
                    stream = img["stream"]
                    # Advanced image processing
                    img_mode = "RGB"
                    if hasattr(stream, "colorspace"):
                        if "/DeviceCMYK" in str(stream.colorspace):
                            img_mode = "CMYK"
                    image = Image.frombytes(
                        img_mode, (width, height), stream.get_data()
                    )
                    if img_mode != "RGB":
                        image = image.convert("RGB")
                    images.append(image)
                except Exception as e:
                    # A broken image must not discard the page's text.
                    problems.append(f"Image processing error: {str(e)[:100]}")
            return {"page": page_num, "text": text, "images": images}, problems
        except Exception as e:
            problems.append(f"Page {page_num} error: {str(e)[:100]}")
            return None, problems

    # NOTE(review): all pages share one underlying PDF file object; confirm
    # pdfplumber tolerates concurrent extraction before raising MAX_THREADS.
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        with pdfplumber.open(uploaded_file) as pdf:
            # executor.map is lazy: drain it while the PDF is still open.
            outcomes = list(
                executor.map(process_page, enumerate(pdf.pages, 1))
            )

    # Back on the script thread: safe to report errors and touch session state.
    for result, problems in outcomes:
        for message in problems:
            st.error(message)
        if result:
            st.session_state.document_data.append(result)

    # st.experimental_rerun was replaced by st.rerun; prefer the new name and
    # fall back for older Streamlit releases.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()
def hybrid_text_extractor(entry):
    """Return a page's text, falling back to OCR of its images.

    The stripped ``entry["text"]`` is returned when it is non-empty.
    Otherwise, if ``entry["images"]`` is non-empty, each image is passed to
    ``pytesseract.image_to_string``.  Images whose OCR raises are reported
    with ``st.warning`` and skipped.  The recognised strings are joined with
    single spaces and stripped.

    Args:
        entry: Page record with a ``"text"`` string and an ``"images"`` list.

    Returns:
        The extracted text, possibly an empty string.
    """
    extracted = entry["text"].strip()

    # Native text wins; with no images there is nothing to OCR either.
    if extracted or not entry["images"]:
        return extracted

    recognised = []
    for picture in entry["images"]:
        try:
            recognised.append(pytesseract.image_to_string(picture))
        except Exception as e:
            st.warning(f"OCR failed: {str(e)[:100]}")

    return " ".join(recognised).strip()
def generate_with_retry(model, messages, max_retries=3):
    """Request a JSON chat completion, retrying failed calls with back-off.

    The model identifier and the API client are resolved once, before the
    retry loop.  Only the request itself and the JSON decoding of its reply
    are retried, sleeping ``2 ** attempt`` seconds (1, 2, 4, ...) between
    attempts.

    NOTE(review): the client always targets the DeepSeek endpoint with the
    ``DEEPSEEK_API_KEY`` secret, whichever ``model`` is selected. Confirm
    that the other ``SUPPORTED_MODELS`` entries are served there.

    Args:
        model: Key of ``SUPPORTED_MODELS`` naming the provider to use.
        messages: Chat messages in the OpenAI ``messages`` format.
        max_retries: Maximum number of request attempts; must be >= 1.

    Returns:
        The parsed JSON content of the first choice's message.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        KeyError: If ``model`` is not a key of ``SUPPORTED_MODELS``.
        Exception: Whatever the final attempt raised.
    """
    if max_retries < 1:
        # Previously the loop was skipped and None was returned silently.
        raise ValueError("max_retries must be at least 1")

    # Configuration errors (unknown model, missing key) are not transient,
    # so they are raised immediately instead of being retried with sleeps.
    model_id = SUPPORTED_MODELS[model]
    client = openai.OpenAI(
        base_url="https://api.deepseek.com/v1",
        api_key=st.secrets.get("DEEPSEEK_API_KEY"),
    )

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=messages,
                max_tokens=2048,
                response_format={"type": "json_object"},
                temperature=st.session_state.temperature,
            )
            return json.loads(response.choices[0].message.content)
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)
def qa_generation_workflow():
    """Generate Q&A pairs for every loaded page and store them in session state.

    For each record in ``st.session_state.document_data`` the page text is
    obtained with ``hybrid_text_extractor`` and sent to the selected model via
    ``generate_with_retry``.  The ``"qa_pairs"`` list of each reply is
    collected into ``st.session_state.qa_pairs``.  A progress bar and status
    line are updated per page.

    Pages with no text at all are skipped with a warning rather than sent to
    the model.  Replies whose ``"qa_pairs"`` is not a list are reported as a
    generation failure, and non-dict items are dropped, so later consumers
    always receive a list of dicts.  A failure on one page does not stop the
    remaining pages.
    """
    if not st.session_state.document_data:
        st.error("No document data loaded")
        return

    progress_bar = st.progress(0)
    status_text = st.empty()
    total_pages = len(st.session_state.document_data)
    qa_pairs = []

    for idx, entry in enumerate(st.session_state.document_data, 1):
        status_text.text(f"Processing page {idx}/{total_pages}...")
        progress_bar.progress(idx / total_pages)

        text_content = hybrid_text_extractor(entry)
        if not text_content:
            # An empty prompt only costs an API call and invites made-up answers.
            st.warning(f"Page {entry['page']} has no extractable text; skipped")
            continue

        prompt = (
            "Generate 3 sophisticated Q&A pairs from:\n"
            f"Page {entry['page']} Content:\n"
            f"{text_content}\n"
            'Return JSON format: {"qa_pairs": [{"question": "...", '
            '"answer_1": "...", "answer_2": "..."}]}'
        )

        try:
            response = generate_with_retry(
                st.session_state.model_choice,
                [{"role": "user", "content": prompt}],
            )
            pairs = response.get("qa_pairs", [])
            if not isinstance(pairs, list):
                raise TypeError("'qa_pairs' in the model reply is not a list")
            qa_pairs.extend(pair for pair in pairs if isinstance(pair, dict))
        except Exception as e:
            st.error(f"Generation failed: {str(e)[:100]}")

    st.session_state.qa_pairs = qa_pairs
    progress_bar.empty()
    status_text.success("Q&A generation completed!")
def evaluation_workflow():
    """Render the review UI for the generated Q&A pairs.

    Shows an error and returns when ``st.session_state`` holds no Q&A pairs.
    Otherwise renders two expanders:

    * "Automated Evaluation": a button whose action is not implemented yet;
      pressing it now says so instead of doing nothing.
    * "Human Evaluation": the first five pairs, each with its two answers
      side by side and a select box (key ``human_eval_<index>``, 0-based)
      for choosing the better answer.

    Pairs come from model output, so fields are read defensively: a missing
    field is shown as "(missing)" and a non-dict entry is skipped, instead of
    raising and taking the whole page down.
    """
    if not st.session_state.get("qa_pairs"):
        st.error("No Q&A pairs generated")
        return

    st.header("Quality Control Center")

    with st.expander("Automated Evaluation"):
        if st.button("Run AI Evaluation"):
            # Implementation for automated evaluation is still pending.
            st.info("Automated evaluation is not implemented yet.")

    with st.expander("Human Evaluation"):
        for idx, pair in enumerate(st.session_state.qa_pairs[:5]):
            if not isinstance(pair, dict):
                continue

            question = pair.get("question", "(missing)")
            st.write(f"**Question {idx+1}:** {question}")

            col1, col2 = st.columns(2)
            with col1:
                st.write("Answer 1:", pair.get("answer_1", "(missing)"))
            with col2:
                st.write("Answer 2:", pair.get("answer_2", "(missing)"))

            st.selectbox(
                f"Select better answer for Q{idx+1}",
                ["Answer 1", "Answer 2", "Both Bad"],
                key=f"human_eval_{idx}",
            )
def main():
    """Run the Streamlit app: configure, ingest a PDF, generate, review, export.

    Sets the page configuration, initialises the session-state entries used
    by the app, and renders the sidebar (model choice, temperature, PDF
    upload).  Pressing "Initialize Data Generation" processes the uploaded
    PDF.  Q&A pairs are then generated once for that document, followed by
    the evaluation UI and an export section offering JSON, CSV or Parquet
    download.

    Streamlit re-executes this function on every widget interaction, so
    generation is guarded by the ``qa_generated`` session flag.  Without it,
    every select-box change or button press would call the model again for
    all pages.  The flag is reset whenever a document is (re)processed.
    """
    # NOTE(review): the icon/title glyphs below look like mis-decoded emoji;
    # restore them from the original file if available.
    st.set_page_config(
        page_title="Synthetic Data Factory",
        page_icon="π",
        layout="wide",
    )

    # Initialize session state
    if 'document_data' not in st.session_state:
        st.session_state.document_data = []
    if 'qa_pairs' not in st.session_state:
        st.session_state.qa_pairs = []
    if 'qa_generated' not in st.session_state:
        st.session_state.qa_generated = False

    # Sidebar configuration
    with st.sidebar:
        st.title("βοΈ Configuration")
        st.session_state.model_choice = st.selectbox(
            "LLM Provider",
            list(SUPPORTED_MODELS.keys()),
        )
        st.session_state.temperature = st.slider(
            "Creativity Level",
            0.0, 1.0, 0.3,
        )
        st.file_uploader(
            "Upload PDF Document",
            type=["pdf"],
            key="doc_upload",
        )

    # Main interface
    st.title("π Synthetic Data Factory")
    st.write("Enterprise-grade synthetic data generation powered by cutting-edge AI")

    # Document processing pipeline
    if st.session_state.doc_upload:
        if st.button("Initialize Data Generation"):
            # Reset first: the processor ends by triggering a rerun, so
            # nothing placed after the call would execute.
            st.session_state.qa_pairs = []
            st.session_state.qa_generated = False
            with st.spinner("Deploying AI Workers..."):
                advanced_pdf_processor(st.session_state.doc_upload)

    # Q&A Generation (once per processed document, not once per rerun)
    if st.session_state.document_data and not st.session_state.qa_generated:
        qa_generation_workflow()
        st.session_state.qa_generated = True

    # Evaluation system
    if st.session_state.qa_pairs:
        evaluation_workflow()

    # Data export
    if st.session_state.qa_pairs:
        st.divider()
        st.header("Data Export")
        export_format = st.radio(
            "Export Format",
            ["JSON", "CSV", "Parquet"],
        )
        if st.button("Generate Export Package"):
            df = pd.DataFrame(st.session_state.qa_pairs)
            buffer = BytesIO()
            if export_format == "JSON":
                df.to_json(buffer, orient="records")
            elif export_format == "CSV":
                df.to_csv(buffer, index=False)
            else:
                df.to_parquet(buffer)
            st.download_button(
                label="Download Dataset",
                data=buffer.getvalue(),
                file_name=f"synthetic_data_{int(time.time())}.{export_format.lower()}",
                mime="application/octet-stream",
            )


if __name__ == "__main__":
    main()