mgbam committed on
Commit
253afd8
·
1 Parent(s): 59a8974

Add application file

Browse files
Files changed (1) hide show
  1. app.py +267 -0
app.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import pdfplumber
4
+ import pytesseract
5
+ from PIL import Image
6
+ import os
7
+ import json
8
+ import openai
9
+ import pandas as pd
10
+ import numpy as np
11
+ from io import BytesIO
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from transformers import pipeline
14
+ import hashlib
15
+ import time
16
+
17
+ # Configuration
18
+ MAX_THREADS = 4
19
+ SUPPORTED_MODELS = {
20
+ "Deepseek": "deepseek-chat",
21
+ "Llama-3-70B": "meta-llama/Meta-Llama-3-70B-Instruct",
22
+ "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1"
23
+ }
24
+
25
def secure_api_handler():
    """Sidebar widget for collecting per-provider API keys.

    Keys are kept only in ``st.session_state`` (in-memory, per browser
    session, never written to disk).

    BUG FIX: the previous revision stored a SHA-256 *hash* of the key.
    Hashing is one-way, so the stored value could never be used to
    authenticate against the provider — every stored key was unusable.
    We now keep the raw key for the session; the hash served no purpose.
    """
    if 'api_keys' not in st.session_state:
        st.session_state.api_keys = {}

    with st.sidebar:
        st.header("🔑 API Management")
        provider = st.selectbox("Provider", list(SUPPORTED_MODELS.keys()))
        new_key = st.text_input(f"Enter {provider} API Key", type="password")

        if st.button("Store Key"):
            if new_key:
                # Session-scoped storage; the key must stay usable for API calls.
                st.session_state.api_keys[provider] = new_key
                st.success("Key stored securely")
            else:
                st.error("Please enter a valid API key")
42
+
43
def advanced_pdf_processor(uploaded_file):
    """Extract text and embedded images from every page of *uploaded_file*.

    Pages are processed concurrently by a thread pool.  Each successful
    page contributes a ``{"page": int, "text": str, "images": [PIL.Image]}``
    dict to ``st.session_state.document_data``; failed pages/images are
    reported via the Streamlit UI and skipped.
    """
    st.session_state.document_data = []

    def process_page(page_data):
        """Worker: pull text and images out of one pdfplumber page."""
        page_num, page = page_data
        try:
            text = page.extract_text() or ""
            images = []

            for idx, img in enumerate(page.images):
                try:
                    width = int(img["width"])
                    height = int(img["height"])
                    stream = img["stream"]

                    # CMYK raster data must be converted to RGB before
                    # Streamlit display or Tesseract OCR can use it.
                    img_mode = "RGB"
                    if hasattr(stream, "colorspace"):
                        if "/DeviceCMYK" in str(stream.colorspace):
                            img_mode = "CMYK"

                    image = Image.frombytes(img_mode, (width, height), stream.get_data())
                    if img_mode != "RGB":
                        image = image.convert("RGB")

                    images.append(image)
                except Exception as e:
                    st.error(f"Image processing error: {str(e)[:100]}")

            return {"page": page_num, "text": text, "images": images}
        except Exception as e:
            st.error(f"Page {page_num} error: {str(e)[:100]}")
            return None

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        with pdfplumber.open(uploaded_file) as pdf:
            # BUG FIX: executor.map() returns a *lazy* iterator.  The original
            # code iterated it after both context managers had exited, so every
            # page was read from an already-closed PDF.  Materialize the
            # results while the document is still open.
            results = list(executor.map(process_page, enumerate(pdf.pages, 1)))

    for result in results:
        if result:
            st.session_state.document_data.append(result)
    # NOTE(review): st.experimental_rerun() is deprecated and removed in
    # recent Streamlit releases (replaced by st.rerun()) — confirm the
    # deployed Streamlit version before changing the call.
    st.experimental_rerun()
86
+
87
def hybrid_text_extractor(entry):
    """Return the text for one page entry, falling back to OCR on its images.

    ``entry`` is a ``{"text": str, "images": [...]}`` dict as produced by the
    PDF processor.  When the extracted text is empty and images exist, each
    image is run through Tesseract and the fragments are joined with spaces.
    """
    content = entry["text"].strip()
    if content or not entry["images"]:
        return content

    fragments = []
    for image in entry["images"]:
        try:
            fragments.append(pytesseract.image_to_string(image))
        except Exception as err:
            st.warning(f"OCR failed: {str(err)[:100]}")
    return " ".join(fragments).strip()
101
+
102
def generate_with_retry(model, messages, max_retries=3):
    """Call the chat-completions API and parse its JSON reply, with retries.

    Args:
        model: key into ``SUPPORTED_MODELS`` selecting the model id.
        messages: chat-message list passed straight through to the API.
        max_retries: attempts before the final error is re-raised.

    Returns:
        The parsed JSON object from the model response.

    Raises:
        Whatever the final failed attempt raised (API error or JSON
        decode error on a malformed response).
    """
    # Hoisted out of the retry loop: the client is loop-invariant and the
    # original rebuilt it on every attempt.
    # NOTE(review): base_url is always DeepSeek even when a Llama/Mixtral
    # model id is selected — confirm all SUPPORTED_MODELS are served there.
    client = openai.OpenAI(
        base_url="https://api.deepseek.com/v1",
        api_key=st.secrets.get("DEEPSEEK_API_KEY")
    )

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=SUPPORTED_MODELS[model],
                messages=messages,
                max_tokens=2048,
                response_format={"type": "json_object"},
                temperature=st.session_state.temperature
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            # Exponential backoff: 1s, 2s, 4s, ...
            time.sleep(2 ** attempt)
124
+
125
def qa_generation_workflow():
    """Generate Q&A pairs for every loaded page and stash them in session state.

    Walks ``st.session_state.document_data``, extracts text per page (with
    OCR fallback), asks the selected LLM for 3 Q&A pairs per page, and
    accumulates everything into ``st.session_state.qa_pairs``.  Progress is
    shown in the UI; per-page generation failures are reported and skipped.
    """
    if not st.session_state.document_data:
        st.error("No document data loaded")
        return

    progress_bar = st.progress(0)
    status_text = st.empty()

    pages = st.session_state.document_data
    total_pages = len(pages)
    collected = []

    for page_index, entry in enumerate(pages):
        status_text.text(f"Processing page {page_index+1}/{total_pages}...")
        progress_bar.progress((page_index+1)/total_pages)

        page_text = hybrid_text_extractor(entry)

        prompt = f"""Generate 3 sophisticated Q&A pairs from:
Page {entry['page']} Content:
{page_text}

Return JSON format: {{"qa_pairs": [{{"question": "...", "answer_1": "...", "answer_2": "..."}}]}}"""

        try:
            reply = generate_with_retry(
                st.session_state.model_choice,
                [{"role": "user", "content": prompt}]
            )
            collected.extend(reply.get("qa_pairs", []))
        except Exception as e:
            st.error(f"Generation failed: {str(e)[:100]}")

    st.session_state.qa_pairs = collected
    progress_bar.empty()
    status_text.success("Q&A generation completed!")
161
+
162
def evaluation_workflow():
    """Render the human/AI evaluation UI for the generated Q&A pairs.

    Displays up to five pairs with both candidate answers side by side and
    records the reviewer's preference under a per-question
    ``human_eval_<idx>`` session-state key.  The automated-evaluation
    branch is currently a placeholder.
    """
    if not st.session_state.get("qa_pairs"):
        st.error("No Q&A pairs generated")
        return

    st.header("Quality Control Center")

    with st.expander("Automated Evaluation"):
        if st.button("Run AI Evaluation"):
            # Implementation for automated evaluation
            pass

    with st.expander("Human Evaluation"):
        for i, qa in enumerate(st.session_state.qa_pairs[:5]):
            st.write(f"**Question {i+1}:** {qa['question']}")
            left, right = st.columns(2)
            with left:
                st.write("Answer 1:", qa["answer_1"])
            with right:
                st.write("Answer 2:", qa["answer_2"])
            st.selectbox(
                f"Select better answer for Q{i+1}",
                ["Answer 1", "Answer 2", "Both Bad"],
                key=f"human_eval_{i}"
            )
188
+
189
def _init_session_state():
    """Create the session-state keys the pipeline reads, if missing."""
    if 'document_data' not in st.session_state:
        st.session_state.document_data = []
    if 'qa_pairs' not in st.session_state:
        st.session_state.qa_pairs = []


def _render_sidebar():
    """Sidebar controls: model choice, temperature slider and PDF uploader."""
    with st.sidebar:
        st.title("⚙️ Configuration")
        st.session_state.model_choice = st.selectbox(
            "LLM Provider",
            list(SUPPORTED_MODELS.keys())
        )
        st.session_state.temperature = st.slider(
            "Creativity Level",
            0.0, 1.0, 0.3
        )
        # The uploaded file becomes reachable as st.session_state.doc_upload.
        st.file_uploader(
            "Upload PDF Document",
            type=["pdf"],
            key="doc_upload"
        )


def _export_bytes(qa_pairs, export_format):
    """Serialize *qa_pairs* to bytes in the chosen format (JSON/CSV/Parquet)."""
    df = pd.DataFrame(qa_pairs)
    buffer = BytesIO()
    if export_format == "JSON":
        df.to_json(buffer, orient="records")
    elif export_format == "CSV":
        df.to_csv(buffer, index=False)
    else:
        # NOTE(review): to_parquet needs pyarrow or fastparquet installed.
        df.to_parquet(buffer)
    return buffer.getvalue()


def _render_export_section():
    """Export UI: pick a format and download the generated dataset."""
    st.divider()
    st.header("Data Export")

    export_format = st.radio(
        "Export Format",
        ["JSON", "CSV", "Parquet"]
    )

    if st.button("Generate Export Package"):
        # BUG FIX: the original always sent application/octet-stream, which
        # prevents browsers from handling JSON/CSV sensibly.  Serve a
        # format-appropriate MIME type instead.
        mime_types = {
            "JSON": "application/json",
            "CSV": "text/csv",
            "Parquet": "application/octet-stream",
        }
        st.download_button(
            label="Download Dataset",
            data=_export_bytes(st.session_state.qa_pairs, export_format),
            file_name=f"synthetic_data_{int(time.time())}.{export_format.lower()}",
            mime=mime_types[export_format]
        )


def main():
    """Streamlit entry point: upload → generate → evaluate → export pipeline."""
    st.set_page_config(
        page_title="Synthetic Data Factory",
        page_icon="🏭",
        layout="wide"
    )

    _init_session_state()
    _render_sidebar()

    # Main interface
    st.title("🏭 Synthetic Data Factory")
    st.write("Enterprise-grade synthetic data generation powered by cutting-edge AI")

    # Document processing pipeline
    if st.session_state.doc_upload:
        if st.button("Initialize Data Generation"):
            with st.spinner("Deploying AI Workers..."):
                advanced_pdf_processor(st.session_state.doc_upload)

    # Q&A generation
    if st.session_state.document_data:
        qa_generation_workflow()

    # Evaluation system
    if st.session_state.qa_pairs:
        evaluation_workflow()

    # Data export
    if st.session_state.qa_pairs:
        _render_export_section()
265
+
266
+ if __name__ == "__main__":
267
+ main()