Update app.py
app.py CHANGED

@@ -17,280 +17,215 @@ import traceback

Before (removed lines marked "-"; several removed lines did not survive extraction and are shown as bare "-"):

 # Configuration
 MAX_THREADS = 4
 SUPPORTED_MODELS = {
-    "Deepseek":
-
-
 }

 def initialize_session_state():
-    """Initialize all session state variables"""
-
         'document_data': [],
         'qa_pairs': [],
         'processing_complete': False,
         'current_stage': 'idle',
         'api_keys': {},
         'model_choice': "Deepseek",
-        'temperature': 0.3
     }

-    for key, value in
         if key not in st.session_state:
             st.session_state[key] = value

-def secure_api_management():
-    """
-
-
-
-
-
-
-
-
-
-
-        st.error("Please enter a valid API key")

 def process_image(img_data, page_num, img_idx):
-    """
     try:
         img = img_data["stream"]
         width = int(img_data["width"])
         height = int(img_data["height"])

-
-
-        mode = "RGB"
-        if "/DeviceCMYK" in str(color_space):
-            mode = "CMYK"
-        elif "/DeviceGray" in str(color_space):
-            mode = "L"
-
         # Convert image to RGB
-
-
-

-        return image
-    except Exception as e:
-        st.error(f"Image processing error (Page {page_num}, Image {img_idx}): {str(e)[:100]}")
-        return None
-
-def process_page(page_data):
-    """Thread-safe page processing"""
-    page_num, page = page_data
-    try:
-        text = page.extract_text() or ""
-        images = []
-
-        for idx, img in enumerate(page.images):
-            processed_image = process_image(img, page_num, idx)
-            if processed_image:
-                images.append(processed_image)
-
-        return {"page": page_num, "text": text.strip(), "images": images}
     except Exception as e:
-        st.error(f"Page {page_num}
         return None

-def advanced_pdf_processor(uploaded_file):
-    """
-    st.session_state.

-
     with pdfplumber.open(uploaded_file) as pdf:
-
-
-        )
-
-        while not future.done():
-            time.sleep(0.1)
-            st.rerun()

-

-
-
-
-
-
-def hybrid_text_extraction(entry):
-    """Multimodal text extraction with fallback"""
-    text_content = entry["text"]
-
-    if not text_content and entry["images"]:
-        ocr_results = []
-        for img in entry["images"]:
-            try:
-                ocr_results.append(pytesseract.image_to_string(img))
-            except Exception as e:
-                st.warning(f"OCR failed: {str(e)[:100]}")
-        text_content = " ".join(ocr_results).strip()
-
-    return text_content

-def generate_with_retry(model, messages):
-    """
-
-
-        api_key=st.secrets.get("DEEPSEEK_API_KEY")
-    )

-
-
             response = client.chat.completions.create(
-                model=SUPPORTED_MODELS[model],
-                messages=
                 max_tokens=2048,
                 response_format={"type": "json_object"},
                 temperature=st.session_state.temperature
             )
-            return json.loads(response.choices[0].message.content)
-        except Exception as e:
-            if attempt == 2:
-                raise
-            time.sleep(2 ** attempt)
-
-def qa_generation_workflow():
-    """Enterprise Q&A generation pipeline"""
-    with st.status("🚀 AI Processing Pipeline", expanded=True) as status:
-        try:
-            st.write("Initializing neural processors...")
-            total_pages = len(st.session_state.document_data)
-            qa_pairs = []
-
-            for idx, entry in enumerate(st.session_state.document_data):
-                status.write(f"Processing page {idx+1}/{total_pages}")
-                text_content = hybrid_text_extraction(entry)
-
-                prompt = f"""Generate 3 sophisticated Q&A pairs from:
-                Page {entry['page']} Content:
-                {text_content}
-
-                Return JSON format: {{"qa_pairs": [{{"question": "...", "answer_1": "...", "answer_2": "..."}}]}}"""
-
-                response = generate_with_retry(
-                    st.session_state.model_choice,
-                    [{"role": "user", "content": prompt}]
-                )
-                qa_pairs.extend(response.get("qa_pairs", []))

-
-
-
-
-
-
-def evaluation_interface():
-    """Interactive quality control center"""
-    st.header("🧪 Quality Control Hub")
-
-    with st.expander("Automated AI Evaluation", expanded=True):
-        if st.button("Run Batch Validation"):
-            with st.spinner("Validating responses..."):
-                time.sleep(2)  # Simulated validation
-                st.success("Quality check passed: 98% accuracy")
-
-    with st.expander("Human-in-the-Loop Review"):
-        sample_size = min(5, len(st.session_state.qa_pairs))
-        for idx in range(sample_size):
-            pair = st.session_state.qa_pairs[idx]
-            with st.container(border=True):
-                col1, col2 = st.columns([1, 3])
-                with col1:
-                    st.metric("Page", pair["page"])
-                with col2:
-                    st.write(f"**Question:** {pair['question']}")
-
-                    tab1, tab2 = st.tabs(["Answer 1", "Answer 2"])
-                    with tab1:
-                        st.write(pair["answer_1"])
-                    with tab2:
-                        st.write(pair["answer_2"])

-def data_export_module():
-
-
-
-
-
-
-
-
-
-    col1, col2, col3 = st.columns(3)
-    with col1:
-        export_format = st.selectbox("Format", ["JSON", "CSV", "Parquet"])
-    with col2:
-        compression = st.selectbox("Compression", ["None", "gzip", "zip"])
-    with col3:
-        include_metadata = st.checkbox("Include Metadata", True)
-
-    if st.button("Generate Export Package"):
-        with st.spinner("Packaging data..."):
-            df = pd.DataFrame(st.session_state.qa_pairs)
-            buffer = BytesIO()
-
-            if export_format == "JSON":
-                df.to_json(buffer, orient="records", indent=2)
-                mime = "application/json"
-            elif export_format == "CSV":
-                df.to_csv(buffer, index=False)
-                mime = "text/csv"
-            else:
-                df.to_parquet(buffer, compression=compression if compression != "None" else None)
-                mime = "application/octet-stream"
-
-            st.download_button(
-                label="Download Dataset",
-                data=buffer.getvalue(),
-                file_name=f"synthetic_data_{int(time.time())}.{export_format.lower()}",
-                mime=mime
-            )
-
-def main_interface():
-    """Core application interface"""
-    st.title("🏭 Synthetic Data Factory")
-    st.write("Industrial-scale synthetic data generation powered by cutting-edge AI")
-
-    # Processing pipeline
-    if uploaded_file := st.sidebar.file_uploader("Upload PDF Document", type=["pdf"]):
-        if st.sidebar.button("Start Generation"):
-            st.session_state.processing_complete = False
-            advanced_pdf_processor(uploaded_file)
-            qa_generation_workflow()
-            st.session_state.processing_complete = True
-
-    # Display results
-    if st.session_state.processing_complete:
-        evaluation_interface()
-        data_export_module()

 def main():
-    """Main application
     st.set_page_config(
-        page_title="Synthetic Data
-        page_icon="
         layout="wide"
     )

     initialize_session_state()
-    secure_api_management()

     with st.sidebar:
-        st.header("⚙️
         st.session_state.model_choice = st.selectbox(
-            "AI Model",
-            list(SUPPORTED_MODELS.keys())
         )
         st.session_state.temperature = st.slider(
-            "Creativity Level",
-            0.0, 1.0, 0.3
         )

-    main_interface()

 if __name__ == "__main__":
     main()

After (added lines marked "+"):

 # Configuration
 MAX_THREADS = 4
 SUPPORTED_MODELS = {
+    "Deepseek": {
+        "model": "deepseek-chat",
+        "base_url": "https://api.deepseek.com/v1"
+    }
 }

+def debug_log(message):
+    """Enhanced logging system"""
+    if st.session_state.get("debug_mode"):
+        st.toast(f"DEBUG: {message}", icon="🐛")
+
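The registry keys double as the labels offered in the sidebar, and each entry carries everything the OpenAI-compatible client needs, so supporting another provider should be a pure data change. A hypothetical second entry (illustrative only, not part of this diff) would slot in like this:

SUPPORTED_MODELS = {
    "Deepseek": {
        "model": "deepseek-chat",
        "base_url": "https://api.deepseek.com/v1"
    },
    # Hypothetical entry: any OpenAI-compatible endpoint works the same way.
    "LocalLlama": {
        "model": "llama-3-8b-instruct",
        "base_url": "http://localhost:8000/v1"
    }
}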
 def initialize_session_state():
+    """Initialize all session state variables with validation"""
+    required_keys = {
         'document_data': [],
         'qa_pairs': [],
         'processing_complete': False,
         'current_stage': 'idle',
         'api_keys': {},
         'model_choice': "Deepseek",
+        'temperature': 0.3,
+        'debug_mode': True
     }

+    for key, value in required_keys.items():
         if key not in st.session_state:
             st.session_state[key] = value

+def show_processing_status():
+    """Visual feedback system"""
+    status_messages = {
+        'idle': "🟢 Ready to process",
+        'extracting': "🔍 Extracting document content...",
+        'generating': "🧠 Generating Q&A pairs...",
+        'evaluating': "📊 Evaluating results...",
+        'error': "❌ Processing failed"
+    }
+
+    status = st.session_state.current_stage
+    debug_log(f"Status update: {status}")
+    st.sidebar.markdown(f"**System Status:** {status_messages.get(status, 'Unknown')}")

 def process_image(img_data, page_num, img_idx):
+    """Robust image processing with validation"""
     try:
         img = img_data["stream"]
         width = int(img_data["width"])
         height = int(img_data["height"])

+        debug_log(f"Processing image {img_idx} on page {page_num}")
+
         # Convert image to RGB
+        try:
+            return Image.frombytes("RGB", (width, height), img.get_data())
+        except:
+            return Image.frombytes("L", (width, height), img.get_data()).convert("RGB")

     except Exception as e:
+        st.error(f"Image processing failed (Page {page_num}, Image {img_idx}): {str(e)}")
         return None
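Image.frombytes only decodes raw, unfiltered pixel buffers, so the RGB-then-grayscale fallback above assumes the PDF stores its images uncompressed; a JPEG-encoded stream would still fail and fall through to the outer handler. A standalone sketch of the size contract the call relies on:

from PIL import Image

# "RGB" expects exactly width * height * 3 bytes; a 2x2 image needs 12.
raw = bytes(range(12))
img = Image.frombytes("RGB", (2, 2), raw)
assert img.size == (2, 2) and img.mode == "RGB"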

+def pdf_processing_workflow(uploaded_file):
+    """PDF processing with real-time feedback"""
+    st.session_state.current_stage = 'extracting'

+    try:
         with pdfplumber.open(uploaded_file) as pdf:
+            total_pages = len(pdf.pages)
+            progress_bar = st.progress(0)
+            status_text = st.empty()

+            for page_num, page in enumerate(pdf.pages, 1):
+                status_text.text(f"Processing page {page_num}/{total_pages}")
+                progress_bar.progress(page_num/total_pages)
+
+                try:
+                    text = page.extract_text() or ""
+                    images = [process_image(img, page_num, idx)
+                              for idx, img in enumerate(page.images)]
+
+                    st.session_state.document_data.append({
+                        "page": page_num,
+                        "text": text.strip(),
+                        "images": [img for img in images if img is not None]
+                    })
+                except Exception as e:
+                    st.error(f"Page {page_num} error: {str(e)}")
+
+                time.sleep(0.1)  # Simulate processing
+
+            progress_bar.empty()
+            status_text.success("Document processing complete!")
+            return True

+    except Exception as e:
+        st.session_state.current_stage = 'error'
+        st.error(f"PDF processing failed: {str(e)}")
+        debug_log(traceback.format_exc())
+        return False

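The rewrite walks pages sequentially, while the removed version fanned them out to a thread pool, which is what the now-unused MAX_THREADS constant and the deleted process_page helper were for. A minimal sketch of that older pattern, assuming the removed process_page((page_num, page)) signature:

from concurrent.futures import ThreadPoolExecutor

def process_pages_concurrently(pages, max_threads=MAX_THREADS):
    # Fan pages out to worker threads; keep only pages that processed.
    with ThreadPoolExecutor(max_workers=max_threads) as pool:
        results = pool.map(process_page, enumerate(pages, 1))
    return [r for r in results if r is not None]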
+def generate_qa_pairs():
+    """Q&A generation with validation"""
+    st.session_state.current_stage = 'generating'
+    qa_pairs = []

+    try:
+        client = openai.OpenAI(
+            base_url=SUPPORTED_MODELS[st.session_state.model_choice]["base_url"],
+            api_key=st.secrets["DEEPSEEK_API_KEY"]
+        )
+
+        for idx, entry in enumerate(st.session_state.document_data):
+            text_content = entry["text"] or " ".join([
+                pytesseract.image_to_string(img) for img in entry["images"]
+            ])
+
             response = client.chat.completions.create(
+                model=SUPPORTED_MODELS[st.session_state.model_choice]["model"],
+                messages=[{
+                    "role": "user",
+                    "content": f"Generate 3 Q&A pairs from:\n{text_content}\nReturn JSON format: {{'qa_pairs': [{{'question': '...', 'answer_1': '...', 'answer_2': '...'}}]}}"
+                }],
                 max_tokens=2048,
                 response_format={"type": "json_object"},
                 temperature=st.session_state.temperature
             )

+            try:
+                result = json.loads(response.choices[0].message.content)
+                qa_pairs.extend(result.get("qa_pairs", []))
+                debug_log(f"Generated {len(result.get('qa_pairs', []))} pairs for page {entry['page']}")
+            except json.JSONDecodeError:
+                st.error(f"Invalid response format from API for page {entry['page']}")

+        st.session_state.qa_pairs = qa_pairs
+        st.session_state.current_stage = 'evaluating'
+        return True
+
+    except Exception as e:
+        st.session_state.current_stage = 'error'
+        st.error(f"Q&A generation failed: {str(e)}")
+        debug_log(traceback.format_exc())
+        return False

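The deleted generate_with_retry helper wrapped this same completions call in up to three attempts with exponential backoff (time.sleep(2 ** attempt), re-raising on the final failure); the rewrite calls the API once per page. A minimal reconstruction of the removed pattern — the callable parameter is an assumption, the original took model and messages directly:

def call_with_retry(create_fn, attempts=3):
    # Exponential backoff, mirroring the removed generate_with_retry helper.
    for attempt in range(attempts):
        try:
            return create_fn()  # create_fn is assumed to issue the API call
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(2 ** attempt)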
 def main():
+    """Main application interface"""
     st.set_page_config(
+        page_title="Synthetic Data Generator",
+        page_icon="🧪",
         layout="wide"
     )

     initialize_session_state()

+    # Debug panel
     with st.sidebar:
+        st.header("⚙️ Configuration")
         st.session_state.model_choice = st.selectbox(
+            "AI Model", list(SUPPORTED_MODELS.keys())
         )
         st.session_state.temperature = st.slider(
+            "Creativity Level", 0.0, 1.0, 0.3
         )
+        st.session_state.debug_mode = st.checkbox("Debug Mode", True)
+        show_processing_status()

+    st.title("🧪 Synthetic Data Generator")
+
+    # File upload section
+    uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])
+
+    if uploaded_file and st.button("Start Processing"):
+        if pdf_processing_workflow(uploaded_file):
+            if generate_qa_pairs():
+                st.success("Processing completed successfully!")
+
+    # Show results
+    st.header("Generated Q&A Pairs")
+    for idx, pair in enumerate(st.session_state.qa_pairs[:10]):
+        with st.expander(f"Q{idx+1}: {pair['question']}"):
+            st.write(f"**Answer 1:** {pair['answer_1']}")
+            st.write(f"**Answer 2:** {pair['answer_2']}")
+
+    # Data export
+    st.header("Data Export")
+    df = pd.DataFrame(st.session_state.qa_pairs)
+    st.download_button(
+        label="Download as CSV",
+        data=df.to_csv(index=False).encode('utf-8'),
+        file_name="synthetic_data.csv",
+        mime="text/csv"
+    )
+
+    # Debug information
+    if st.session_state.debug_mode:
+        with st.expander("Debug Information"):
+            st.write("### Session State")
+            st.json(st.session_state)
+
+            if st.session_state.get("document_data"):
+                st.write("### Document Data Summary")
+                st.write(f"Pages processed: {len(st.session_state.document_data)}")
+                st.write(f"Total images extracted: {sum(len(p['images']) for p in st.session_state.document_data)}")
+
+            if st.session_state.get("qa_pairs"):
+                st.write("### Q&A Statistics")
+                st.write(f"Total pairs generated: {len(st.session_state.qa_pairs)}")
+                st.write("Sample Q&A pairs:")
+                st.table(pd.DataFrame(st.session_state.qa_pairs[:3]))

 if __name__ == "__main__":
     main()
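One review caveat: once a document has been processed, document_data holds PIL image objects, which json cannot serialize, so st.json(st.session_state) in the debug expander can fail. A safer option is to summarize the state first; a sketch (the helper below is hypothetical, not part of the diff):

def session_summary(state):
    # JSON-safe view of session state: images reported as counts only.
    return {
        "stage": state.get("current_stage"),
        "qa_pairs": len(state.get("qa_pairs", [])),
        "pages": [
            {"page": p["page"], "chars": len(p["text"]), "images": len(p["images"])}
            for p in state.get("document_data", [])
        ]
    }

# In the debug expander: st.json(session_summary(st.session_state))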