mgbam committed
Commit fe72195 · verified · 1 Parent(s): 4e9339a

Update app.py

Files changed (1):
app.py: +56 -219
app.py CHANGED
@@ -1,250 +1,87 @@
  import streamlit as st
  import pdfplumber
  import pytesseract
- import openai
- from openai import OpenAI
  import json
  import pandas as pd
- import numpy as np
- from PIL import Image
  from io import BytesIO
  import time
- import traceback
- import os
- import hashlib
  import groq

- class SyntheticDataGenerator:
-     def __init__(self):
-         self.SUPPORTED_MODELS = {
-             "Deepseek": {
-                 "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
-                 "models": ["deepseek-chat"],
-                 "key_name": "DEEPSEEK_KEY"
-             },
-             "OpenAI": {
-                 "client": lambda key: OpenAI(api_key=key),
-                 "models": ["gpt-4-turbo"],
-                 "key_name": "OPENAI_KEY"
-             },
-             "Mistral-Groq": {
-                 "client": lambda key: groq.Groq(api_key=key),
-                 "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
-                 "key_name": "GROQ_KEY"
-             }
          }
-         self.init_session()

-     def init_session(self):
-         if 'qa_pairs' not in st.session_state:
-             st.session_state.qa_pairs = []
-         if 'doc_data' not in st.session_state:
-             st.session_state.doc_data = []
          if 'processing' not in st.session_state:
              st.session_state.processing = {
                  'stage': 'idle',
-                 'errors': [],
-                 'warnings': []
              }

-     def process_pdf(self, uploaded_file):
-         """Robust PDF processing with advanced image handling"""
-         st.session_state.processing = {'stage': 'extracting', 'errors': [], 'warnings': []}
-
-         try:
-             with pdfplumber.load(uploaded_file) as pdf:
-                 for page_num, page in enumerate(pdf.pages, 1):
-                     page_data = self._process_page(page, page_num)
-                     st.session_state.doc_data.append(page_data)
-
-             if len(st.session_state.processing['errors']) > 0:
-                 st.error(f"Processed with {len(st.session_state.processing['errors'])} errors")
-             return True
-         except Exception as e:
-             self._log_error(f"PDF loading failed: {str(e)}")
-             return False
-
-     def _process_page(self, page, page_num):
-         """Process individual page with nested error handling"""
-         page_data = {"page": page_num, "text": "", "images": []}
-
-         try:
-             page_data["text"] = page.extract_text() or ""
-         except Exception as e:
-             self._log_error(f"Page {page_num} text extraction failed: {str(e)}")
-
-         try:
-             for img_idx, img in enumerate(page.images):
-                 img_data = self._process_image(img, page_num, img_idx)
-                 if img_data:
-                     page_data["images"].append(img_data)
-         except Exception as e:
-             self._log_error(f"Page {page_num} image processing failed: {str(e)}")
-
-         return page_data
-
-     def _process_image(self, img, page_num, img_idx):
-         """Advanced image processing with multiple fallbacks"""
-         try:
-             stream = img['stream']
-             width = self._get_dimension(stream, 'width')
-             height = self._get_dimension(stream, 'height')
-
-             if width <= 0 or height <= 0:
-                 raise ValueError("Invalid image dimensions")
-
-             try:
-                 return Image.frombytes("RGB", (width, height), stream.get_data())
-             except:
-                 return Image.frombytes("L", (width, height), stream.get_data()).convert("RGB")
-         except Exception as e:
-             self._log_error(f"Page {page_num} image {img_idx} failed: {str(e)}")
-             return None
-
-     def _get_dimension(self, stream, dimension):
-         """Safe dimension extraction with multiple fallbacks"""
-         try:
-             return int(stream[dimension])
-         except:
-             try:
-                 return int(stream['stream'][dimension])
-             except:
-                 try:
-                     return int(stream['data'][dimension])
-                 except:
-                     return 0

-     def generate_qa(self, model_provider, model_name, temperature):
-         """Multi-model generation engine"""
-         st.session_state.processing = {'stage': 'generating', 'errors': []}
-         qa_pairs = []
-
-         try:
-             client = self.SUPPORTED_MODELS[model_provider]["client"](
-                 st.session_state[model_provider.lower() + "_key"]
-             )
-
-             for page in st.session_state.doc_data:
-                 content = self._get_page_content(page)
-                 response = self._generate(client, model_name, content, temperature)
-                 qa_pairs.extend(self._parse_response(response))
-
-             st.session_state.qa_pairs = qa_pairs
-             return True
-         except Exception as e:
-             self._log_error(f"Generation failed: {str(e)}")
-             return False
-
-     def _generate(self, client, model, content, temp):
-         """Unified generation interface"""
-         if isinstance(client, groq.Groq):
-             return client.chat.completions.create(
-                 messages=[{"role": "user", "content": content}],
-                 model=model,
-                 temperature=temp,
-                 response_format={"type": "json_object"}
-             )
-         else:
-             return client.chat.completions.create(
-                 model=model,
-                 messages=[{"role": "user", "content": content}],
-                 temperature=temp,
-                 response_format={"type": "json_object"}
-             )
-
-     def _parse_response(self, response):
-         """Safe response parsing"""
-         try:
-             content = json.loads(response.choices[0].message.content)
-             return content.get('qa_pairs', [])
-         except Exception as e:
-             self._log_error(f"Response parsing failed: {str(e)}")
-             return []
-
-     def export_data(self, formats):
-         """Multi-format export system"""
-         exports = {}
-         df = pd.DataFrame(st.session_state.qa_pairs)

-         if 'JSON' in formats:
-             exports['synthetic_data.json'] = df.to_json(orient='records').encode()
-         if 'CSV' in formats:
-             exports['synthetic_data.csv'] = df.to_csv(index=False).encode()
-         if 'Parquet' in formats:
-             buffer = BytesIO()
-             df.to_parquet(buffer)
-             exports['synthetic_data.parquet'] = buffer.getvalue()

-         return exports
-
-     def _log_error(self, message):
-         """Centralized error logging"""
-         st.session_state.processing['errors'].append(message)
-         st.error(message)

-     def _get_page_content(self, page):
-         """Multimodal content extraction"""
-         text = page["text"]
-         if not text:
-             text = " ".join([pytesseract.image_to_string(img) for img in page["images"]])
-         return text
-
- def ui_setup():
-     """Enterprise-grade UI configuration"""
      st.set_page_config(
-         page_title="Synthetic Data Factory Pro",
          page_icon="🏭",
-         layout="wide",
-         initial_sidebar_state="expanded"
      )

-     with st.sidebar:
-         st.header("🔑 API Key Management")
-         for provider in ["Deepseek", "OpenAI", "Mistral-Groq"]:
-             st.text_input(
-                 f"{provider} API Key",
-                 type="password",
-                 key=f"{provider.lower()}_key"
-             )
-
-         st.header("🧠 AI Configuration")
-         provider = st.selectbox("Model Provider", ["Deepseek", "OpenAI", "Mistral-Groq"])
-         model = st.selectbox("Model", generator.SUPPORTED_MODELS[provider]["models"])
-         temp = st.slider("Temperature", 0.0, 1.0, 0.3)

-     return provider, model, temp
-
- def main():
-     """Main application flow"""
-     provider, model, temp = ui_setup()
-     generator = SyntheticDataGenerator()

-     st.title("🏭 Synthetic Data Factory Pro")
-     st.write("Enterprise-grade document processing with multi-modal AI")

-     uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])

-     if uploaded_file and st.button("Start Generation"):
-         if generator.process_pdf(uploaded_file):
-             if generator.generate_qa(provider, model, temp):
-                 st.success("Generation completed successfully!")
-
-                 with st.expander("📊 Results Preview"):
-                     st.dataframe(pd.DataFrame(st.session_state.qa_pairs))
-
-                 with st.expander("📦 Advanced Export"):
-                     formats = st.multiselect(
-                         "Select formats",
-                         ["JSON", "CSV", "Parquet"],
-                         default=["JSON", "CSV"]
-                     )
-                     exports = generator.export_data(formats)
-
-                     if st.download_button("Export Package",
-                                           data=json.dumps(exports),
-                                           file_name="synthetic_data.zip",
-                                           mime="application/zip"):
-                         st.success("Export package generated!")

  if __name__ == "__main__":
      main()
 
  import streamlit as st
  import pdfplumber
  import pytesseract
+ from PIL import Image
  import json
  import pandas as pd
  from io import BytesIO
  import time
+ from openai import OpenAI
  import groq

+ class SyntheticDataFactory:
+     PROVIDER_CONFIG = {
+         "Deepseek": {
+             "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
+             "models": ["deepseek-chat"],
+             "key_label": "Deepseek API Key"
+         },
+         "OpenAI": {
+             "client": lambda key: OpenAI(api_key=key),
+             "models": ["gpt-4-turbo"],
+             "key_label": "OpenAI API Key"
+         },
+         "Groq": {
+             "client": lambda key: groq.Groq(api_key=key),
+             "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
+             "key_label": "Groq API Key"
          }
+     }

+     def __init__(self):
+         self.init_session_state()
+
+     def init_session_state(self):
+         if 'qa_data' not in st.session_state:
+             st.session_state.qa_data = {
+                 'pairs': [],
+                 'metadata': {},
+                 'exports': {}
+             }
          if 'processing' not in st.session_state:
              st.session_state.processing = {
                  'stage': 'idle',
+                 'errors': []
              }

+     # Add remaining class methods from previous implementation
+     # (process_pdf, generate_qa, etc.)

+ def setup_sidebar():
+     """Configure sidebar with provider settings"""
+     with st.sidebar:
+         st.header("⚙️ AI Configuration")
+         provider = st.selectbox("Provider", list(SyntheticDataFactory.PROVIDER_CONFIG.keys()))
+         config = SyntheticDataFactory.PROVIDER_CONFIG[provider]

+         api_key = st.text_input(config["key_label"], type="password")
+         model = st.selectbox("Model", config["models"])
+         temp = st.slider("Temperature", 0.0, 1.0, 0.3)

+     return provider, api_key, model, temp

+ def main():
      st.set_page_config(
+         page_title="Enterprise Data Factory",
          page_icon="🏭",
+         layout="wide"
      )

+     # Initialize factory instance
+     factory = SyntheticDataFactory()

+     # Setup UI components
+     provider, api_key, model, temp = setup_sidebar()

+     st.title("🚀 Enterprise Synthetic Data Factory")

+     # File upload and processing logic
+     uploaded_file = st.file_uploader("Upload Financial PDF", type=["pdf"])

+     if uploaded_file and api_key:
+         if st.button("Start Synthetic Generation"):
+             # Process document and generate Q&A pairs
+             pass  # Add processing logic here

  if __name__ == "__main__":
      main()