awacke1 committed on
Commit
5fe90b3
·
verified ·
1 Parent(s): f367c7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +217 -183
app.py CHANGED
@@ -8,6 +8,7 @@ import io
8
  import base64
9
  from collections import defaultdict
10
  from PIL import Image
 
11
 
12
  # Document Generation Libs
13
  from docx import Document
@@ -20,17 +21,20 @@ from reportlab.lib.units import inch
20
  from reportlab.pdfbase import pdfmetrics
21
  from reportlab.pdfbase.ttfonts import TTFont
22
 
23
- # AI and Media Libs
24
- from openai import AzureOpenAI
25
  import fitz # PyMuPDF
26
 
27
  # --- Configuration & Setup ---
28
  CWD = Path.cwd()
29
  OUTPUT_DIR = CWD / "generated_outputs"
30
  PREVIEW_DIR = CWD / "previews"
 
31
  FONT_DIR = CWD
 
 
32
  OUTPUT_DIR.mkdir(exist_ok=True)
33
  PREVIEW_DIR.mkdir(exist_ok=True)
 
34
 
35
  LAYOUTS = {
36
  "A4 Portrait": {"size": A4},
@@ -39,40 +43,6 @@ LAYOUTS = {
39
  "Letter Landscape": {"size": landscape(letter)},
40
  }
41
 
42
- # 🧠 Initialize Azure OpenAI Client
43
- # NOTE: This requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY in your environment.
44
- try:
45
- client = AzureOpenAI(
46
- azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
47
- api_version="2024-05-01-preview",
48
- api_key=os.getenv("AZURE_OPENAI_API_KEY"),
49
- )
50
- AZURE_CLIENT_AVAILABLE = True
51
- except Exception as e:
52
- print("Warning: Azure OpenAI client could not be initialized. Text generation will use dummy data.")
53
- print(f"Error: {e}")
54
- client = None
55
- AZURE_CLIENT_AVAILABLE = False
56
-
57
- # 📖 Map UI model names to your actual Azure deployment names.
58
- # YOU MUST CHANGE THESE DEPLOYMENT NAMES to match your Azure setup.
59
- AZURE_DEPLOYMENT_NAMES = {
60
- # Chat / Vision Models
61
- "gpt-4o": "your-gpt-4o-deployment-name",
62
- "gpt-4.1": "your-gpt-4.1-deployment-name",
63
- "gpt-4.1-mini": "your-gpt-4.1-mini-deployment-name",
64
- "gpt-4o-mini": "your-gpt-4o-mini-deployment-name",
65
- "gpt-4o-realtime-preview": "your-gpt-4o-realtime-deployment-name",
66
- # Reasoning Models
67
- "o1-mini": "your-o1-mini-deployment-name",
68
- "o3-mini": "your-o3-mini-deployment-name",
69
- "o4-mini": "your-o4-mini-deployment-name",
70
- # Transcription Models
71
- "gpt-4o-transcribe": "your-gpt-4o-transcribe-deployment",
72
- "gpt-4o-mini-transcribe": "your-gpt-4o-mini-transcribe-deployment",
73
- }
74
-
75
-
76
  # --- ✍️ Document Generation Engines ---
77
 
78
  def create_pdf(md_content, font_name, emoji_font, pagesize, num_columns):
@@ -125,94 +95,61 @@ def markdown_to_story(markdown_text: str, font_name: str, emoji_font: str):
125
  """📜 Translates Markdown text into a sequence of ReportLab flowables for PDF rendering."""
126
  styles = getSampleStyleSheet()
127
  bold_font = f"{font_name}-Bold" if font_name != "Helvetica" else "Helvetica-Bold"
128
- style_normal = ParagraphStyle('BodyText', fontName=font_name, spaceAfter=6, fontSize=10)
129
- style_h1 = ParagraphStyle('h1', fontName=bold_font, spaceBefore=12, fontSize=24)
 
 
 
 
130
  story, first_heading = [], True
131
  for line in markdown_text.split('\n'):
132
- content, style = line, style_normal
133
- if line.startswith("# "):
 
 
 
 
 
134
  if not first_heading: story.append(PageBreak())
135
- content, style, first_heading = line.lstrip('# '), style_h1, False
136
- formatted_content = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', content)
 
 
 
 
 
 
 
 
 
 
137
  final_content = apply_emoji_font(formatted_content, emoji_font)
138
- story.append(Paragraph(final_content, style))
 
 
139
  return story
140
 
141
 
142
- # --- 🔮 Omni-Model Processing ---
143
-
144
- def process_text_input(prompt, model_deployment_name):
145
- """💬 Sends a text prompt to the Azure OpenAI model and gets a response."""
146
- if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is dummy text."
147
- completion = client.chat.completions.create(
148
- model=model_deployment_name,
149
- messages=[{"role": "user", "content": prompt}]
150
- )
151
- return completion.choices[0].message.content
152
-
153
- def process_image_input(image_file, prompt, model_deployment_name):
154
- """🖼️ Encodes an image and sends it with a prompt to the Azure OpenAI model."""
155
- if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy image description."
156
- with Image.open(image_file.name) as img:
157
- with io.BytesIO() as output:
158
- img.save(output, format="PNG")
159
- base64_image = base64.b64encode(output.getvalue()).decode("utf-8")
160
-
161
- response = client.chat.completions.create(
162
- model=model_deployment_name,
163
- messages=[{"role": "user", "content": [
164
- {"type": "text", "text": prompt},
165
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
166
- ]}]
167
- )
168
- return response.choices[0].message.content
169
-
170
- def process_audio_input(audio_file, prompt, chat_model_deployment, transcribe_model_deployment):
171
- """🎤 Transcribes audio and sends the text with a prompt to the Azure OpenAI model."""
172
- if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy audio summary."
173
- with open(audio_file.name, "rb") as f:
174
- transcription = client.audio.transcriptions.create(
175
- model=transcribe_model_deployment,
176
- file=f
177
- ).text
178
-
179
- full_prompt = f"{prompt}\n\nAudio Transcription:\n{transcription}"
180
- return process_text_input(full_prompt, chat_model_deployment)
181
 
182
- def process_pdf_input(pdf_file, prompt, model_deployment_name, progress):
183
- """📄 Performs OCR on a PDF by sending pages as images to the AI model."""
184
- if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy PDF summary."
185
-
186
- all_extracted_text = []
187
- doc = fitz.open(pdf_file.name)
188
-
189
- # Process pages in pairs
190
- for i in progress.tqdm(range(0, len(doc), 2), desc="Performing PDF OCR"):
191
- page_images = []
192
- messages = [{"type": "text", "text": prompt}]
193
-
194
- # Get first page of the pair
195
- page1 = doc.load_page(i)
196
- pix1 = page1.get_pixmap(dpi=150)
197
- img_bytes1 = pix1.tobytes("png")
198
- base64_image1 = base64.b64encode(img_bytes1).decode("utf-8")
199
- messages.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image1}"}})
200
-
201
- # Get second page if it exists
202
- if i + 1 < len(doc):
203
- page2 = doc.load_page(i + 1)
204
- pix2 = page2.get_pixmap(dpi=150)
205
- img_bytes2 = pix2.tobytes("png")
206
- base64_image2 = base64.b64encode(img_bytes2).decode("utf-8")
207
- messages.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image2}"}})
208
-
209
- response = client.chat.completions.create(
210
- model=model_deployment_name,
211
- messages=[{"role": "user", "content": messages}]
212
- )
213
- all_extracted_text.append(response.choices[0].message.content)
214
 
215
- return "\n\n".join(all_extracted_text)
 
 
 
 
 
 
 
 
 
216
 
217
 
218
  # --- 🛠️ Helpers & Main API ---
@@ -231,8 +168,7 @@ def register_local_fonts():
231
  emoji_font_name = font_name
232
  else:
233
  text_font_names.append(font_name)
234
- except Exception as e:
235
- print(f"Could not register font {font_path.name}: {e}")
236
  if not text_font_names: text_font_names.append('Helvetica')
237
  return sorted(text_font_names), emoji_font_name
238
 
@@ -247,53 +183,134 @@ def create_pdf_preview(pdf_path: Path):
247
  """🏞️ Generates a PNG thumbnail for the first page of a PDF."""
248
  preview_path = PREVIEW_DIR / f"{pdf_path.stem}.png"
249
  try:
250
- doc = fitz.open(pdf_path); page = doc.load_page(0); pix = page.get_pixmap()
251
  pix.save(str(preview_path)); doc.close()
252
- return str(preview_path)
253
  except: return None
254
 
255
- def generate_outputs_api(omni_files, omni_prompt, chat_model, transcribe_model, output_formats, layouts, fonts, num_columns, page_w_mult, page_h_mult, progress=gr.Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  """🚀 The main entry point that orchestrates the entire multi-modal generation process."""
257
  if not omni_prompt and not omni_files: raise gr.Error("Please provide a prompt or upload at least one file.")
258
  if not output_formats: raise gr.Error("Please select at least one output format.")
259
 
260
- chat_deployment = AZURE_DEPLOYMENT_NAMES.get(chat_model)
261
- transcribe_deployment = AZURE_DEPLOYMENT_NAMES.get(transcribe_model)
262
- if not chat_deployment: raise gr.Error(f"Deployment for model '{chat_model}' not found in configuration.")
263
-
264
  shutil.rmtree(OUTPUT_DIR, ignore_errors=True); shutil.rmtree(PREVIEW_DIR, ignore_errors=True)
265
  OUTPUT_DIR.mkdir(); PREVIEW_DIR.mkdir()
266
 
267
- # --- Step 1: Omni-Model Processing ---
268
  md_content = ""
269
- # Process files first
270
  if omni_files:
271
- # Check for multiple file types
272
- file_paths = [Path(f.name) for f in omni_files]
273
- extensions = {p.suffix.lower() for p in file_paths}
274
-
275
- if '.md' in extensions:
276
- md_content = "\n\n".join([p.read_text(encoding='utf-8') for p in file_paths if p.suffix.lower() == '.md'])
277
- elif '.pdf' in extensions:
278
- # For simplicity, we process only the first PDF if multiple are uploaded for OCR
279
- pdf_file = next((f for f in omni_files if Path(f.name).suffix.lower() == '.pdf'), None)
280
- ocr_prompt = omni_prompt if omni_prompt else "Extract all text from the following document pages."
281
- md_content = process_pdf_input(pdf_file, ocr_prompt, chat_deployment, progress)
282
- elif '.png' in extensions or '.jpg' in extensions or '.jpeg' in extensions:
283
- image_file = next((f for f in omni_files if Path(f.name).suffix.lower() in ['.png', '.jpg', '.jpeg']), None)
284
- md_content = process_image_input(image_file, omni_prompt, chat_deployment)
285
- elif '.wav' in extensions or '.mp3' in extensions or '.m4a' in extensions:
286
- if not transcribe_deployment: raise gr.Error(f"Deployment for model '{transcribe_model}' not found.")
287
- audio_file = next((f for f in omni_files if Path(f.name).suffix.lower() in ['.wav', '.mp3', '.m4a']), None)
288
- md_content = process_audio_input(audio_file, omni_prompt, chat_deployment, transcribe_deployment)
289
- # If no files, process text prompt
290
  elif omni_prompt:
291
- md_content = process_text_input(omni_prompt, chat_deployment)
292
 
293
- if not md_content: raise gr.Error("Failed to generate source content from the provided input.")
294
 
295
- # --- Step 2: Generate Selected Document Formats ---
296
- generated_files = []
297
  for format_choice in progress.tqdm(output_formats, desc="Generating Formats"):
298
  time_str = datetime.datetime.now().strftime('%m-%d-%a_%I%M%p').upper()
299
  if format_choice == "PDF":
@@ -302,50 +319,59 @@ def generate_outputs_api(omni_files, omni_prompt, chat_model, transcribe_model,
302
  pagesize = LAYOUTS[layout_name]["size"]
303
  final_pagesize = (pagesize[0] * page_w_mult, pagesize[1] * page_h_mult)
304
  pdf_buffer = create_pdf(md_content, font_name, EMOJI_FONT_NAME, final_pagesize, num_columns)
305
- filename = f"Document_{time_str}_{layout_name.replace(' ','-')}_{font_name}.pdf"
306
  output_path = OUTPUT_DIR / filename
307
  with open(output_path, "wb") as f: f.write(pdf_buffer.getvalue())
308
- generated_files.append(output_path)
309
  elif format_choice == "DOCX":
310
- docx_doc = create_docx(md_content)
311
- filename = f"Document_{time_str}.docx"
312
- output_path = OUTPUT_DIR / filename
313
- docx_doc.save(output_path); generated_files.append(output_path)
314
  elif format_choice == "XLSX":
315
- xlsx_book = create_xlsx(md_content)
316
- filename = f"Outline_{time_str}.xlsx"
317
- output_path = OUTPUT_DIR / filename
318
- xlsx_book.save(output_path); generated_files.append(output_path)
319
 
320
- gallery_previews = [create_pdf_preview(p) for p in generated_files if p.suffix == '.pdf']
321
- final_gallery = [g for g in gallery_previews if g]
322
 
323
- return md_content, final_gallery, [str(p) for p in generated_files]
324
 
325
  # --- 🎨 Gradio UI Definition ---
326
  AVAILABLE_FONTS, EMOJI_FONT_NAME = register_local_fonts()
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
  with gr.Blocks(theme=gr.themes.Soft(), title="Omni-Model Document Generator") as demo:
329
- gr.Markdown("# 🧠 Omni-Model Document Generator (PDF, DOCX, XLSX)")
330
- gr.Markdown("Provide a prompt, or upload a Markdown, PDF, Image, or Audio file. The AI will process it, and you can generate documents from the result.")
331
 
332
  with gr.Row():
333
  with gr.Column(scale=1):
334
- gr.Markdown("### ⚙️ Omni-Model Input")
335
-
336
- chat_models = ["gpt-4o", "gpt-4.1", "gpt-4.1-mini", "gpt-4o-mini", "o1-mini", "o3-mini", "o4-mini"]
337
- transcribe_models = ["gpt-4o-transcribe", "gpt-4o-mini-transcribe"]
338
-
339
- selected_chat_model = gr.Dropdown(choices=chat_models, label="Select Chat/Vision/Reasoning Model", value=chat_models[0])
340
- selected_transcribe_model = gr.Dropdown(choices=transcribe_models, label="Select Transcription Model (for audio)", value=transcribe_models[0])
341
-
342
- omni_prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Ask a question, or provide instructions for a file...")
343
- omni_files = gr.File(label="Upload File(s) (Optional)", file_count="multiple", file_types=["image", ".wav", ".mp3", ".md", ".pdf"])
 
 
344
 
345
  gr.Markdown("### 📄 Output Settings")
346
  output_formats = gr.CheckboxGroup(choices=["PDF", "DOCX", "XLSX"], label="Select Output Formats", value=["PDF"])
347
 
348
- with gr.Accordion("PDF Customization", open=True):
349
  num_columns_slider = gr.Slider(label="Text Columns", minimum=1, maximum=4, step=1, value=1)
350
  page_w_mult_slider = gr.Slider(label="Page Width Multiplier", minimum=1, maximum=5, step=1, value=1)
351
  page_h_mult_slider = gr.Slider(label="Page Height Multiplier", minimum=1, maximum=2, step=1, value=1)
@@ -357,13 +383,21 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Omni-Model Document Generator") as
357
  with gr.Column(scale=2):
358
  gr.Markdown("### 🤖 AI Response (Source for Documents)")
359
  ai_response_output = gr.Markdown(label="AI Generated Content")
360
- gr.Markdown("### 🖼️ Final Documents")
361
- gallery_output = gr.Gallery(label="PDF Previews", show_label=False, elem_id="gallery", columns=3, height="auto", object_fit="contain")
362
- downloadable_files_output = gr.Files(label="Download Generated Files")
363
-
364
- generate_btn.click(fn=generate_outputs_api,
365
- inputs=[omni_files, omni_prompt, selected_chat_model, selected_transcribe_model, output_formats, selected_layouts, selected_fonts, num_columns_slider, page_w_mult_slider, page_h_mult_slider],
366
- outputs=[ai_response_output, gallery_output, downloadable_files_output])
 
 
 
 
 
 
 
 
367
 
368
  if __name__ == "__main__":
369
- demo.launch()
 
8
  import base64
9
  from collections import defaultdict
10
  from PIL import Image
11
+ import json
12
 
13
  # Document Generation Libs
14
  from docx import Document
 
21
  from reportlab.pdfbase import pdfmetrics
22
  from reportlab.pdfbase.ttfonts import TTFont
23
 
24
+ # Media Libs
 
25
  import fitz # PyMuPDF
26
 
27
  # --- Configuration & Setup ---
28
  CWD = Path.cwd()
29
  OUTPUT_DIR = CWD / "generated_outputs"
30
  PREVIEW_DIR = CWD / "previews"
31
+ UPLOAD_DIR = CWD / "uploads"
32
  FONT_DIR = CWD
33
+
34
+ # Create necessary directories
35
  OUTPUT_DIR.mkdir(exist_ok=True)
36
  PREVIEW_DIR.mkdir(exist_ok=True)
37
+ UPLOAD_DIR.mkdir(exist_ok=True)
38
 
39
  LAYOUTS = {
40
  "A4 Portrait": {"size": A4},
 
43
  "Letter Landscape": {"size": landscape(letter)},
44
  }
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # --- ✍️ Document Generation Engines ---
47
 
48
  def create_pdf(md_content, font_name, emoji_font, pagesize, num_columns):
 
95
  """📜 Translates Markdown text into a sequence of ReportLab flowables for PDF rendering."""
96
  styles = getSampleStyleSheet()
97
  bold_font = f"{font_name}-Bold" if font_name != "Helvetica" else "Helvetica-Bold"
98
+ style_normal = ParagraphStyle('BodyText', fontName=font_name, spaceAfter=6, fontSize=10, leading=14)
99
+ style_h1 = ParagraphStyle('h1', fontName=bold_font, spaceBefore=12, fontSize=24, textColor=colors.HexColor("#1E3A8A"))
100
+ style_h2 = ParagraphStyle('h2', fontName=bold_font, spaceBefore=10, fontSize=18, textColor=colors.HexColor("#374151"))
101
+ style_h3 = ParagraphStyle('h3', fontName=bold_font, spaceBefore=8, fontSize=14, textColor=colors.HexColor("#4B5563"))
102
+ style_code = ParagraphStyle('Code', fontName='Courier', backColor=colors.whitesmoke, textColor=colors.darkred, borderWidth=1, borderColor=colors.lightgrey, padding=8)
103
+
104
  story, first_heading = [], True
105
  for line in markdown_text.split('\n'):
106
+ stripped_line = line.strip()
107
+ if not stripped_line:
108
+ story.append(Spacer(1, 0.1 * inch)); continue
109
+
110
+ # Determine the structural element and its style
111
+ content, style, extra_args = stripped_line, style_normal, {}
112
+ if stripped_line.startswith("# "):
113
  if not first_heading: story.append(PageBreak())
114
+ content, style, first_heading = stripped_line.lstrip('# '), style_h1, False
115
+ elif stripped_line.startswith("## "):
116
+ content, style = stripped_line.lstrip('## '), style_h2
117
+ elif stripped_line.startswith("### "):
118
+ content, style = stripped_line.lstrip('### '), style_h3
119
+ elif stripped_line.startswith(("- ", "* ")):
120
+ content, extra_args = stripped_line[2:], {'bulletText': '•'}
121
+
122
+ # Now, format the content string correctly for ReportLab
123
+ # Apply bold/italic first
124
+ formatted_content = re.sub(r'_(.*?)_', r'<i>\1</i>', re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', content))
125
+ # Then, apply the emoji font tags. This order is crucial.
126
  final_content = apply_emoji_font(formatted_content, emoji_font)
127
+
128
+ story.append(Paragraph(final_content, style, **extra_args))
129
+
130
  return story
131
 
132
 
133
+ # --- 🔮 Virtual AI Omni-Model Functions ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
def process_text_input(prompt):
    """💬 Simulates an AI response to a text prompt.

    Args:
        prompt: The user's text; it is echoed back inside a Markdown blockquote.

    Returns:
        A Markdown string containing a heading, the quoted prompt, and a
        fixed block of simulated content.
    """
    # Quote every line of the prompt, not just the first one, so a multi-line
    # prompt stays inside the blockquote and lines such as "# foo" are not
    # interpreted as headings by downstream Markdown consumers.
    quoted_prompt = "\n".join(f"> {line}" for line in str(prompt).splitlines()) or "> "
    return (
        "# Virtual AI Response\n\n"
        f"**Your Prompt:**\n{quoted_prompt}\n\n"
        "**Generated Content:**\n"
        "- This is a simulated response for your text input.\n"
        "- Here's an emoji: 😊"
    )
138
+
139
def process_image_input(image_path, prompt):
    """🖼️ Simulates an AI description of an image.

    Args:
        image_path: Path (or path string) of the uploaded image; only its
            file name and suffix are used, the file is never opened.
        prompt: The user's instruction; echoed inside a Markdown blockquote.

    Returns:
        A Markdown string with a heading naming the file, the quoted prompt,
        and a fixed two-item simulated analysis.
    """
    image = Path(image_path)
    # Quote every prompt line so multi-line prompts remain inside the
    # blockquote instead of leaking out as headings or list items.
    quoted_prompt = "\n".join(f"> {line}" for line in str(prompt).splitlines()) or "> "
    return (
        f"# Virtual AI Image Analysis: {image.name}\n\n"
        f"**Your Prompt:**\n{quoted_prompt}\n\n"
        "**Generated Content:**\n"
        "1. Simulated analysis of the uploaded image.\n"
        f"2. File type appears to be `{image.suffix}`."
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
def process_audio_input(audio_path, prompt):
    """🎤 Simulates AI transcription and summarization of an audio file.

    Args:
        audio_path: Path (or path string) of the uploaded audio; only its
            file name is used, the file is never opened.
        prompt: The user's instruction; echoed inside a Markdown blockquote.

    Returns:
        A Markdown string with a heading naming the file, the quoted prompt,
        a fixed simulated transcription, and a fixed summary.
    """
    # Quote every prompt line so multi-line prompts remain inside the
    # blockquote instead of leaking out as headings or list items.
    quoted_prompt = "\n".join(f"> {line}" for line in str(prompt).splitlines()) or "> "
    return (
        f"# Virtual AI Audio Summary: {Path(audio_path).name}\n\n"
        f"**Your Prompt:**\n{quoted_prompt}\n\n"
        "**Simulated Transcription:**\n"
        "> \"This is a test of the emergency broadcast system.\"\n\n"
        "**Generated Summary:**\n"
        "The audio is a test broadcast."
    )
146
+
147
def process_pdf_input(pdf_path, prompt, progress):
    """📄 Simulates AI-powered OCR of a PDF document.

    Args:
        pdf_path: Path (or path string) of the uploaded PDF; only its file
            name is used, the file is never opened.
        prompt: The user's instruction; echoed inside a Markdown blockquote.
        progress: Callable invoked as ``progress(fraction, desc=...)`` to
            report progress; it is called once at 0.5 and once at 1.0.

    Returns:
        A Markdown string with a heading naming the file, the quoted prompt,
        and two fixed lines of simulated page text.
    """
    progress(0.5, desc="Simulating PDF page processing...")
    # Quote every prompt line so multi-line prompts remain inside the
    # blockquote instead of leaking out as headings or list items.
    quoted_prompt = "\n".join(f"> {line}" for line in str(prompt).splitlines()) or "> "
    ocr_text = (
        f"# Virtual AI OCR of: {Path(pdf_path).name}\n\n"
        f"**Your Prompt:**\n{quoted_prompt}\n\n"
        "**Extracted Content (Simulated):**\n"
        "- **Page 1:** Simulated text from the first page.\n"
        "- **Page 2:** Simulated text from the second page."
    )
    progress(1.0, desc="PDF OCR Simulation Complete!")
    return ocr_text
153
 
154
 
155
  # --- 🛠️ Helpers & Main API ---
 
168
  emoji_font_name = font_name
169
  else:
170
  text_font_names.append(font_name)
171
+ except: pass
 
172
  if not text_font_names: text_font_names.append('Helvetica')
173
  return sorted(text_font_names), emoji_font_name
174
 
 
183
  """🏞️ Generates a PNG thumbnail for the first page of a PDF."""
184
  preview_path = PREVIEW_DIR / f"{pdf_path.stem}.png"
185
  try:
186
+ doc = fitz.open(pdf_path); page = doc.load_page(0); pix = page.get_pixmap(dpi=96)
187
  pix.save(str(preview_path)); doc.close()
188
+ return preview_path
189
  except: return None
190
 
191
def build_file_explorer_html(generated_files, pdf_files_for_gallery):
    """🗂️ Constructs the HTML/JS for the file explorer and PDF gallery.

    Args:
        generated_files: Iterable of ``pathlib.Path`` objects, one per
            generated document; each becomes a download link.
        pdf_files_for_gallery: Iterable of ``pathlib.Path`` objects for the
            PDFs that should get a thumbnail. A preview is requested from
            ``create_pdf_preview`` for each; PDFs without a preview are
            skipped.

    Returns:
        A single HTML string containing a ``<style>`` block, a tab bar, the
        file-explorer tab, the gallery tab (empty when there are no
        previews), and a ``<script>`` block with the tab/thumbnail handlers.
    """
    # Local imports keep this block self-contained without touching the
    # file-level import section.
    from html import escape
    from urllib.parse import quote

    file_icons = {".pdf": "📄", ".docx": "📝", ".xlsx": "📊"}

    # --- File explorer links -------------------------------------------
    link_parts = []
    for file_path in generated_files:
        icon = file_icons.get(file_path.suffix, '📎')
        # Escape everything that lands in markup: names may contain spaces,
        # ampersands, quotes or angle brackets.
        safe_name = escape(file_path.name, quote=True)
        safe_href = escape("/file=" + quote(str(file_path), safe="/:"), quote=True)
        link_parts.append(f"""
        <a href="{safe_href}" class="file-link" download="{safe_name}">
            <span class="file-icon">{icon}</span>
            <span class="file-name">{safe_name}</span>
        </a>
        """)
    file_explorer_html = "".join(link_parts)

    # --- Gallery items (inline base64 previews) ------------------------
    gallery_items = []
    for pdf_path in pdf_files_for_gallery:
        preview_path = create_pdf_preview(pdf_path)
        if not preview_path:
            continue
        with open(preview_path, "rb") as f:
            img_base64 = base64.b64encode(f.read()).decode("utf-8")
        gallery_items.append({
            "preview_src": f"data:image/png;base64,{img_base64}",
            "filename": escape(pdf_path.name, quote=True),
        })

    gallery_html = ""
    gallery_tab_button = ""
    if gallery_items:
        thumbs = []
        for index, item in enumerate(gallery_items):
            # The first thumbnail starts out selected; marking it here avoids
            # depending on script code running at render time.
            css_class = "thumbnail active" if index == 0 else "thumbnail"
            # The handler reads the image source and file name from the
            # element itself, so the (large) data URI is embedded only once.
            thumbs.append(
                f'<img src="{item["preview_src"]}" class="{css_class}" '
                f'data-filename="{item["filename"]}" onclick="selectThumbnail(this)">'
            )
        thumbs_html = "".join(thumbs)

        first = gallery_items[0]
        gallery_html = f"""
        <div class="gallery-container">
            <div class="main-view">
                <img id="main-image" src="{first['preview_src']}" class="main-image">
                <p id="main-filename">{first['filename']}</p>
            </div>
            <div class="thumbnail-strip">{thumbs_html}</div>
        </div>
        """
        # Built as a plain string: a backslash inside an f-string expression
        # is a SyntaxError before Python 3.12.
        gallery_tab_button = (
            '<button class="tab-button" onclick="openTab(event, \'gallery\')">'
            '🖼️ PDF Gallery</button>'
        )

    # Static CSS/JS are ordinary strings so their braces need no doubling.
    styles = """
    <style>
        .tabs { display: flex; border-bottom: 2px solid #ccc; }
        .tab-button { padding: 10px 15px; cursor: pointer; background: #f1f1f1; border: none; border-bottom: 2px solid transparent; outline: none; }
        .tab-button.active { background: #fff; border-top: 2px solid #007bff; border-left: 2px solid #ccc; border-right: 2px solid #ccc; border-bottom: 2px solid #fff; }
        .tab-content { display: none; padding: 15px; border: 1px solid #ccc; border-top: none; }
        .tab-content.active { display: block; }
        .file-explorer { display: grid; grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); gap: 10px; }
        .file-link { display: flex; align-items: center; padding: 10px; background: #f9f9f9; border-radius: 5px; text-decoration: none; color: #333; }
        .file-link:hover { background: #e9e9e9; }
        .file-icon { font-size: 2.5em; margin-right: 10px; }
        .gallery-container { display: flex; height: 500px; }
        .main-view { flex: 3; padding: 10px; display: flex; flex-direction: column; align-items: center; justify-content: center; }
        .main-image { max-width: 100%; max-height: 90%; }
        .thumbnail-strip { flex: 1; overflow-y: auto; padding: 5px; }
        .thumbnail { width: 100%; margin-bottom: 5px; cursor: pointer; border: 2px solid transparent; }
        .thumbnail.active { border-color: #007bff; }
    </style>
    """

    body = f"""
    <div class="tabs">
        <button class="tab-button active" onclick="openTab(event, 'explorer')">🗂️ File Explorer</button>
        {gallery_tab_button}
    </div>
    <div id="explorer" class="tab-content active">
        <div class="file-explorer">{file_explorer_html}</div>
    </div>
    <div id="gallery" class="tab-content">
        {gallery_html}
    </div>
    """

    # Handlers look elements up at call time and declare no top-level
    # `const`s, which would clash if this snippet is evaluated more than once
    # on the same page (e.g. after a second generation run).
    script = """
    <script>
        function openTab(evt, tabName) {
            var i, tabcontent, tablinks;
            tabcontent = document.getElementsByClassName("tab-content");
            for (i = 0; i < tabcontent.length; i++) { tabcontent[i].style.display = "none"; }
            tablinks = document.getElementsByClassName("tab-button");
            for (i = 0; i < tablinks.length; i++) { tablinks[i].className = tablinks[i].className.replace(" active", ""); }
            document.getElementById(tabName).style.display = "block";
            evt.currentTarget.className += " active";
        }

        function selectThumbnail(selectedThumb) {
            document.getElementById('main-image').src = selectedThumb.src;
            document.getElementById('main-filename').textContent = selectedThumb.dataset.filename;
            document.querySelectorAll('.thumbnail').forEach(function (t) { t.classList.remove('active'); });
            selectedThumb.classList.add('active');
        }
    </script>
    """
    return styles + body + script
284
+
285
+ def generate_outputs_api(omni_files, omni_prompt, output_formats, layouts, fonts, num_columns, page_w_mult, page_h_mult, progress=gr.Progress(track_tqdm=True)):
286
  """🚀 The main entry point that orchestrates the entire multi-modal generation process."""
287
  if not omni_prompt and not omni_files: raise gr.Error("Please provide a prompt or upload at least one file.")
288
  if not output_formats: raise gr.Error("Please select at least one output format.")
289
 
 
 
 
 
290
  shutil.rmtree(OUTPUT_DIR, ignore_errors=True); shutil.rmtree(PREVIEW_DIR, ignore_errors=True)
291
  OUTPUT_DIR.mkdir(); PREVIEW_DIR.mkdir()
292
 
 
293
  md_content = ""
 
294
  if omni_files:
295
+ temp_paths = []
296
+ for f in omni_files:
297
+ temp_path = UPLOAD_DIR / Path(f.name).name
298
+ shutil.copyfile(f.name, temp_path)
299
+ temp_paths.append(temp_path)
300
+
301
+ file_path = temp_paths[0]
302
+ file_ext = file_path.suffix.lower()
303
+
304
+ if file_ext == '.md': md_content = "\n\n".join([p.read_text(encoding='utf-8') for p in temp_paths if p.suffix.lower() == '.md'])
305
+ elif file_ext == '.pdf': md_content = process_pdf_input(file_path, omni_prompt or "Extract text", progress)
306
+ elif file_ext in ['.png', '.jpg', '.jpeg']: md_content = process_image_input(file_path, omni_prompt or "Describe image")
307
+ elif file_ext in ['.wav', '.mp3']: md_content = process_audio_input(file_path, omni_prompt or "Summarize transcription")
 
 
 
 
 
 
308
  elif omni_prompt:
309
+ md_content = process_text_input(omni_prompt)
310
 
311
+ if not md_content: raise gr.Error("Failed to generate source content.")
312
 
313
+ generated_files, pdf_files_for_gallery = [], []
 
314
  for format_choice in progress.tqdm(output_formats, desc="Generating Formats"):
315
  time_str = datetime.datetime.now().strftime('%m-%d-%a_%I%M%p').upper()
316
  if format_choice == "PDF":
 
319
  pagesize = LAYOUTS[layout_name]["size"]
320
  final_pagesize = (pagesize[0] * page_w_mult, pagesize[1] * page_h_mult)
321
  pdf_buffer = create_pdf(md_content, font_name, EMOJI_FONT_NAME, final_pagesize, num_columns)
322
+ filename = f"Document_{time_str}.pdf"
323
  output_path = OUTPUT_DIR / filename
324
  with open(output_path, "wb") as f: f.write(pdf_buffer.getvalue())
325
+ generated_files.append(output_path); pdf_files_for_gallery.append(output_path)
326
  elif format_choice == "DOCX":
327
+ doc = create_docx(md_content); filename = f"Document_{time_str}.docx"
328
+ output_path = OUTPUT_DIR / filename; doc.save(output_path); generated_files.append(output_path)
 
 
329
  elif format_choice == "XLSX":
330
+ book = create_xlsx(md_content); filename = f"Outline_{time_str}.xlsx"
331
+ output_path = OUTPUT_DIR / filename; book.save(output_path); generated_files.append(output_path)
 
 
332
 
333
+ final_html_output = build_file_explorer_html(generated_files, pdf_files_for_gallery)
 
334
 
335
+ return md_content, final_html_output
336
 
337
  # --- 🎨 Gradio UI Definition ---
338
  AVAILABLE_FONTS, EMOJI_FONT_NAME = register_local_fonts()
339
+ SAMPLE_MARKDOWN = """# Deities Guide: Mythology and Moral Lessons
340
+
341
+ 1. 📜 **Introduction**
342
+ - **Purpose**: Explore deities, spirits, saints, and beings with their epic stories and morals!
343
+ - **Usage**: A guide for learning and storytelling across traditions. ️
344
+ - **Themes**: Justice ⚖️, faith 🙏, hubris 🏛️, redemption ✨, cosmic order 🌌.
345
+
346
+ # ⚔️ Arthurian Legends
347
+ - **Merlin, Morgan le Fay, Arthur**: Mentor 🧙, rival 🧙‍♀️, son 👑.
348
+ - **Relation**: Family tests loyalty 🤝.
349
+ - **Lesson**: Honor 🎖️ vs. betrayal 🗡️.
350
+ """
351
 
352
  with gr.Blocks(theme=gr.themes.Soft(), title="Omni-Model Document Generator") as demo:
353
+ gr.Markdown("# 🧠 Omni-Model Document Generator")
354
+ gr.Markdown("Provide a prompt, or upload a file (MD, PDF, Image, Audio). A virtual AI will process it, and you can generate documents from the result.")
355
 
356
  with gr.Row():
357
  with gr.Column(scale=1):
358
+ with gr.Tabs():
359
+ with gr.TabItem("💬 Text"):
360
+ text_prompt = gr.Textbox(label="Prompt", lines=5, placeholder="Ask a question or provide instructions...")
361
+ with gr.TabItem("🖼️ Image"):
362
+ image_prompt = gr.Textbox(label="Image Prompt", lines=2, placeholder="e.g., Describe this picture")
363
+ image_file = gr.File(label="Upload Image", file_types=["image"])
364
+ with gr.TabItem("🎤 Audio"):
365
+ audio_prompt = gr.Textbox(label="Audio Prompt", lines=2, placeholder="e.g., Summarize this audio")
366
+ audio_file = gr.File(label="Upload Audio", file_types=[".wav", ".mp3"])
367
+ with gr.TabItem("📄 Document"):
368
+ doc_prompt = gr.Textbox(label="Document Prompt", lines=2, placeholder="e.g., Extract text from this PDF")
369
+ doc_file = gr.File(label="Upload MD or PDF", file_types=[".md", ".pdf"])
370
 
371
  gr.Markdown("### 📄 Output Settings")
372
  output_formats = gr.CheckboxGroup(choices=["PDF", "DOCX", "XLSX"], label="Select Output Formats", value=["PDF"])
373
 
374
+ with gr.Accordion("PDF Customization", open=False):
375
  num_columns_slider = gr.Slider(label="Text Columns", minimum=1, maximum=4, step=1, value=1)
376
  page_w_mult_slider = gr.Slider(label="Page Width Multiplier", minimum=1, maximum=5, step=1, value=1)
377
  page_h_mult_slider = gr.Slider(label="Page Height Multiplier", minimum=1, maximum=2, step=1, value=1)
 
383
  with gr.Column(scale=2):
384
  gr.Markdown("### 🤖 AI Response (Source for Documents)")
385
  ai_response_output = gr.Markdown(label="AI Generated Content")
386
+ gr.Markdown("### 🗂️ Generated Files")
387
+ file_explorer_output = gr.HTML(label="File Explorer & Gallery")
388
+
389
def master_process(p1, p2, p3, p4, f1, f2, f3, f4, *args):
    """Route the active tab's input to ``generate_outputs_api``.

    Args:
        p1, p2, p3, p4: Prompts from the Text, Image, Audio and Document tabs.
        f1, f2, f3, f4: File slots for the same tabs. The click handler in
            this file wires the Text tab's textbox into ``f1`` (the Text tab
            has no upload widget), so ``f1`` normally arrives as a string.
        *args: Output settings forwarded unchanged to ``generate_outputs_api``.

    Returns:
        Whatever ``generate_outputs_api`` returns for the selected input.

    Raises:
        gr.Error: If no tab holds any input.
    """
    # A string in the f1 slot is the typed prompt, not an upload. Sending it
    # on as a file would fail, because generate_outputs_api reads `.name`
    # from every uploaded file. Only treat f1 as a file when it is one.
    if f1 and not isinstance(f1, str):
        return generate_outputs_api([f1], p1 or "Describe this text", *args)

    # Uploaded files take precedence, in tab order.
    if f2:
        return generate_outputs_api([f2], p2 or "Describe this image", *args)
    if f3:
        return generate_outputs_api([f3], p3 or "Summarize this audio", *args)
    if f4:
        return generate_outputs_api([f4], p4 or "Process this document", *args)

    # Plain text prompt: accept it from either slot the textbox is wired to.
    text = p1 or (f1 if isinstance(f1, str) else None)
    if text:
        return generate_outputs_api(None, text, *args)

    raise gr.Error("Please provide an input in one of the tabs.")
397
+
398
+ generate_btn.click(fn=master_process,
399
+ inputs=[text_prompt, image_prompt, audio_prompt, doc_prompt, text_prompt, image_file, audio_file, doc_file, output_formats, selected_layouts, selected_fonts, num_columns_slider, page_w_mult_slider, page_h_mult_slider],
400
+ outputs=[ai_response_output, file_explorer_output])
401
 
402
  if __name__ == "__main__":
403
+ demo.launch(share=True)