awacke1 committed
Commit cbf77e5 · verified · 1 Parent(s): e13b6e9

Update app.py

Files changed (1)
  1. app.py +229 -107
app.py CHANGED
@@ -4,99 +4,99 @@ import datetime
  import re
  import os
  import shutil
- import fitz # PyMuPDF
- from PIL import Image
- from collections import defaultdict
  import io
- from pypdf import PdfWriter
+ import base64
+ from collections import defaultdict
+ from PIL import Image

- # Imports for new formats
+ # Document Generation Libs
  from docx import Document
- from docx.shared import Inches
  import openpyxl
-
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, BaseDocTemplate, Frame, PageTemplate, Image as ReportLabImage
+ from pypdf import PdfWriter
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, BaseDocTemplate, Frame, PageTemplate
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
- from reportlab.lib.pagesizes import letter, A4, legal, landscape
+ from reportlab.lib.pagesizes import letter, A4, landscape
  from reportlab.lib.units import inch
- from reportlab.lib import colors
  from reportlab.pdfbase import pdfmetrics
  from reportlab.pdfbase.ttfonts import TTFont

+ # AI and Media Libs
+ from openai import AzureOpenAI
+ import fitz # PyMuPDF
+
  # --- Configuration & Setup ---
  CWD = Path.cwd()
- LAYOUTS = {
-     "A4 Portrait": {"size": A4},
-     "A4 Landscape": {"size": landscape(A4)},
-     "Letter Portrait": {"size": letter},
-     "Letter Landscape": {"size": landscape(letter)},
- }
  OUTPUT_DIR = CWD / "generated_outputs"
  PREVIEW_DIR = CWD / "previews"
  FONT_DIR = CWD
-
- # Create necessary directories
  OUTPUT_DIR.mkdir(exist_ok=True)
  PREVIEW_DIR.mkdir(exist_ok=True)

+ LAYOUTS = {
+     "A4 Portrait": {"size": A4},
+     "A4 Landscape": {"size": landscape(A4)},
+     "Letter Portrait": {"size": letter},
+     "Letter Landscape": {"size": landscape(letter)},
+ }

- # --- Font & Emoji Handling (for PDF) ---
-
- def register_local_fonts():
-     """Finds and registers all .ttf files from the application's base directory."""
-     text_font_names, emoji_font_name = [], None
-     font_files = list(FONT_DIR.glob("*.ttf"))
-     print(f"Found {len(font_files)} .ttf files: {[f.name for f in font_files]}")
-
-     for font_path in font_files:
-         try:
-             font_name = font_path.stem
-             # Register the regular font
-             pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
-             # Also register a bold version, pointing to the same file. ReportLab's <b> tag will work.
-             pdfmetrics.registerFont(TTFont(f"{font_name}-Bold", str(font_path)))
-             pdfmetrics.registerFontFamily(font_name, normal=font_name, bold=f"{font_name}-Bold")
-
-             if "notocoloremoji-regular" in font_name.lower():
-                 emoji_font_name = font_name
-             elif "notoemoji" not in font_name.lower():
-                 text_font_names.append(font_name)
-         except Exception as e:
-             print(f"Could not register font {font_path.name}: {e}")
-     if not text_font_names: text_font_names.append('Helvetica')
-     return sorted(text_font_names), emoji_font_name
+ # 🧠 Initialize Azure OpenAI Client
+ # NOTE: This requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY in your environment.
+ try:
+     client = AzureOpenAI(
+         azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+         api_version="2024-05-01-preview",
+         api_key=os.getenv("AZURE_OPENAI_API_KEY"),
+     )
+     AZURE_CLIENT_AVAILABLE = True
+ except Exception as e:
+     print("Warning: Azure OpenAI client could not be initialized. Text generation will use dummy data.")
+     print(f"Error: {e}")
+     client = None
+     AZURE_CLIENT_AVAILABLE = False

- def apply_emoji_font(text: str, emoji_font_name: str) -> str:
-     """Wraps emoji characters in a <font> tag for ReportLab."""
-     if not emoji_font_name: return text
-     emoji_pattern = re.compile(f"([{re.escape(''.join(map(chr, range(0x1f600, 0x1f650))))}"
-                                f"{re.escape(''.join(map(chr, range(0x1f300, 0x1f5ff))))}]+)")
-     return emoji_pattern.sub(fr'<font name="{emoji_font_name}">\1</font>', text)
+ # 📖 Map UI model names to your actual Azure deployment names.
+ # YOU MUST CHANGE THESE DEPLOYMENT NAMES to match your Azure setup.
+ AZURE_DEPLOYMENT_NAMES = {
+     # Chat / Vision Models
+     "gpt-4o": "your-gpt-4o-deployment-name",
+     "gpt-4.1": "your-gpt-4.1-deployment-name",
+     "gpt-4.1-mini": "your-gpt-4.1-mini-deployment-name",
+     "gpt-4o-mini": "your-gpt-4o-mini-deployment-name",
+     "gpt-4o-realtime-preview": "your-gpt-4o-realtime-deployment-name",
+     # Reasoning Models
+     "o1-mini": "your-o1-mini-deployment-name",
+     "o3-mini": "your-o3-mini-deployment-name",
+     "o4-mini": "your-o4-mini-deployment-name",
+     # Transcription Models
+     "gpt-4o-transcribe": "your-gpt-4o-transcribe-deployment",
+     "gpt-4o-mini-transcribe": "your-gpt-4o-mini-transcribe-deployment",
+ }


- # --- Document Generation Engines ---
+ # --- ✍️ Document Generation Engines ---

  def create_pdf(md_content, font_name, emoji_font, pagesize, num_columns):
-     """Generates a PDF file from markdown content."""
-     md_buffer = io.BytesIO()
+     """📄 Builds a beautiful PDF from a Markdown story using ReportLab."""
+     pdf_buffer = io.BytesIO()
      story = markdown_to_story(md_content, font_name, emoji_font)
      if num_columns > 1:
-         doc = BaseDocTemplate(md_buffer, pagesize=pagesize, leftMargin=0.5*inch, rightMargin=0.5*inch)
-         frame_width = (doc.width / num_columns) - (num_columns - 1) * 0.1*inch
-         frames = [Frame(doc.leftMargin + i * (frame_width + 0.2*inch), doc.bottomMargin, frame_width, doc.height) for i in range(num_columns)]
+         doc = BaseDocTemplate(pdf_buffer, pagesize=pagesize, leftMargin=0.5 * inch, rightMargin=0.5 * inch)
+         frame_width = (doc.width / num_columns) - (num_columns - 1) * 0.1 * inch
+         frames = [Frame(doc.leftMargin + i * (frame_width + 0.2 * inch), doc.bottomMargin, frame_width, doc.height) for i in range(num_columns)]
          doc.addPageTemplates([PageTemplate(id='MultiCol', frames=frames)])
      else:
-         doc = SimpleDocTemplate(md_buffer, pagesize=pagesize)
+         doc = SimpleDocTemplate(pdf_buffer, pagesize=pagesize)
      doc.build(story)
-     return md_buffer
+     pdf_buffer.seek(0)
+     return pdf_buffer

  def create_docx(md_content):
-     """Generates a DOCX file from markdown content."""
+     """📝 Crafts a DOCX document, translating Markdown to Word elements."""
      document = Document()
      for line in md_content.split('\n'):
          if line.startswith('# '): document.add_heading(line[2:], level=1)
          elif line.startswith('## '): document.add_heading(line[3:], level=2)
-         elif line.strip().startswith(('- ','* ')): document.add_paragraph(line.strip()[2:], style='List Bullet')
+         elif line.strip().startswith(('- ', '* ')): document.add_paragraph(line.strip()[2:], style='List Bullet')
          else:
              p = document.add_paragraph()
              parts = re.split(r'(\*\*.*?\*\*)', line)
@@ -106,10 +106,10 @@ def create_docx(md_content):
      return document

  def create_xlsx(md_content):
-     """Generates an XLSX file, splitting content by H1 headers into columns."""
+     """📊 Organizes a Markdown outline into columns in an XLSX file."""
      workbook = openpyxl.Workbook(); sheet = workbook.active
      sections = re.split(r'\n# ', '\n' + md_content)
-     if sections[0] == '': sections.pop(0)
+     if sections and sections[0] == '': sections.pop(0)
      column_data = []
      for section in sections:
          lines = section.split('\n'); header = lines[0]
@@ -122,38 +122,129 @@ def create_xlsx(md_content):
      return workbook

  def markdown_to_story(markdown_text: str, font_name: str, emoji_font: str):
-     """Converts markdown to a ReportLab story for PDF generation with enhanced styling."""
+     """📜 Translates Markdown text into a sequence of ReportLab flowables for PDF rendering."""
      styles = getSampleStyleSheet()
-     # Use the bold variant of the selected font for headers
      bold_font = f"{font_name}-Bold" if font_name != "Helvetica" else "Helvetica-Bold"
-
-     # Create styles with dynamic font sizes and bolding for headers
      style_normal = ParagraphStyle('BodyText', fontName=font_name, spaceAfter=6, fontSize=10)
-     style_h1 = ParagraphStyle('h1', fontName=bold_font, spaceBefore=12, fontSize=24, leading=28)
-     style_h2 = ParagraphStyle('h2', fontName=bold_font, spaceBefore=10, fontSize=18, leading=22)
-     style_h3 = ParagraphStyle('h3', fontName=bold_font, spaceBefore=8, fontSize=14, leading=18)
-
+     style_h1 = ParagraphStyle('h1', fontName=bold_font, spaceBefore=12, fontSize=24)
      story, first_heading = [], True
      for line in markdown_text.split('\n'):
          content, style = line, style_normal
-
-         # Determine the style based on markdown heading level
          if line.startswith("# "):
              if not first_heading: story.append(PageBreak())
              content, style, first_heading = line.lstrip('# '), style_h1, False
-         elif line.startswith("## "):
-             content, style = line.lstrip('## '), style_h2
-         elif line.startswith("### "):
-             content, style = line.lstrip('### '), style_h3
-
-         # Apply bold tags and then apply emoji font wrapper
          formatted_content = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', content)
          final_content = apply_emoji_font(formatted_content, emoji_font)
          story.append(Paragraph(final_content, style))
-
      return story

+
+ # --- 🔮 Omni-Model Processing ---
+
+ def process_text_input(prompt, model_deployment_name):
+     """💬 Sends a text prompt to the Azure OpenAI model and gets a response."""
+     if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is dummy text."
+     completion = client.chat.completions.create(
+         model=model_deployment_name,
+         messages=[{"role": "user", "content": prompt}]
+     )
+     return completion.choices[0].message.content
+
+ def process_image_input(image_file, prompt, model_deployment_name):
+     """🖼️ Encodes an image and sends it with a prompt to the Azure OpenAI model."""
+     if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy image description."
+     with Image.open(image_file.name) as img:
+         with io.BytesIO() as output:
+             img.save(output, format="PNG")
+             base64_image = base64.b64encode(output.getvalue()).decode("utf-8")
+
+     response = client.chat.completions.create(
+         model=model_deployment_name,
+         messages=[{"role": "user", "content": [
+             {"type": "text", "text": prompt},
+             {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
+         ]}]
+     )
+     return response.choices[0].message.content
+
+ def process_audio_input(audio_file, prompt, chat_model_deployment, transcribe_model_deployment):
+     """🎤 Transcribes audio and sends the text with a prompt to the Azure OpenAI model."""
+     if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy audio summary."
+     with open(audio_file.name, "rb") as f:
+         transcription = client.audio.transcriptions.create(
+             model=transcribe_model_deployment,
+             file=f
+         ).text
+
+     full_prompt = f"{prompt}\n\nAudio Transcription:\n{transcription}"
+     return process_text_input(full_prompt, chat_model_deployment)
+
+ def process_pdf_input(pdf_file, prompt, model_deployment_name, progress):
+     """📄 Performs OCR on a PDF by sending pages as images to the AI model."""
+     if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy PDF summary."
+
+     all_extracted_text = []
+     doc = fitz.open(pdf_file.name)
+
+     # Process pages in pairs
+     for i in progress.tqdm(range(0, len(doc), 2), desc="Performing PDF OCR"):
+         page_images = []
+         messages = [{"type": "text", "text": prompt}]
+
+         # Get first page of the pair
+         page1 = doc.load_page(i)
+         pix1 = page1.get_pixmap(dpi=150)
+         img_bytes1 = pix1.tobytes("png")
+         base64_image1 = base64.b64encode(img_bytes1).decode("utf-8")
+         messages.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image1}"}})
+
+         # Get second page if it exists
+         if i + 1 < len(doc):
+             page2 = doc.load_page(i + 1)
+             pix2 = page2.get_pixmap(dpi=150)
+             img_bytes2 = pix2.tobytes("png")
+             base64_image2 = base64.b64encode(img_bytes2).decode("utf-8")
+             messages.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image2}"}})
+
+         response = client.chat.completions.create(
+             model=model_deployment_name,
+             messages=[{"role": "user", "content": messages}]
+         )
+         all_extracted_text.append(response.choices[0].message.content)
+
+     return "\n\n".join(all_extracted_text)
+
+
+ # --- 🛠️ Helpers & Main API ---
+
+ def register_local_fonts():
+     """✒️ Scans for local .ttf fonts and registers them for PDF creation."""
+     text_font_names, emoji_font_name = [], None
+     font_files = list(FONT_DIR.glob("*.ttf"))
+     for font_path in font_files:
+         try:
+             font_name = font_path.stem
+             pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
+             pdfmetrics.registerFont(TTFont(f"{font_name}-Bold", str(font_path)))
+             pdfmetrics.registerFontFamily(font_name, normal=font_name, bold=f"{font_name}-Bold")
+             if "notocoloremoji-regular" in font_name.lower():
+                 emoji_font_name = font_name
+             else:
+                 text_font_names.append(font_name)
+         except Exception as e:
+             print(f"Could not register font {font_path.name}: {e}")
+     if not text_font_names: text_font_names.append('Helvetica')
+     return sorted(text_font_names), emoji_font_name
+
+ def apply_emoji_font(text: str, emoji_font_name: str) -> str:
+     """😊 Finds emojis and wraps them in special font tags for the PDF."""
+     if not emoji_font_name: return text
+     emoji_pattern = re.compile(f"([{re.escape(''.join(map(chr, range(0x1f600, 0x1f650))))}"
+                                f"{re.escape(''.join(map(chr, range(0x1f300, 0x1f5ff))))}]+)")
+     return emoji_pattern.sub(fr'<font name="{emoji_font_name}">\1</font>', text)
+
  def create_pdf_preview(pdf_path: Path):
+     """🏞️ Generates a PNG thumbnail for the first page of a PDF."""
      preview_path = PREVIEW_DIR / f"{pdf_path.stem}.png"
      try:
          doc = fitz.open(pdf_path); page = doc.load_page(0); pix = page.get_pixmap()
@@ -161,22 +252,50 @@ def create_pdf_preview(pdf_path: Path):
          return str(preview_path)
      except: return None

- # --- Main API Function ---
- def generate_outputs_api(files, output_formats, layouts, fonts, num_columns, page_w_mult, page_h_mult, progress=gr.Progress(track_tqdm=True)):
-     if not files: raise gr.Error("Please upload at least one file.")
+ def generate_outputs_api(omni_files, omni_prompt, chat_model, transcribe_model, output_formats, layouts, fonts, num_columns, page_w_mult, page_h_mult, progress=gr.Progress(track_tqdm=True)):
+     """🚀 The main entry point that orchestrates the entire multi-modal generation process."""
+     if not omni_prompt and not omni_files: raise gr.Error("Please provide a prompt or upload at least one file.")
      if not output_formats: raise gr.Error("Please select at least one output format.")
+
+     chat_deployment = AZURE_DEPLOYMENT_NAMES.get(chat_model)
+     transcribe_deployment = AZURE_DEPLOYMENT_NAMES.get(transcribe_model)
+     if not chat_deployment: raise gr.Error(f"Deployment for model '{chat_model}' not found in configuration.")

      shutil.rmtree(OUTPUT_DIR, ignore_errors=True); shutil.rmtree(PREVIEW_DIR, ignore_errors=True)
      OUTPUT_DIR.mkdir(); PREVIEW_DIR.mkdir()

-     # Consolidate all markdown content
-     md_content = "\n".join([Path(f.name).read_text(encoding='utf-8') for f in files if Path(f.name).suffix.lower() == '.md'])
+     # --- Step 1: Omni-Model Processing ---
+     md_content = ""
+     # Process files first
+     if omni_files:
+         # Check for multiple file types
+         file_paths = [Path(f.name) for f in omni_files]
+         extensions = {p.suffix.lower() for p in file_paths}

-     log_updates, generated_files = "", []
+         if '.md' in extensions:
+             md_content = "\n\n".join([p.read_text(encoding='utf-8') for p in file_paths if p.suffix.lower() == '.md'])
+         elif '.pdf' in extensions:
+             # For simplicity, we process only the first PDF if multiple are uploaded for OCR
+             pdf_file = next((f for f in omni_files if Path(f.name).suffix.lower() == '.pdf'), None)
+             ocr_prompt = omni_prompt if omni_prompt else "Extract all text from the following document pages."
+             md_content = process_pdf_input(pdf_file, ocr_prompt, chat_deployment, progress)
+         elif '.png' in extensions or '.jpg' in extensions or '.jpeg' in extensions:
+             image_file = next((f for f in omni_files if Path(f.name).suffix.lower() in ['.png', '.jpg', '.jpeg']), None)
+             md_content = process_image_input(image_file, omni_prompt, chat_deployment)
+         elif '.wav' in extensions or '.mp3' in extensions or '.m4a' in extensions:
+             if not transcribe_deployment: raise gr.Error(f"Deployment for model '{transcribe_model}' not found.")
+             audio_file = next((f for f in omni_files if Path(f.name).suffix.lower() in ['.wav', '.mp3', '.m4a']), None)
+             md_content = process_audio_input(audio_file, omni_prompt, chat_deployment, transcribe_deployment)
+     # If no files, process text prompt
+     elif omni_prompt:
+         md_content = process_text_input(omni_prompt, chat_deployment)
+
+     if not md_content: raise gr.Error("Failed to generate source content from the provided input.")

+     # --- Step 2: Generate Selected Document Formats ---
+     generated_files = []
      for format_choice in progress.tqdm(output_formats, desc="Generating Formats"):
          time_str = datetime.datetime.now().strftime('%m-%d-%a_%I%M%p').upper()
-
          if format_choice == "PDF":
              for layout_name in layouts:
                  for font_name in fonts:
@@ -187,61 +306,64 @@ def generate_outputs_api(files, output_formats, layouts, fonts, num_columns, pag
                      output_path = OUTPUT_DIR / filename
                      with open(output_path, "wb") as f: f.write(pdf_buffer.getvalue())
                      generated_files.append(output_path)
-
          elif format_choice == "DOCX":
              docx_doc = create_docx(md_content)
              filename = f"Document_{time_str}.docx"
              output_path = OUTPUT_DIR / filename
-             docx_doc.save(output_path)
-             generated_files.append(output_path)
-
+             docx_doc.save(output_path); generated_files.append(output_path)
          elif format_choice == "XLSX":
              xlsx_book = create_xlsx(md_content)
              filename = f"Outline_{time_str}.xlsx"
              output_path = OUTPUT_DIR / filename
-             xlsx_book.save(output_path)
-             generated_files.append(output_path)
+             xlsx_book.save(output_path); generated_files.append(output_path)

      gallery_previews = [create_pdf_preview(p) for p in generated_files if p.suffix == '.pdf']
      final_gallery = [g for g in gallery_previews if g]

-     return final_gallery, f"Generated {len(generated_files)} files.", [str(p) for p in generated_files]
+     return md_content, final_gallery, [str(p) for p in generated_files]

- # --- Gradio UI Definition ---
+ # --- 🎨 Gradio UI Definition ---
  AVAILABLE_FONTS, EMOJI_FONT_NAME = register_local_fonts()
- SAMPLE_MARKDOWN = "# Deities Guide\n\n- **Purpose**: Explore deities and their morals! \n- **Themes**: Justice ⚖️, faith 🙏\n\n# Arthurian Legends\n\n - **Merlin, Arthur**: Mentor 🧙, son 👑.\n - **Lesson**: Honor 🎖️ vs. betrayal 🗡️."
- with open(CWD / "sample.md", "w", encoding="utf-8") as f: f.write(SAMPLE_MARKDOWN)

- with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Document Generator") as demo:
-     gr.Markdown("# 📄 Advanced Document Generator (PDF, DOCX, XLSX)")
-     gr.Markdown("Upload Markdown files to generate documents in multiple formats. `# Headers` create columns in XLSX and page breaks in multi-page PDFs.")
+ with gr.Blocks(theme=gr.themes.Soft(), title="Omni-Model Document Generator") as demo:
+     gr.Markdown("# 🧠 Omni-Model Document Generator (PDF, DOCX, XLSX)")
+     gr.Markdown("Provide a prompt, or upload a Markdown, PDF, Image, or Audio file. The AI will process it, and you can generate documents from the result.")

      with gr.Row():
          with gr.Column(scale=1):
-             gr.Markdown("### ⚙️ Generation Settings")
-             uploaded_files = gr.File(label="Upload Markdown & Image Files", file_count="multiple", file_types=[".md", ".png", ".jpg"])
+             gr.Markdown("### ⚙️ Omni-Model Input")
+
+             chat_models = ["gpt-4o", "gpt-4.1", "gpt-4.1-mini", "gpt-4o-mini", "o1-mini", "o3-mini", "o4-mini"]
+             transcribe_models = ["gpt-4o-transcribe", "gpt-4o-mini-transcribe"]
+
+             selected_chat_model = gr.Dropdown(choices=chat_models, label="Select Chat/Vision/Reasoning Model", value=chat_models[0])
+             selected_transcribe_model = gr.Dropdown(choices=transcribe_models, label="Select Transcription Model (for audio)", value=transcribe_models[0])
+
+             omni_prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Ask a question, or provide instructions for a file...")
+             omni_files = gr.File(label="Upload File(s) (Optional)", file_count="multiple", file_types=["image", ".wav", ".mp3", ".md", ".pdf"])

+             gr.Markdown("### 📄 Output Settings")
              output_formats = gr.CheckboxGroup(choices=["PDF", "DOCX", "XLSX"], label="Select Output Formats", value=["PDF"])

              with gr.Accordion("PDF Customization", open=True):
-                 with gr.Row():
-                     page_w_mult_slider = gr.Slider(label="Page Width Multiplier", minimum=1, maximum=5, step=1, value=1)
-                     page_h_mult_slider = gr.Slider(label="Page Height Multiplier", minimum=1, maximum=2, step=1, value=1)
                  num_columns_slider = gr.Slider(label="Text Columns", minimum=1, maximum=4, step=1, value=1)
+                 page_w_mult_slider = gr.Slider(label="Page Width Multiplier", minimum=1, maximum=5, step=1, value=1)
+                 page_h_mult_slider = gr.Slider(label="Page Height Multiplier", minimum=1, maximum=2, step=1, value=1)
              selected_layouts = gr.CheckboxGroup(choices=list(LAYOUTS.keys()), label="Base Page Layout", value=["A4 Portrait"])
              selected_fonts = gr.CheckboxGroup(choices=AVAILABLE_FONTS, label="Text Font", value=[AVAILABLE_FONTS[0]] if AVAILABLE_FONTS else [])

              generate_btn = gr.Button("🚀 Generate Documents", variant="primary")

          with gr.Column(scale=2):
-             gr.Markdown("### 🖼️ Output Files")
+             gr.Markdown("### 🤖 AI Response (Source for Documents)")
+             ai_response_output = gr.Markdown(label="AI Generated Content")
+             gr.Markdown("### 🖼️ Final Documents")
              gallery_output = gr.Gallery(label="PDF Previews", show_label=False, elem_id="gallery", columns=3, height="auto", object_fit="contain")
-             log_output = gr.Markdown(label="Generation Log", value="Ready...")
              downloadable_files_output = gr.Files(label="Download Generated Files")

      generate_btn.click(fn=generate_outputs_api,
-                        inputs=[uploaded_files, output_formats, selected_layouts, selected_fonts, num_columns_slider, page_w_mult_slider, page_h_mult_slider],
-                        outputs=[gallery_output, log_output, downloadable_files_output])
+                        inputs=[omni_files, omni_prompt, selected_chat_model, selected_transcribe_model, output_formats, selected_layouts, selected_fonts, num_columns_slider, page_w_mult_slider, page_h_mult_slider],
+                        outputs=[ai_response_output, gallery_output, downloadable_files_output])

  if __name__ == "__main__":
      demo.launch()
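
For anyone trying this revision out: below is a minimal, hypothetical pre-flight sketch, not part of the commit, of the configuration the new Azure OpenAI code path assumes. The environment variable names come from the NOTE in app.py above; the deployment reminder simply restates that the "your-...-deployment-name" placeholders in AZURE_DEPLOYMENT_NAMES must be replaced with real Azure deployments, otherwise the process_* helpers fall back to the dummy responses they hard-code.

# Hypothetical pre-flight check (illustrative only), run before launching app.py.
import os

required = ["AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY"]  # names taken from the NOTE in app.py
missing = [name for name in required if not os.getenv(name)]
if missing:
    # Without these, the app prints its own warning and serves dummy text instead of real completions.
    print(f"Set these environment variables first: {', '.join(missing)}")

# Each UI model name must also map to a real deployment, e.g. (hypothetical name):
# AZURE_DEPLOYMENT_NAMES["gpt-4o"] = "my-gpt4o-deployment"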