Spaces: Build error
Update app.py
app.py CHANGED
Old version (removed lines are prefixed with "-"):

@@ -4,99 +4,99 @@ import datetime
import re
import os
import shutil
-import fitz  # PyMuPDF
-from PIL import Image
-from collections import defaultdict
import io
-

-#
from docx import Document
-from docx.shared import Inches
import openpyxl
-
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer,
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
-from reportlab.lib.pagesizes import letter, A4,
from reportlab.lib.units import inch
-from reportlab.lib import colors
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

# --- Configuration & Setup ---
CWD = Path.cwd()
-LAYOUTS = {
-    "A4 Portrait": {"size": A4},
-    "A4 Landscape": {"size": landscape(A4)},
-    "Letter Portrait": {"size": letter},
-    "Letter Landscape": {"size": landscape(letter)},
-}
OUTPUT_DIR = CWD / "generated_outputs"
PREVIEW_DIR = CWD / "previews"
FONT_DIR = CWD
-
-# Create necessary directories
OUTPUT_DIR.mkdir(exist_ok=True)
PREVIEW_DIR.mkdir(exist_ok=True)


-#
-
-
-
-
-
-
-
-
-
-
-
-
-            pdfmetrics.registerFont(TTFont(f"{font_name}-Bold", str(font_path)))
-            pdfmetrics.registerFontFamily(font_name, normal=font_name, bold=f"{font_name}-Bold")
-
-            if "notocoloremoji-regular" in font_name.lower():
-                emoji_font_name = font_name
-            elif "notoemoji" not in font_name.lower():
-                text_font_names.append(font_name)
-        except Exception as e:
-            print(f"Could not register font {font_path.name}: {e}")
-    if not text_font_names: text_font_names.append('Helvetica')
-    return sorted(text_font_names), emoji_font_name

-
-
-
-
-
-


-# --- Document Generation Engines ---

def create_pdf(md_content, font_name, emoji_font, pagesize, num_columns):
-    """
-
    story = markdown_to_story(md_content, font_name, emoji_font)
    if num_columns > 1:
-        doc = BaseDocTemplate(
-        frame_width = (doc.width / num_columns) - (num_columns - 1) * 0.1*inch
-        frames = [Frame(doc.leftMargin + i * (frame_width + 0.2*inch), doc.bottomMargin, frame_width, doc.height) for i in range(num_columns)]
        doc.addPageTemplates([PageTemplate(id='MultiCol', frames=frames)])
    else:
-        doc = SimpleDocTemplate(
    doc.build(story)
-

def create_docx(md_content):
-    """
    document = Document()
    for line in md_content.split('\n'):
        if line.startswith('# '): document.add_heading(line[2:], level=1)
        elif line.startswith('## '): document.add_heading(line[3:], level=2)
-        elif line.strip().startswith(('- ','* ')): document.add_paragraph(line.strip()[2:], style='List Bullet')
        else:
            p = document.add_paragraph()
            parts = re.split(r'(\*\*.*?\*\*)', line)
@@ -106,10 +106,10 @@ def create_docx(md_content):
    return document

def create_xlsx(md_content):
-    """
    workbook = openpyxl.Workbook(); sheet = workbook.active
    sections = re.split(r'\n# ', '\n' + md_content)
-    if sections[0] == '': sections.pop(0)
    column_data = []
    for section in sections:
        lines = section.split('\n'); header = lines[0]
@@ -122,38 +122,129 @@ def create_xlsx(md_content):
    return workbook

def markdown_to_story(markdown_text: str, font_name: str, emoji_font: str):
-    """
    styles = getSampleStyleSheet()
-    # Use the bold variant of the selected font for headers
    bold_font = f"{font_name}-Bold" if font_name != "Helvetica" else "Helvetica-Bold"
-
-    # Create styles with dynamic font sizes and bolding for headers
    style_normal = ParagraphStyle('BodyText', fontName=font_name, spaceAfter=6, fontSize=10)
-    style_h1 = ParagraphStyle('h1', fontName=bold_font, spaceBefore=12, fontSize=24
-    style_h2 = ParagraphStyle('h2', fontName=bold_font, spaceBefore=10, fontSize=18, leading=22)
-    style_h3 = ParagraphStyle('h3', fontName=bold_font, spaceBefore=8, fontSize=14, leading=18)
-
    story, first_heading = [], True
    for line in markdown_text.split('\n'):
        content, style = line, style_normal
-
-        # Determine the style based on markdown heading level
        if line.startswith("# "):
            if not first_heading: story.append(PageBreak())
            content, style, first_heading = line.lstrip('# '), style_h1, False
-        elif line.startswith("## "):
-            content, style = line.lstrip('## '), style_h2
-        elif line.startswith("### "):
-            content, style = line.lstrip('### '), style_h3
-
-        # Apply bold tags and then apply emoji font wrapper
        formatted_content = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', content)
        final_content = apply_emoji_font(formatted_content, emoji_font)
        story.append(Paragraph(final_content, style))
-
    return story

def create_pdf_preview(pdf_path: Path):
    preview_path = PREVIEW_DIR / f"{pdf_path.stem}.png"
    try:
        doc = fitz.open(pdf_path); page = doc.load_page(0); pix = page.get_pixmap()
@@ -161,22 +161,252 @@ def create_pdf_preview(pdf_path: Path):
        return str(preview_path)
    except: return None

-
-
-    if not
    if not output_formats: raise gr.Error("Please select at least one output format.")

    shutil.rmtree(OUTPUT_DIR, ignore_errors=True); shutil.rmtree(PREVIEW_DIR, ignore_errors=True)
    OUTPUT_DIR.mkdir(); PREVIEW_DIR.mkdir()

-    #
-    md_content = "

-

    for format_choice in progress.tqdm(output_formats, desc="Generating Formats"):
        time_str = datetime.datetime.now().strftime('%m-%d-%a_%I%M%p').upper()
-
        if format_choice == "PDF":
            for layout_name in layouts:
                for font_name in fonts:
@@ -187,61 +306,64 @@ def generate_outputs_api(files, output_formats, layouts, fonts, num_columns, pag
                    output_path = OUTPUT_DIR / filename
                    with open(output_path, "wb") as f: f.write(pdf_buffer.getvalue())
                    generated_files.append(output_path)
-
        elif format_choice == "DOCX":
            docx_doc = create_docx(md_content)
            filename = f"Document_{time_str}.docx"
            output_path = OUTPUT_DIR / filename
-            docx_doc.save(output_path)
-            generated_files.append(output_path)
-
        elif format_choice == "XLSX":
            xlsx_book = create_xlsx(md_content)
            filename = f"Outline_{time_str}.xlsx"
            output_path = OUTPUT_DIR / filename
-            xlsx_book.save(output_path)
-            generated_files.append(output_path)

    gallery_previews = [create_pdf_preview(p) for p in generated_files if p.suffix == '.pdf']
    final_gallery = [g for g in gallery_previews if g]

-    return

-# --- Gradio UI Definition ---
AVAILABLE_FONTS, EMOJI_FONT_NAME = register_local_fonts()
-SAMPLE_MARKDOWN = "# Deities Guide\n\n- **Purpose**: Explore deities and their morals! \n- **Themes**: Justice ⚖️, faith 🙏\n\n# Arthurian Legends\n\n - **Merlin, Arthur**: Mentor 🧙, son 👑.\n - **Lesson**: Honor 🎖️ vs. betrayal 🗡️."
-with open(CWD / "sample.md", "w", encoding="utf-8") as f: f.write(SAMPLE_MARKDOWN)

-with gr.Blocks(theme=gr.themes.Soft(), title="
-    gr.Markdown("#
-    gr.Markdown("

    with gr.Row():
        with gr.Column(scale=1):
-            gr.Markdown("### ⚙️
-
            output_formats = gr.CheckboxGroup(choices=["PDF", "DOCX", "XLSX"], label="Select Output Formats", value=["PDF"])

            with gr.Accordion("PDF Customization", open=True):
-                with gr.Row():
-                    page_w_mult_slider = gr.Slider(label="Page Width Multiplier", minimum=1, maximum=5, step=1, value=1)
-                    page_h_mult_slider = gr.Slider(label="Page Height Multiplier", minimum=1, maximum=2, step=1, value=1)
                num_columns_slider = gr.Slider(label="Text Columns", minimum=1, maximum=4, step=1, value=1)
                selected_layouts = gr.CheckboxGroup(choices=list(LAYOUTS.keys()), label="Base Page Layout", value=["A4 Portrait"])
                selected_fonts = gr.CheckboxGroup(choices=AVAILABLE_FONTS, label="Text Font", value=[AVAILABLE_FONTS[0]] if AVAILABLE_FONTS else [])

            generate_btn = gr.Button("🚀 Generate Documents", variant="primary")

        with gr.Column(scale=2):
-            gr.Markdown("###
            gallery_output = gr.Gallery(label="PDF Previews", show_label=False, elem_id="gallery", columns=3, height="auto", object_fit="contain")
-            log_output = gr.Markdown(label="Generation Log", value="Ready...")
            downloadable_files_output = gr.Files(label="Download Generated Files")

    generate_btn.click(fn=generate_outputs_api,
-                       inputs=[
-                       outputs=[

if __name__ == "__main__":
    demo.launch()
New version (added lines are prefixed with "+"):

import re
import os
import shutil
import io
+import base64
+from collections import defaultdict
+from PIL import Image

+# Document Generation Libs
from docx import Document
import openpyxl
+from pypdf import PdfWriter
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, BaseDocTemplate, Frame, PageTemplate
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib.pagesizes import letter, A4, landscape
from reportlab.lib.units import inch
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

+# AI and Media Libs
+from openai import AzureOpenAI
+import fitz  # PyMuPDF
+
# --- Configuration & Setup ---
CWD = Path.cwd()
OUTPUT_DIR = CWD / "generated_outputs"
PREVIEW_DIR = CWD / "previews"
FONT_DIR = CWD
OUTPUT_DIR.mkdir(exist_ok=True)
PREVIEW_DIR.mkdir(exist_ok=True)

+LAYOUTS = {
+    "A4 Portrait": {"size": A4},
+    "A4 Landscape": {"size": landscape(A4)},
+    "Letter Portrait": {"size": letter},
+    "Letter Landscape": {"size": landscape(letter)},
+}

+# 🧠 Initialize Azure OpenAI Client
+# NOTE: This requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY in your environment.
+try:
+    client = AzureOpenAI(
+        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+        api_version="2024-05-01-preview",
+        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
+    )
+    AZURE_CLIENT_AVAILABLE = True
+except Exception as e:
+    print("Warning: Azure OpenAI client could not be initialized. Text generation will use dummy data.")
+    print(f"Error: {e}")
+    client = None
+    AZURE_CLIENT_AVAILABLE = False

+# 📖 Map UI model names to your actual Azure deployment names.
+# YOU MUST CHANGE THESE DEPLOYMENT NAMES to match your Azure setup.
+AZURE_DEPLOYMENT_NAMES = {
+    # Chat / Vision Models
+    "gpt-4o": "your-gpt-4o-deployment-name",
+    "gpt-4.1": "your-gpt-4.1-deployment-name",
+    "gpt-4.1-mini": "your-gpt-4.1-mini-deployment-name",
+    "gpt-4o-mini": "your-gpt-4o-mini-deployment-name",
+    "gpt-4o-realtime-preview": "your-gpt-4o-realtime-deployment-name",
+    # Reasoning Models
+    "o1-mini": "your-o1-mini-deployment-name",
+    "o3-mini": "your-o3-mini-deployment-name",
+    "o4-mini": "your-o4-mini-deployment-name",
+    # Transcription Models
+    "gpt-4o-transcribe": "your-gpt-4o-transcribe-deployment",
+    "gpt-4o-mini-transcribe": "your-gpt-4o-mini-transcribe-deployment",
+}


+# --- ✍️ Document Generation Engines ---

def create_pdf(md_content, font_name, emoji_font, pagesize, num_columns):
+    """📄 Builds a beautiful PDF from a Markdown story using ReportLab."""
+    pdf_buffer = io.BytesIO()
    story = markdown_to_story(md_content, font_name, emoji_font)
    if num_columns > 1:
+        doc = BaseDocTemplate(pdf_buffer, pagesize=pagesize, leftMargin=0.5 * inch, rightMargin=0.5 * inch)
+        frame_width = (doc.width / num_columns) - (num_columns - 1) * 0.1 * inch
+        frames = [Frame(doc.leftMargin + i * (frame_width + 0.2 * inch), doc.bottomMargin, frame_width, doc.height) for i in range(num_columns)]
        doc.addPageTemplates([PageTemplate(id='MultiCol', frames=frames)])
    else:
+        doc = SimpleDocTemplate(pdf_buffer, pagesize=pagesize)
    doc.build(story)
+    pdf_buffer.seek(0)
+    return pdf_buffer

def create_docx(md_content):
+    """📝 Crafts a DOCX document, translating Markdown to Word elements."""
    document = Document()
    for line in md_content.split('\n'):
        if line.startswith('# '): document.add_heading(line[2:], level=1)
        elif line.startswith('## '): document.add_heading(line[3:], level=2)
+        elif line.strip().startswith(('- ', '* ')): document.add_paragraph(line.strip()[2:], style='List Bullet')
        else:
            p = document.add_paragraph()
            parts = re.split(r'(\*\*.*?\*\*)', line)

    return document

def create_xlsx(md_content):
+    """📊 Organizes a Markdown outline into columns in an XLSX file."""
    workbook = openpyxl.Workbook(); sheet = workbook.active
    sections = re.split(r'\n# ', '\n' + md_content)
+    if sections and sections[0] == '': sections.pop(0)
    column_data = []
    for section in sections:
        lines = section.split('\n'); header = lines[0]

    return workbook

def markdown_to_story(markdown_text: str, font_name: str, emoji_font: str):
+    """📜 Translates Markdown text into a sequence of ReportLab flowables for PDF rendering."""
    styles = getSampleStyleSheet()
    bold_font = f"{font_name}-Bold" if font_name != "Helvetica" else "Helvetica-Bold"
    style_normal = ParagraphStyle('BodyText', fontName=font_name, spaceAfter=6, fontSize=10)
+    style_h1 = ParagraphStyle('h1', fontName=bold_font, spaceBefore=12, fontSize=24)
    story, first_heading = [], True
    for line in markdown_text.split('\n'):
        content, style = line, style_normal
        if line.startswith("# "):
            if not first_heading: story.append(PageBreak())
            content, style, first_heading = line.lstrip('# '), style_h1, False
        formatted_content = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', content)
        final_content = apply_emoji_font(formatted_content, emoji_font)
        story.append(Paragraph(final_content, style))
    return story

+
+# --- 🔮 Omni-Model Processing ---
+
+def process_text_input(prompt, model_deployment_name):
+    """💬 Sends a text prompt to the Azure OpenAI model and gets a response."""
+    if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is dummy text."
+    completion = client.chat.completions.create(
+        model=model_deployment_name,
+        messages=[{"role": "user", "content": prompt}]
+    )
+    return completion.choices[0].message.content
+
+def process_image_input(image_file, prompt, model_deployment_name):
+    """🖼️ Encodes an image and sends it with a prompt to the Azure OpenAI model."""
+    if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy image description."
+    with Image.open(image_file.name) as img:
+        with io.BytesIO() as output:
+            img.save(output, format="PNG")
+            base64_image = base64.b64encode(output.getvalue()).decode("utf-8")
+
+    response = client.chat.completions.create(
+        model=model_deployment_name,
+        messages=[{"role": "user", "content": [
+            {"type": "text", "text": prompt},
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
+        ]}]
+    )
+    return response.choices[0].message.content
+
+def process_audio_input(audio_file, prompt, chat_model_deployment, transcribe_model_deployment):
+    """🎤 Transcribes audio and sends the text with a prompt to the Azure OpenAI model."""
+    if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy audio summary."
+    with open(audio_file.name, "rb") as f:
+        transcription = client.audio.transcriptions.create(
+            model=transcribe_model_deployment,
+            file=f
+        ).text
+
+    full_prompt = f"{prompt}\n\nAudio Transcription:\n{transcription}"
+    return process_text_input(full_prompt, chat_model_deployment)
+
+def process_pdf_input(pdf_file, prompt, model_deployment_name, progress):
+    """📄 Performs OCR on a PDF by sending pages as images to the AI model."""
+    if not AZURE_CLIENT_AVAILABLE: return "Azure OpenAI client not configured. This is a dummy PDF summary."
+
+    all_extracted_text = []
+    doc = fitz.open(pdf_file.name)
+
+    # Process pages in pairs
+    for i in progress.tqdm(range(0, len(doc), 2), desc="Performing PDF OCR"):
+        page_images = []
+        messages = [{"type": "text", "text": prompt}]
+
+        # Get first page of the pair
+        page1 = doc.load_page(i)
+        pix1 = page1.get_pixmap(dpi=150)
+        img_bytes1 = pix1.tobytes("png")
+        base64_image1 = base64.b64encode(img_bytes1).decode("utf-8")
+        messages.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image1}"}})
+
+        # Get second page if it exists
+        if i + 1 < len(doc):
+            page2 = doc.load_page(i + 1)
+            pix2 = page2.get_pixmap(dpi=150)
+            img_bytes2 = pix2.tobytes("png")
+            base64_image2 = base64.b64encode(img_bytes2).decode("utf-8")
+            messages.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image2}"}})
+
+        response = client.chat.completions.create(
+            model=model_deployment_name,
+            messages=[{"role": "user", "content": messages}]
+        )
+        all_extracted_text.append(response.choices[0].message.content)
+
+    return "\n\n".join(all_extracted_text)
+
+
+# --- 🛠️ Helpers & Main API ---
+
+def register_local_fonts():
+    """✒️ Scans for local .ttf fonts and registers them for PDF creation."""
+    text_font_names, emoji_font_name = [], None
+    font_files = list(FONT_DIR.glob("*.ttf"))
+    for font_path in font_files:
+        try:
+            font_name = font_path.stem
+            pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
+            pdfmetrics.registerFont(TTFont(f"{font_name}-Bold", str(font_path)))
+            pdfmetrics.registerFontFamily(font_name, normal=font_name, bold=f"{font_name}-Bold")
+            if "notocoloremoji-regular" in font_name.lower():
+                emoji_font_name = font_name
+            else:
+                text_font_names.append(font_name)
+        except Exception as e:
+            print(f"Could not register font {font_path.name}: {e}")
+    if not text_font_names: text_font_names.append('Helvetica')
+    return sorted(text_font_names), emoji_font_name
+
+def apply_emoji_font(text: str, emoji_font_name: str) -> str:
+    """😊 Finds emojis and wraps them in special font tags for the PDF."""
+    if not emoji_font_name: return text
+    emoji_pattern = re.compile(f"([{re.escape(''.join(map(chr, range(0x1f600, 0x1f650))))}"
+                               f"{re.escape(''.join(map(chr, range(0x1f300, 0x1f5ff))))}]+)")
+    return emoji_pattern.sub(fr'<font name="{emoji_font_name}">\1</font>', text)
+
def create_pdf_preview(pdf_path: Path):
+    """🏞️ Generates a PNG thumbnail for the first page of a PDF."""
    preview_path = PREVIEW_DIR / f"{pdf_path.stem}.png"
    try:
        doc = fitz.open(pdf_path); page = doc.load_page(0); pix = page.get_pixmap()
        return str(preview_path)
    except: return None

+def generate_outputs_api(omni_files, omni_prompt, chat_model, transcribe_model, output_formats, layouts, fonts, num_columns, page_w_mult, page_h_mult, progress=gr.Progress(track_tqdm=True)):
+    """🚀 The main entry point that orchestrates the entire multi-modal generation process."""
+    if not omni_prompt and not omni_files: raise gr.Error("Please provide a prompt or upload at least one file.")
    if not output_formats: raise gr.Error("Please select at least one output format.")
+
+    chat_deployment = AZURE_DEPLOYMENT_NAMES.get(chat_model)
+    transcribe_deployment = AZURE_DEPLOYMENT_NAMES.get(transcribe_model)
+    if not chat_deployment: raise gr.Error(f"Deployment for model '{chat_model}' not found in configuration.")

    shutil.rmtree(OUTPUT_DIR, ignore_errors=True); shutil.rmtree(PREVIEW_DIR, ignore_errors=True)
    OUTPUT_DIR.mkdir(); PREVIEW_DIR.mkdir()

+    # --- Step 1: Omni-Model Processing ---
+    md_content = ""
+    # Process files first
+    if omni_files:
+        # Check for multiple file types
+        file_paths = [Path(f.name) for f in omni_files]
+        extensions = {p.suffix.lower() for p in file_paths}

+        if '.md' in extensions:
+            md_content = "\n\n".join([p.read_text(encoding='utf-8') for p in file_paths if p.suffix.lower() == '.md'])
+        elif '.pdf' in extensions:
+            # For simplicity, we process only the first PDF if multiple are uploaded for OCR
+            pdf_file = next((f for f in omni_files if Path(f.name).suffix.lower() == '.pdf'), None)
+            ocr_prompt = omni_prompt if omni_prompt else "Extract all text from the following document pages."
+            md_content = process_pdf_input(pdf_file, ocr_prompt, chat_deployment, progress)
+        elif '.png' in extensions or '.jpg' in extensions or '.jpeg' in extensions:
+            image_file = next((f for f in omni_files if Path(f.name).suffix.lower() in ['.png', '.jpg', '.jpeg']), None)
+            md_content = process_image_input(image_file, omni_prompt, chat_deployment)
+        elif '.wav' in extensions or '.mp3' in extensions or '.m4a' in extensions:
+            if not transcribe_deployment: raise gr.Error(f"Deployment for model '{transcribe_model}' not found.")
+            audio_file = next((f for f in omni_files if Path(f.name).suffix.lower() in ['.wav', '.mp3', '.m4a']), None)
+            md_content = process_audio_input(audio_file, omni_prompt, chat_deployment, transcribe_deployment)
+    # If no files, process text prompt
+    elif omni_prompt:
+        md_content = process_text_input(omni_prompt, chat_deployment)
+
+    if not md_content: raise gr.Error("Failed to generate source content from the provided input.")

+    # --- Step 2: Generate Selected Document Formats ---
+    generated_files = []
    for format_choice in progress.tqdm(output_formats, desc="Generating Formats"):
        time_str = datetime.datetime.now().strftime('%m-%d-%a_%I%M%p').upper()
        if format_choice == "PDF":
            for layout_name in layouts:
                for font_name in fonts:
                    output_path = OUTPUT_DIR / filename
                    with open(output_path, "wb") as f: f.write(pdf_buffer.getvalue())
                    generated_files.append(output_path)
        elif format_choice == "DOCX":
            docx_doc = create_docx(md_content)
            filename = f"Document_{time_str}.docx"
            output_path = OUTPUT_DIR / filename
+            docx_doc.save(output_path); generated_files.append(output_path)
        elif format_choice == "XLSX":
            xlsx_book = create_xlsx(md_content)
            filename = f"Outline_{time_str}.xlsx"
            output_path = OUTPUT_DIR / filename
+            xlsx_book.save(output_path); generated_files.append(output_path)

    gallery_previews = [create_pdf_preview(p) for p in generated_files if p.suffix == '.pdf']
    final_gallery = [g for g in gallery_previews if g]

+    return md_content, final_gallery, [str(p) for p in generated_files]

+# --- 🎨 Gradio UI Definition ---
AVAILABLE_FONTS, EMOJI_FONT_NAME = register_local_fonts()

+with gr.Blocks(theme=gr.themes.Soft(), title="Omni-Model Document Generator") as demo:
+    gr.Markdown("# 🧠 Omni-Model Document Generator (PDF, DOCX, XLSX)")
+    gr.Markdown("Provide a prompt, or upload a Markdown, PDF, Image, or Audio file. The AI will process it, and you can generate documents from the result.")

    with gr.Row():
        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Omni-Model Input")
+
+            chat_models = ["gpt-4o", "gpt-4.1", "gpt-4.1-mini", "gpt-4o-mini", "o1-mini", "o3-mini", "o4-mini"]
+            transcribe_models = ["gpt-4o-transcribe", "gpt-4o-mini-transcribe"]
+
+            selected_chat_model = gr.Dropdown(choices=chat_models, label="Select Chat/Vision/Reasoning Model", value=chat_models[0])
+            selected_transcribe_model = gr.Dropdown(choices=transcribe_models, label="Select Transcription Model (for audio)", value=transcribe_models[0])
+
+            omni_prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Ask a question, or provide instructions for a file...")
+            omni_files = gr.File(label="Upload File(s) (Optional)", file_count="multiple", file_types=["image", ".wav", ".mp3", ".md", ".pdf"])

+            gr.Markdown("### 📄 Output Settings")
            output_formats = gr.CheckboxGroup(choices=["PDF", "DOCX", "XLSX"], label="Select Output Formats", value=["PDF"])

            with gr.Accordion("PDF Customization", open=True):
                num_columns_slider = gr.Slider(label="Text Columns", minimum=1, maximum=4, step=1, value=1)
+                page_w_mult_slider = gr.Slider(label="Page Width Multiplier", minimum=1, maximum=5, step=1, value=1)
+                page_h_mult_slider = gr.Slider(label="Page Height Multiplier", minimum=1, maximum=2, step=1, value=1)
                selected_layouts = gr.CheckboxGroup(choices=list(LAYOUTS.keys()), label="Base Page Layout", value=["A4 Portrait"])
                selected_fonts = gr.CheckboxGroup(choices=AVAILABLE_FONTS, label="Text Font", value=[AVAILABLE_FONTS[0]] if AVAILABLE_FONTS else [])

            generate_btn = gr.Button("🚀 Generate Documents", variant="primary")

        with gr.Column(scale=2):
+            gr.Markdown("### 🤖 AI Response (Source for Documents)")
+            ai_response_output = gr.Markdown(label="AI Generated Content")
+            gr.Markdown("### 🖼️ Final Documents")
            gallery_output = gr.Gallery(label="PDF Previews", show_label=False, elem_id="gallery", columns=3, height="auto", object_fit="contain")
            downloadable_files_output = gr.Files(label="Download Generated Files")

    generate_btn.click(fn=generate_outputs_api,
+                       inputs=[omni_files, omni_prompt, selected_chat_model, selected_transcribe_model, output_formats, selected_layouts, selected_fonts, num_columns_slider, page_w_mult_slider, page_h_mult_slider],
+                       outputs=[ai_response_output, gallery_output, downloadable_files_output])

if __name__ == "__main__":
    demo.launch()
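For reviewers who want to sanity-check this commit locally, the non-AI document engines can be exercised without Azure credentials, since create_docx and create_xlsx only take a Markdown string and the new Azure client block falls back to client = None (with a printed warning) when the environment variables are missing. A minimal sketch, assuming the file is saved as app.py with its dependencies installed; the module import and the sample Markdown below are illustrative and not part of the commit:

# Hypothetical local smoke test for the document engines in this commit.
# Assumes app.py imports cleanly; without AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY
# set, the module prints its warning and the AI paths return dummy text.
import app

sample_md = "# Demo\n\n- **Point**: generated without calling Azure"

docx_doc = app.create_docx(sample_md)   # python-docx Document
docx_doc.save("smoke_test.docx")

xlsx_book = app.create_xlsx(sample_md)  # openpyxl Workbook
xlsx_book.save("smoke_test.xlsx")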