Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from pathlib import Path
|
3 |
+
import datetime
|
4 |
+
import re
|
5 |
+
import os
|
6 |
+
import shutil
|
7 |
+
import fitz # PyMuPDF
|
8 |
+
from PIL import Image
|
9 |
+
from collections import defaultdict
|
10 |
+
import io
|
11 |
+
from pypdf import PdfWriter
|
12 |
+
|
13 |
+
# Imports for new formats
|
14 |
+
from docx import Document
|
15 |
+
from docx.shared import Inches
|
16 |
+
import openpyxl
|
17 |
+
|
18 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, BaseDocTemplate, Frame, PageTemplate, Image as ReportLabImage
|
19 |
+
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
20 |
+
from reportlab.lib.pagesizes import letter, A4, legal, landscape
|
21 |
+
from reportlab.lib.units import inch
|
22 |
+
from reportlab.lib import colors
|
23 |
+
from reportlab.pdfbase import pdfmetrics
|
24 |
+
from reportlab.pdfbase.ttfonts import TTFont
|
25 |
+
|
26 |
+
# --- Configuration & Setup ---
|
27 |
+
# All working paths are resolved against the directory the app is started from.
CWD = Path.cwd()

# Page-size presets offered in the UI, keyed by their display label.
LAYOUTS = {
    "A4 Portrait": {"size": A4},
    "A4 Landscape": {"size": landscape(A4)},
    "Letter Portrait": {"size": letter},
    "Letter Landscape": {"size": landscape(letter)},
}

OUTPUT_DIR = CWD / "generated_outputs"  # generated PDF/DOCX/XLSX files
PREVIEW_DIR = CWD / "previews"          # PNG previews of generated PDFs
FONT_DIR = CWD                          # .ttf files are looked up here

# Make sure the working directories exist before anything writes to them.
OUTPUT_DIR.mkdir(exist_ok=True)
PREVIEW_DIR.mkdir(exist_ok=True)
|
41 |
+
|
42 |
+
|
43 |
+
# --- Font & Emoji Handling (for PDF) ---
|
44 |
+
|
45 |
+
def register_local_fonts():
    """Register every ``.ttf`` file in ``FONT_DIR`` with ReportLab.

    Each font is registered under its file stem and again under
    ``<stem>-Bold`` (pointing at the same file), and the two are grouped into
    a font family so ``<b>`` markup resolves to a registered name.

    Returns:
        A ``(text_font_names, emoji_font_name)`` tuple.  ``text_font_names``
        is a sorted list of registered stems that do not contain
        ``notoemoji`` and are not the colour-emoji font; it falls back to
        ``['Helvetica']`` when nothing qualified.  ``emoji_font_name`` is the
        stem of the last font whose name contains ``notocoloremoji-regular``,
        or ``None``.
    """
    ttf_paths = list(FONT_DIR.glob("*.ttf"))
    print(f"Found {len(ttf_paths)} .ttf files: {[f.name for f in ttf_paths]}")

    text_fonts = []
    emoji_font = None
    for font_path in ttf_paths:
        stem = font_path.stem
        bold_alias = f"{stem}-Bold"
        try:
            pdfmetrics.registerFont(TTFont(stem, str(font_path)))
            # No separate bold file is available, so the bold alias reuses
            # the regular face.
            pdfmetrics.registerFont(TTFont(bold_alias, str(font_path)))
            pdfmetrics.registerFontFamily(stem, normal=stem, bold=bold_alias)
        except Exception as e:
            # Best effort: one unreadable font must not stop the others.
            print(f"Could not register font {font_path.name}: {e}")
            continue

        lowered = stem.lower()
        if "notocoloremoji-regular" in lowered:
            emoji_font = stem
        elif "notoemoji" not in lowered:
            text_fonts.append(stem)

    return sorted(text_fonts or ['Helvetica']), emoji_font
|
68 |
+
|
69 |
+
# One or more emoji, each optionally followed by a variation selector
# (U+FE0F) or zero-width joiner (U+200D) so multi-code-point sequences stay
# inside a single <font> tag.  Covers U+1F300-U+1FAFF (pictographs, emoticons,
# transport, supplemental symbols) and U+2600-U+27BF (misc symbols, dingbats).
# NOTE(review): assumes the emoji font has glyphs for these ranges - confirm.
_EMOJI_PATTERN = re.compile(
    "((?:[\U0001F300-\U0001FAFF\u2600-\u27BF][\uFE0F\u200D]*)+)"
)


def apply_emoji_font(text: str, emoji_font_name: str) -> str:
    """Wrap runs of emoji characters in a ``<font>`` tag for ReportLab.

    Args:
        text: Paragraph text, possibly already containing markup.
        emoji_font_name: Registered name of the emoji font.  When falsy
            (``None`` or ``""``) the text is returned unchanged.

    Returns:
        ``text`` with each run of consecutive emoji wrapped in
        ``<font name="...">...</font>``.
    """
    if not emoji_font_name:
        return text

    # A callable replacement keeps the font name literal; a template string
    # would interpret backslashes in the name as regex escapes.
    def wrap(match):
        return f'<font name="{emoji_font_name}">{match.group(1)}</font>'

    return _EMOJI_PATTERN.sub(wrap, text)
|
75 |
+
|
76 |
+
|
77 |
+
# --- Document Generation Engines ---
|
78 |
+
|
79 |
+
def create_pdf(md_content, font_name, emoji_font, pagesize, num_columns):
    """Render markdown content to a PDF held in memory.

    Args:
        md_content: Markdown source text.
        font_name: Registered ReportLab font name for body text.
        emoji_font: Registered emoji font name, or ``None``.
        pagesize: ``(width, height)`` of the page in points.
        num_columns: Number of text columns; values above 1 use a
            multi-frame page template.

    Returns:
        An ``io.BytesIO`` containing the PDF, rewound to the start.
    """
    # UI sliders may deliver the count as a float; range() needs an int.
    num_columns = int(num_columns)
    pdf_buffer = io.BytesIO()
    story = markdown_to_story(md_content, font_name, emoji_font)

    if num_columns > 1:
        gutter = 0.2 * inch
        doc = BaseDocTemplate(
            pdf_buffer,
            pagesize=pagesize,
            leftMargin=0.5 * inch,
            rightMargin=0.5 * inch,
        )
        # Columns plus the gutters between them span exactly doc.width.
        # (The previous formula only did so for two columns.)
        frame_width = (doc.width - (num_columns - 1) * gutter) / num_columns
        frames = [
            Frame(
                doc.leftMargin + index * (frame_width + gutter),
                doc.bottomMargin,
                frame_width,
                doc.height,
            )
            for index in range(num_columns)
        ]
        doc.addPageTemplates([PageTemplate(id='MultiCol', frames=frames)])
    else:
        # Single column: the library's default margins apply.
        doc = SimpleDocTemplate(pdf_buffer, pagesize=pagesize)

    doc.build(story)
    # Rewind so callers can read() as well as getvalue().
    pdf_buffer.seek(0)
    return pdf_buffer
|
92 |
+
|
93 |
+
def create_docx(md_content):
    """Build a DOCX document from markdown content.

    Supported syntax: ``# ``, ``## `` and ``### `` headings (levels 1-3),
    ``- `` / ``* `` bullet items (leading whitespace allowed) and
    ``**bold**`` spans inside bullets and ordinary paragraphs.  Every other
    line, including blank ones, becomes its own paragraph.

    Args:
        md_content: Markdown source text.

    Returns:
        The populated ``Document``; the caller is responsible for saving it.
    """
    bold_splitter = re.compile(r'(\*\*.*?\*\*)')

    def add_runs(paragraph, text):
        """Append ``text`` to ``paragraph``, bolding ``**...**`` spans."""
        for part in bold_splitter.split(text):
            if not part:
                continue
            # len >= 4 keeps a stray "**" or "***" from being treated as an
            # empty bold span, which would drop the text.
            if len(part) >= 4 and part.startswith('**') and part.endswith('**'):
                paragraph.add_run(part[2:-2]).bold = True
            else:
                paragraph.add_run(part)

    document = Document()
    for line in md_content.split('\n'):
        stripped = line.strip()
        if line.startswith('# '):
            document.add_heading(line[2:], level=1)
        elif line.startswith('## '):
            document.add_heading(line[3:], level=2)
        elif line.startswith('### '):
            # Matches the heading levels handled by the PDF renderer.
            document.add_heading(line[4:], level=3)
        elif stripped.startswith(('- ', '* ')):
            add_runs(document.add_paragraph(style='List Bullet'), stripped[2:])
        else:
            add_runs(document.add_paragraph(), line)
    return document
|
107 |
+
|
108 |
+
def create_xlsx(md_content):
    """Build an XLSX workbook with one column per ``# `` section.

    The text is split at every line starting with ``# ``.  For each section
    the first line goes into row 1 of its column and the remaining non-blank
    lines, stripped, fill the rows beneath it.

    Args:
        md_content: Markdown source text.

    Returns:
        The populated ``openpyxl`` workbook; the caller saves it.
    """
    book = openpyxl.Workbook()
    worksheet = book.active

    # The leading newline lets a heading on the very first line match too.
    chunks = re.split(r'\n# ', '\n' + md_content)
    if chunks[0] == '':
        del chunks[0]

    for column, chunk in enumerate(chunks, start=1):
        header, *body = chunk.split('\n')
        worksheet.cell(row=1, column=column, value=header)

        entries = [text.strip() for text in body if text.strip()]
        for row, entry in enumerate(entries, start=2):
            worksheet.cell(row=row, column=column, value=entry)

    return book
|
123 |
+
|
124 |
+
def markdown_to_story(markdown_text: str, font_name: str, emoji_font: str):
    """Convert markdown text into a list of ReportLab flowables.

    Each input line becomes one ``Paragraph``.  Lines starting with ``# ``,
    ``## `` or ``### `` use heading styles; every ``# `` heading after the
    first is preceded by a ``PageBreak``.  ``**text**`` becomes ``<b>`` markup
    and emoji are wrapped in the emoji font via ``apply_emoji_font``.

    Args:
        markdown_text: Markdown source text.
        font_name: Registered font name for body text.  Headings use
            ``<font_name>-Bold``, or ``Helvetica-Bold`` for Helvetica.
        emoji_font: Registered emoji font name, or ``None``.

    Returns:
        The list of flowables to pass to a document's ``build``.
    """
    # Local import so this function carries its own dependency.
    from xml.sax.saxutils import escape

    bold_font = f"{font_name}-Bold" if font_name != "Helvetica" else "Helvetica-Bold"

    style_normal = ParagraphStyle('BodyText', fontName=font_name, spaceAfter=6, fontSize=10)
    style_h1 = ParagraphStyle('h1', fontName=bold_font, spaceBefore=12, fontSize=24, leading=28)
    style_h2 = ParagraphStyle('h2', fontName=bold_font, spaceBefore=10, fontSize=18, leading=22)
    style_h3 = ParagraphStyle('h3', fontName=bold_font, spaceBefore=8, fontSize=14, leading=18)

    bold_pattern = re.compile(r'\*\*(.*?)\*\*')

    story, first_heading = [], True
    for line in markdown_text.split('\n'):
        content, style = line, style_normal

        # Drop the marker by slicing.  str.lstrip('# ') strips a character
        # set, so it also ate a '#' that begins the title ("# #1 Goal").
        if line.startswith("# "):
            if not first_heading:
                story.append(PageBreak())
            content, style, first_heading = line[2:].lstrip(' '), style_h1, False
        elif line.startswith("## "):
            content, style = line[3:].lstrip(' '), style_h2
        elif line.startswith("### "):
            content, style = line[4:].lstrip(' '), style_h3

        # Escape &, < and > first so user text is not parsed as paragraph
        # markup; only the tags added below are meant as markup.
        safe_content = escape(content)
        formatted_content = bold_pattern.sub(r'<b>\1</b>', safe_content)
        final_content = apply_emoji_font(formatted_content, emoji_font)
        story.append(Paragraph(final_content, style))

    return story
|
155 |
+
|
156 |
+
def create_pdf_preview(pdf_path: Path):
    """Render the first page of a PDF to a PNG in ``PREVIEW_DIR``.

    Args:
        pdf_path: Path of the PDF to preview.

    Returns:
        The preview image path as a string, or ``None`` when the preview
        could not be produced (the reason is printed).
    """
    preview_path = PREVIEW_DIR / f"{pdf_path.stem}.png"
    try:
        doc = fitz.open(pdf_path)
        try:
            pix = doc.load_page(0).get_pixmap()
            pix.save(str(preview_path))
        finally:
            # Release the file handle even when rendering fails.
            doc.close()
    except Exception as exc:
        # A missing preview must not abort generation, but the reason
        # should still be visible in the logs.
        print(f"Could not create preview for {pdf_path.name}: {exc}")
        return None
    return str(preview_path)
|
163 |
+
|
164 |
+
# --- Main API Function ---
|
165 |
+
def generate_outputs_api(files, output_formats, layouts, fonts, num_columns, page_w_mult, page_h_mult, progress=gr.Progress(track_tqdm=True)):
    """Generate the requested documents from the uploaded markdown files.

    All uploaded ``.md`` files are concatenated in upload order and rendered
    once per selected format.  PDF output yields one file per layout and font
    combination.  ``OUTPUT_DIR`` and ``PREVIEW_DIR`` are emptied first.

    Args:
        files: Uploaded files, as path strings or objects with ``.name``.
        output_formats: Any of ``"PDF"``, ``"DOCX"``, ``"XLSX"``.
        layouts: Keys of ``LAYOUTS`` to render PDFs with.
        fonts: Font names to render PDFs with.
        num_columns: Number of text columns for PDFs.
        page_w_mult: Multiplier applied to the base page width.
        page_h_mult: Multiplier applied to the base page height.
        progress: Gradio progress tracker.

    Returns:
        ``(preview_image_paths, status_message, generated_file_paths)``.

    Raises:
        gr.Error: If no files, no markdown files or no formats were given,
            or PDF was requested without a layout or font.
    """
    if not files: raise gr.Error("Please upload at least one file.")
    if not output_formats: raise gr.Error("Please select at least one output format.")
    if "PDF" in output_formats and not (layouts and fonts):
        raise gr.Error("Please select at least one page layout and one font for PDF output.")

    # Uploads may arrive as path strings or as objects exposing the path as
    # ``.name``; accept both.
    upload_paths = [Path(getattr(f, "name", f)) for f in files]
    md_paths = [p for p in upload_paths if p.suffix.lower() == '.md']
    if not md_paths:
        raise gr.Error("Please upload at least one Markdown (.md) file.")

    # Read the inputs before wiping earlier results, so a bad upload does
    # not destroy the previous run's files.
    md_content = "\n".join(p.read_text(encoding='utf-8') for p in md_paths)

    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    shutil.rmtree(PREVIEW_DIR, ignore_errors=True)
    # exist_ok: rmtree may have failed silently and left the directory.
    OUTPUT_DIR.mkdir(exist_ok=True)
    PREVIEW_DIR.mkdir(exist_ok=True)

    generated_files = []

    for format_choice in progress.tqdm(output_formats, desc="Generating Formats"):
        time_str = datetime.datetime.now().strftime('%m-%d-%a_%I%M%p').upper()

        if format_choice == "PDF":
            for layout_name in layouts:
                base_width, base_height = LAYOUTS[layout_name]["size"]
                final_pagesize = (base_width * page_w_mult, base_height * page_h_mult)
                for font_name in fonts:
                    pdf_buffer = create_pdf(md_content, font_name, EMOJI_FONT_NAME, final_pagesize, int(num_columns))
                    filename = f"Document_{time_str}_{layout_name.replace(' ','-')}_{font_name}.pdf"
                    output_path = OUTPUT_DIR / filename
                    output_path.write_bytes(pdf_buffer.getvalue())
                    generated_files.append(output_path)

        elif format_choice == "DOCX":
            output_path = OUTPUT_DIR / f"Document_{time_str}.docx"
            create_docx(md_content).save(output_path)
            generated_files.append(output_path)

        elif format_choice == "XLSX":
            output_path = OUTPUT_DIR / f"Outline_{time_str}.xlsx"
            create_xlsx(md_content).save(output_path)
            generated_files.append(output_path)

    # Only PDFs get an image preview; failed previews (None) are dropped.
    gallery_previews = [create_pdf_preview(p) for p in generated_files if p.suffix == '.pdf']
    final_gallery = [g for g in gallery_previews if g]

    return final_gallery, f"Generated {len(generated_files)} files.", [str(p) for p in generated_files]
|
209 |
+
|
210 |
+
# --- Gradio UI Definition ---
|
211 |
+
# Fonts are registered once at import time; the results feed the UI choices
# and the PDF renderer.
AVAILABLE_FONTS, EMOJI_FONT_NAME = register_local_fonts()

# Sample document written next to the app.
# NOTE(review): the emoji had been corrupted into mis-decoded bytes; they were
# restored from the surviving bytes and the surrounding words - confirm
# against the upstream file.
SAMPLE_MARKDOWN = (
    "# Deities Guide\n\n"
    "- **Purpose**: Explore deities and their morals! \n"
    "- **Themes**: Justice ⚖️, faith 🙏\n\n"
    "# Arthurian Legends\n\n"
    " - **Merlin, Arthur**: Mentor 🧙, son 👑.\n"
    " - **Lesson**: Honor 🎖️ vs. betrayal 🗡️."
)
with open(CWD / "sample.md", "w", encoding="utf-8") as f:
    f.write(SAMPLE_MARKDOWN)
|
214 |
+
|
215 |
+
# NOTE(review): the original indentation was lost, so component nesting is
# reconstructed from statement order, and the emoji in the labels were
# restored from corrupted text - confirm both against the upstream file.
with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Document Generator") as demo:
    gr.Markdown("# 📄 Advanced Document Generator (PDF, DOCX, XLSX)")
    gr.Markdown("Upload Markdown files to generate documents in multiple formats. `# Headers` create columns in XLSX and page breaks in multi-page PDFs.")

    with gr.Row():
        # Left column: inputs and generation settings.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Settings")
            uploaded_files = gr.File(label="Upload Markdown & Image Files", file_count="multiple", file_types=[".md", ".png", ".jpg"])

            output_formats = gr.CheckboxGroup(choices=["PDF", "DOCX", "XLSX"], label="Select Output Formats", value=["PDF"])

            with gr.Accordion("PDF Customization", open=True):
                with gr.Row():
                    page_w_mult_slider = gr.Slider(label="Page Width Multiplier", minimum=1, maximum=5, step=1, value=1)
                    page_h_mult_slider = gr.Slider(label="Page Height Multiplier", minimum=1, maximum=2, step=1, value=1)
                num_columns_slider = gr.Slider(label="Text Columns", minimum=1, maximum=4, step=1, value=1)
                selected_layouts = gr.CheckboxGroup(choices=list(LAYOUTS.keys()), label="Base Page Layout", value=["A4 Portrait"])
                selected_fonts = gr.CheckboxGroup(choices=AVAILABLE_FONTS, label="Text Font", value=[AVAILABLE_FONTS[0]] if AVAILABLE_FONTS else [])

            generate_btn = gr.Button("🚀 Generate Documents", variant="primary")

        # Right column: previews, status message and downloads.
        with gr.Column(scale=2):
            gr.Markdown("### 🖼️ Output Files")
            gallery_output = gr.Gallery(label="PDF Previews", show_label=False, elem_id="gallery", columns=3, height="auto", object_fit="contain")
            log_output = gr.Markdown(label="Generation Log", value="Ready...")
            downloadable_files_output = gr.Files(label="Download Generated Files")

    # Input order must match the positional parameters of generate_outputs_api.
    generate_btn.click(
        fn=generate_outputs_api,
        inputs=[uploaded_files, output_formats, selected_layouts, selected_fonts, num_columns_slider, page_w_mult_slider, page_h_mult_slider],
        outputs=[gallery_output, log_output, downloadable_files_output],
    )

if __name__ == "__main__":
    demo.launch()
|