Spaces:
Sleeping
Sleeping
Update README.md
Browse files
README.md
CHANGED
@@ -315,4 +315,380 @@ Let the research journey continue! May your OCR be accurate, your layouts make s
|
|
315 |
* [PubMed Central (PMC)](https://www.ncbi.nlm.nih.gov/pmc/): Biomedical literature resource.
|
316 |
* Specific papers cited in Section V.
|
317 |
* Surveys on Document AI, Layout Analysis, NER, Table Extraction, Clinical NLP.
|
318 |
-
* Blogs and documentation for tools like LayoutLM, Donut, GROBID, Tesseract, PyMuPDF.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
* [PubMed Central (PMC)](https://www.ncbi.nlm.nih.gov/pmc/): Biomedical literature resource.
|
316 |
* Specific papers cited in Section V.
|
317 |
* Surveys on Document AI, Layout Analysis, NER, Table Extraction, Clinical NLP.
|
318 |
+
* Blogs and documentation for tools like LayoutLM, Donut, GROBID, Tesseract, PyMuPDF.
|
319 |
+
|
320 |
+
|
321 |
+
|
322 |
+
```html
|
323 |
+
|
324 |
+
|
325 |
+
I need some help with this base capability. 1. Annotate all functions and reorder them so they make sense from an object-oriented standpoint. Add emoji-led comments with wise, witty rhymes and wisdom, in short feature-embellishing instructions on how to use the functions, what the main features we pass around are, and an emoji for each of those, specifically the md asset and the pdf asset. I also want a deeper definition of what we do with pages and fonts, since I am having difficulty getting emojis to show in the PDF. Fix that if you can. import streamlit as st
|
326 |
+
|
327 |
+
from pathlib import Path
|
328 |
+
|
329 |
+
import base64
|
330 |
+
|
331 |
+
import datetime
|
332 |
+
|
333 |
+
import re
|
334 |
+
|
335 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
|
336 |
+
|
337 |
+
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
338 |
+
|
339 |
+
from reportlab.lib.pagesizes import letter, A4, legal, landscape
|
340 |
+
|
341 |
+
from reportlab.lib.units import inch
|
342 |
+
|
343 |
+
from reportlab.lib import colors
|
344 |
+
|
345 |
+
|
346 |
+
|
347 |
+
# --- Configuration & Setup ---
|
348 |
+
|
349 |
+
|
350 |
+
|
351 |
+
# Page layouts offered by the app, keyed by a human-readable name.
# Each entry stores the ReportLab page size under 'size' as a (width, height)
# tuple and a display icon under 'icon'. Every paper format is listed in
# portrait first, then in landscape.
LAYOUTS = {
    f"{paper} {orientation}": {
        "size": dimensions if orientation == "Portrait" else landscape(dimensions),
        "icon": "📄",
    }
    for paper, dimensions in (("A4", A4), ("Letter", letter), ("Legal", legal))
    for orientation in ("Portrait", "Landscape")
}

# Folder that receives every generated PDF; created on start-up if missing.
OUTPUT_DIR = Path("generated_pdfs")
OUTPUT_DIR.mkdir(exist_ok=True)
|
378 |
+
|
379 |
+
|
380 |
+
|
381 |
+
# --- ReportLab PDF Generation ---
|
382 |
+
|
383 |
+
|
384 |
+
|
385 |
+
def markdown_to_story(markdown_text: str):
    """Convert a Markdown string into a list of ReportLab flowables (a "story").

    This is a small line-based parser, not a full Markdown implementation.
    It recognises:

    * fenced code blocks (lines starting with three backticks),
    * ``#``, ``##`` and ``###`` headings,
    * ``* `` / ``- `` bullet items and ``1. `` style numbered items,
    * blank lines (rendered as vertical space),
    * ``**bold**`` and ``_italic_`` inline markup.

    Args:
        markdown_text: The raw Markdown source.

    Returns:
        A list of ``Paragraph`` / ``Spacer`` flowables, in document order.

    NOTE(review): the styles come unmodified from ``getSampleStyleSheet()``;
    their fonts presumably have no emoji glyphs, which would explain emoji
    not showing up in the PDF. Rendering them would need a TrueType font with
    emoji coverage registered and assigned to these styles -- confirm against
    the ReportLab font documentation.
    """
    import html  # local import so the block does not depend on a file-level edit

    styles = getSampleStyleSheet()
    style_normal = styles['BodyText']
    style_h1 = styles['h1']
    style_h2 = styles['h2']
    style_h3 = styles['h3']
    style_code = styles['Code']

    def inline_markup(text):
        # Translate **bold** and _italic_ into Paragraph markup tags.
        text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
        # Only treat underscores at word boundaries as emphasis so that
        # identifiers such as my_var_name are left untouched.
        return re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'<i>\1</i>', text)

    story = []
    code_lines = []
    in_code_block = False

    def flush_code():
        # Emit the collected code lines as a single paragraph and reset.
        story.append(Paragraph('<br/>'.join(code_lines), style_code))
        code_lines.clear()

    for line in markdown_text.split('\n'):
        stripped = line.strip()

        # A fence line toggles code mode; closing a fence emits the block.
        if stripped.startswith("```"):
            if in_code_block:
                flush_code()
            in_code_block = not in_code_block
            continue

        if in_code_block:
            # Code is literal text: escape &, < and > so the Paragraph
            # markup parser does not try to interpret them as tags.
            code_lines.append(html.escape(line, quote=False))
            continue

        if line.startswith("# "):
            story.append(Paragraph(inline_markup(line[2:]), style_h1))
        elif line.startswith("## "):
            story.append(Paragraph(inline_markup(line[3:]), style_h2))
        elif line.startswith("### "):
            story.append(Paragraph(inline_markup(line[4:]), style_h3))
        elif stripped.startswith(("* ", "- ")):
            # The bullet glyph comes from bulletText only; also prefixing the
            # text with one would render two bullets.
            story.append(Paragraph(inline_markup(stripped[2:]), style_normal, bulletText='•'))
        elif re.match(r'^\d+\.\s', stripped):
            # Numbered items keep their own "N." prefix as plain text.
            story.append(Paragraph(inline_markup(stripped), style_normal))
        elif stripped == "":
            story.append(Spacer(1, 0.2 * inch))
        else:
            story.append(Paragraph(inline_markup(line), style_normal))

    # A fence that was never closed would otherwise lose its content.
    if in_code_block:
        flush_code()

    return story
|
494 |
+
|
495 |
+
|
496 |
+
|
497 |
+
def create_pdf_with_reportlab(md_path: Path, layout_name: str, layout_properties: dict):
    """Render one Markdown file to a PDF in the given layout.

    The PDF is written into ``OUTPUT_DIR`` and named
    ``<markdown stem>_<layout name with dashes>_<YYYY-MM-DD>.pdf``.

    Args:
        md_path: Path of the Markdown source file (read as UTF-8).
        layout_name: Layout label; spaces become dashes in the file name.
        layout_properties: Layout settings; its "size" entry is used as the
            page size, falling back to A4 when absent.

    Returns:
        None. Any failure is reported through ``st.error`` instead of being
        raised to the caller.
    """
    try:
        source_text = md_path.read_text(encoding="utf-8")

        # Build the destination path from the file stem, layout and date.
        layout_slug = layout_name.replace(' ', '-')
        stamp = datetime.datetime.now().strftime("%Y-%m-%d")
        target = OUTPUT_DIR / f"{md_path.stem}_{layout_slug}_{stamp}.pdf"

        # One-inch margin on every side of the page.
        margins = {
            side: inch
            for side in ("rightMargin", "leftMargin", "topMargin", "bottomMargin")
        }
        document = SimpleDocTemplate(
            str(target),
            pagesize=layout_properties.get("size", A4),
            **margins,
        )

        document.build(markdown_to_story(source_text))

    except Exception as exc:
        st.error(f"Failed to process {md_path.name} with ReportLab: {exc}")
|
544 |
+
|
545 |
+
|
546 |
+
|
547 |
+
|
548 |
+
|
549 |
+
# --- Streamlit UI and File Handling (Mostly Unchanged) ---
|
550 |
+
|
551 |
+
|
552 |
+
|
553 |
+
def get_file_download_link(file_path: Path) -> str:
    """Return an HTML ``<a>`` tag that downloads ``file_path`` via a data URI.

    The whole file is read into memory and embedded base64-encoded in the
    link's ``href``, so the link grows with the file size.

    Args:
        file_path: Path of the file to offer for download.

    Returns:
        The anchor markup as a string; the link text is "Download" and the
        ``download`` attribute carries the file's name.

    Raises:
        OSError: If the file cannot be read.
    """
    import html  # local import so the block does not depend on a file-level edit

    path = Path(file_path)
    encoded = base64.b64encode(path.read_bytes()).decode()
    # The name lands inside a double-quoted attribute: escape it so quotes,
    # ampersands or angle brackets in a file name cannot break the markup.
    safe_name = html.escape(path.name, quote=True)
    return f'<a href="data:application/octet-stream;base64,{encoded}" download="{safe_name}">Download</a>'
|
562 |
+
|
563 |
+
|
564 |
+
|
565 |
+
def display_file_explorer():
    """Draw the file explorer: Markdown sources first, generated PDFs second.

    Each section lists one row per file with the file name on the left and a
    download link on the right, or an informational message when the section
    has no files. PDFs are listed most recently modified first.
    """

    def _render_rows(files, icon, empty_message):
        # Show the hint for an empty section, otherwise one two-column row
        # (name | download link) per file.
        if not files:
            st.info(empty_message)
            return
        for entry in files:
            name_col, link_col = st.columns([0.8, 0.2])
            with name_col:
                st.write(f"{icon} `{entry.name}`")
            with link_col:
                st.markdown(get_file_download_link(entry), unsafe_allow_html=True)

    st.header("📂 File Explorer")

    st.subheader("Source Markdown Files (.md)")
    _render_rows(
        list(Path(".").glob("*.md")),
        "📝",
        "No Markdown files found. Create a `.md` file to begin.",
    )

    st.subheader("Generated PDF Files")
    newest_first = sorted(
        OUTPUT_DIR.glob("*.pdf"),
        key=lambda pdf: pdf.stat().st_mtime,
        reverse=True,
    )
    _render_rows(
        newest_first,
        "📄",
        "No PDFs generated yet. Click the button above.",
    )
|
618 |
+
|
619 |
+
|
620 |
+
|
621 |
+
|
622 |
+
|
623 |
+
# --- Main App ---
|
624 |
+
|
625 |
+
|
626 |
+
|
627 |
+
# Page chrome: must be configured before any other Streamlit output.
st.set_page_config(layout="wide", page_title="PDF Generator")

st.title("📄 Markdown to PDF Generator (ReportLab Engine)")
st.markdown("This tool finds all `.md` files in this directory, converts them to PDF in various layouts, and provides download links. It uses the `ReportLab` library and requires no external dependencies.")

# First-run convenience: seed the directory with a sample document so the
# app has something to convert, then restart the script to pick it up.
if not any(Path(".").glob("*.md")):
    with open("sample.md", "w", encoding="utf-8") as f:
        f.write("# Sample Document\n\nThis is a sample markdown file. **ReportLab** is now creating the PDF.\n\n### Features\n- Item 1\n- Item 2\n\n1. Numbered item\n2. Another one\n\n```\ndef hello():\n print(\"Hello, PDF!\")\n```\n")
    st.rerun()

if st.button("🚀 Generate PDFs from all Markdown Files", type="primary"):
    markdown_files = list(Path(".").glob("*.md"))

    if not markdown_files:
        st.warning("No `.md` files found. Please add a markdown file to the directory.")
    else:
        # One PDF per (markdown file, layout) pair drives the progress bar.
        total_pdfs = len(markdown_files) * len(LAYOUTS)
        progress_bar = st.progress(0)
        pdf_count = 0

        with st.spinner("Generating PDFs using ReportLab..."):
            for md_file in markdown_files:
                st.info(f"Processing: **{md_file.name}**")
                for name, properties in LAYOUTS.items():
                    st.write(f" - Generating `{name}` format...")
                    create_pdf_with_reportlab(md_file, name, properties)
                    pdf_count += 1
                    progress_bar.progress(pdf_count / total_pdfs)

        # No st.rerun() here: rerunning would abort this script run and wipe
        # the success message and progress output before the user sees them.
        # The explorer below is rendered in this same run, after generation,
        # so it already lists the new PDFs.
        st.success("✅ PDF generation complete!")

display_file_explorer()
|
692 |
+
|
693 |
+
|
694 |
+
```
|