|
import logging
import os
import re
import tempfile
import xml.etree.ElementTree as ET

import gradio as gr
import PyPDF2
|
|
|
# Matches every code point that the XML 1.0 "Char" production does NOT allow
# (most C0 control characters, lone surrogates, U+FFFE/U+FFFF).  ElementTree
# writes such characters out unescaped, which makes the file ill-formed.
_XML_ILLEGAL_CHARS = re.compile(
    "[^\t\n\r\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"
)


def pdf_to_xml(pdf_path, xml_path):
    """
    Convert a PDF file to an XML file by extracting text from each page.

    The output has a ``<document>`` root holding one ``<page number="N">``
    element per PDF page (numbered from 1), each containing one ``<line>``
    element per newline-separated line of extracted text.  Pages that yield
    no text, or whose extraction fails, are kept as empty ``<page>``
    elements so page numbers stay aligned with the PDF.

    Characters that are not legal in XML 1.0 are removed from the extracted
    text before it is stored, so the written file is always well-formed.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Destination handed to ``ElementTree.write``; the document
            is written UTF-8 encoded with an XML declaration.

    Raises:
        OSError: If the PDF cannot be opened or the XML cannot be written.
        Exception: Any error raised by ``PyPDF2.PdfReader`` while opening
            the document is propagated unchanged.
    """
    logger = logging.getLogger(__name__)
    root = ET.Element("document")

    # Pages are read while the PDF file handle is still open.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
            except Exception:
                # Best effort: one unreadable page must not abort the whole
                # conversion, but the failure should not vanish silently.
                logger.warning(
                    "Could not extract text from page %d of %s",
                    page_num,
                    pdf_path,
                    exc_info=True,
                )
                page_text = ""

            page_element = ET.SubElement(root, "page", number=str(page_num))
            if not page_text:
                continue

            clean_text = _XML_ILLEGAL_CHARS.sub("", page_text)
            for line in clean_text.split("\n"):
                ET.SubElement(page_element, "line").text = line

    ET.ElementTree(root).write(
        xml_path, encoding="utf-8", xml_declaration=True
    )
|
|
|
def pdf_to_xml_interface(pdf_file):
    """
    Gradio interface function that accepts an uploaded PDF file and returns the converted XML file.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input.  Either an
            object exposing the uploaded file's path as ``.name`` or the
            path itself as a string; ``None`` when nothing was uploaded.

    Returns:
        Path of a temporary ``.xml`` file containing the converted document.
        The file is created with ``delete=False`` and is not removed by this
        function on success, so it is still there when the caller reads it.

    Raises:
        gr.Error: If no file was uploaded or the conversion failed.  The
            output component is a file, so a returned message string would
            be treated as a file path; raising ``gr.Error`` is how the
            message is surfaced to the user instead.
    """
    if pdf_file is None:
        raise gr.Error("No file uploaded.")

    # Accept both a tempfile-like wrapper (path in ``.name``) and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Only used to reserve a unique output path; the handle is closed right
    # away and the file is rewritten by pdf_to_xml.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
        output_path = tmp.name

    try:
        pdf_to_xml(pdf_path, output_path)
    except Exception as e:
        # Do not leave the reserved temp file behind when conversion fails.
        try:
            os.remove(output_path)
        except OSError:
            pass  # Already gone or not removable; nothing more to do.
        raise gr.Error(f"An error occurred during conversion: {e}") from e

    return output_path
|
|
|
|
|
# Web UI definition: one file-upload input is passed to
# pdf_to_xml_interface, and its result is offered back as a file download.
iface = gr.Interface(
    fn=pdf_to_xml_interface,
    title="PDF to XML Extractor",
    description=(
        "Upload a PDF file to extract its text into a structured XML format."
    ),
    inputs=gr.File(label="Upload PDF File"),
    outputs=gr.File(label="Download XML File"),
)


# Launch the interface only when this file is run as a script, so that
# importing the module does not start it.
if __name__ == "__main__":
    iface.launch()
|
|