File size: 2,067 Bytes
bf4ed2a e550436 bf4ed2a e550436 bf4ed2a e550436 bf4ed2a e550436 bf4ed2a e550436 bf4ed2a e550436 bf4ed2a e550436 bf4ed2a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import logging
import os
import re
import tempfile
import xml.etree.ElementTree as ET

import gradio as gr
import PyPDF2
# Characters XML 1.0 forbids in a document: C0 controls other than tab, line
# feed and carriage return, lone surrogates, and the U+FFFE/U+FFFF
# non-characters. ElementTree does not filter them, so text extracted from a
# PDF must be cleaned before it is stored in the tree.
_XML_ILLEGAL_CHARS_RE = re.compile(
    r"[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]"
)


def _strip_illegal_xml_chars(text):
    """Return *text* with every character XML 1.0 disallows removed."""
    return _XML_ILLEGAL_CHARS_RE.sub("", text)


def pdf_to_xml(pdf_path, xml_path):
    """
    Convert a PDF file to an XML file by extracting text from each page.

    The output has a ``<document>`` root holding one ``<page number="N">``
    element per PDF page (numbered from 1, in page order). Each page holds
    one ``<line>`` element per newline-separated line of extracted text; a
    page that yields no text produces an empty ``<page>`` element.

    Characters that are not allowed in XML 1.0 are removed from the
    extracted text so that the written file can be parsed again.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Path the UTF-8 encoded XML document (with XML declaration)
            is written to.

    Raises:
        OSError: If ``pdf_path`` cannot be opened or ``xml_path`` cannot be
            written.
        Exception: Anything raised while ``PyPDF2.PdfReader`` opens the file
            or enumerates its pages propagates to the caller; only per-page
            text extraction failures are handled here.
    """
    root = ET.Element("document")

    # The reader pulls page data from the open file handle, so all page
    # access stays inside the ``with`` block.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
            except Exception:
                # Best effort: one unreadable page must not abort the whole
                # document, but the failure should not vanish silently.
                logging.getLogger(__name__).warning(
                    "Text extraction failed for page %d of %s",
                    page_num,
                    pdf_path,
                    exc_info=True,
                )
                page_text = ""

            page_element = ET.SubElement(root, "page", number=str(page_num))
            if not page_text:
                continue
            for line in page_text.split("\n"):
                line_element = ET.SubElement(page_element, "line")
                line_element.text = _strip_illegal_xml_chars(line)

    ET.ElementTree(root).write(xml_path, encoding="utf-8", xml_declaration=True)
def pdf_to_xml_interface(pdf_file):
    """
    Gradio interface function that accepts an uploaded PDF file and returns
    the converted XML file.

    Args:
        pdf_file: The uploaded file. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``name`` attribute is the
            path of the uploaded file. ``None`` means nothing was uploaded.

    Returns:
        The path of a newly created temporary ``.xml`` file on success. The
        file is not deleted by this function. If no file was uploaded or the
        conversion fails, a human-readable message string is returned
        instead.
    """
    # Handle case when no file is uploaded.
    # NOTE(review): the interface wires this function to a gr.File output, so
    # a message string is presumably treated as a file path there; consider
    # raising gr.Error instead once the target Gradio version is confirmed.
    if pdf_file is None:
        return "No file uploaded."

    # Accept a plain path as well as an upload wrapper exposing ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Reserve a temporary file for the output XML. delete=False keeps it on
    # disk after the handle is closed so it can be returned to the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
        output_path = tmp.name

    try:
        pdf_to_xml(pdf_path, output_path)
    except Exception as e:
        logging.getLogger(__name__).exception(
            "PDF to XML conversion failed for %s", pdf_path
        )
        # Do not leak the reserved temp file when nothing useful was written.
        try:
            os.remove(output_path)
        except OSError:
            pass
        return f"An error occurred during conversion: {e}"
    return output_path
# Build the web UI: a single PDF upload as input and a single XML file
# download as output, both handled by pdf_to_xml_interface.
iface = gr.Interface(
    title="PDF to XML Extractor",
    description="Upload a PDF file to extract its text into a structured XML format.",
    fn=pdf_to_xml_interface,
    inputs=gr.File(label="Upload PDF File"),
    outputs=gr.File(label="Download XML File"),
)


# Start the Gradio app only when this file is run as a script, so importing
# the module does not launch it.
if __name__ == "__main__":
    iface.launch()
|