"""Gradio app that extracts the text of a PDF into a simple XML document.

The generated XML has a ``<document>`` root, one ``<page number="N">`` child
per PDF page, and one ``<line>`` child per line of extracted text.
"""

import logging
import os
import re
import tempfile
import xml.etree.ElementTree as ET

import gradio as gr
import PyPDF2

logger = logging.getLogger(__name__)

# Matches every code point that is NOT allowed by the XML 1.0 ``Char``
# production (most C0 control characters, surrogates, U+FFFE/U+FFFF).
# ElementTree writes such characters verbatim, which would make the output
# file unparseable, so they are stripped before being stored in the tree.
_INVALID_XML_CHARS = re.compile(
    r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"
)


def pdf_to_xml(pdf_path, xml_path):
    """Convert a PDF file to an XML file by extracting text from each page.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Path the UTF-8 encoded XML document is written to.

    Every page produces a ``<page>`` element, even when no text could be
    extracted from it. A page whose extraction raises is logged and emitted
    empty so that one bad page does not abort the whole conversion.

    Raises:
        Whatever ``open`` or ``PyPDF2.PdfReader`` raise for a missing or
        unreadable PDF, and ``OSError`` if ``xml_path`` cannot be written.
    """
    root = ET.Element("document")

    # Pages are read from the open file handle, so the tree must be fully
    # built before the handle is closed.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                # Guard against a falsy/None result as well as exceptions.
                page_text = page.extract_text() or ""
            except Exception:
                # Deliberate best-effort: keep going, but leave a trace.
                logger.exception(
                    "Text extraction failed for page %d of %s",
                    page_num,
                    pdf_path,
                )
                page_text = ""

            page_element = ET.SubElement(root, "page", number=str(page_num))
            if not page_text:
                continue
            for line in page_text.split("\n"):
                line_element = ET.SubElement(page_element, "line")
                line_element.text = _INVALID_XML_CHARS.sub("", line)

    tree = ET.ElementTree(root)
    tree.write(xml_path, encoding="utf-8", xml_declaration=True)


def pdf_to_xml_interface(pdf_file):
    """Gradio callback: convert an uploaded PDF and return the XML file path.

    Args:
        pdf_file: The value Gradio passes for the ``gr.File`` input. It may be
            ``None`` (nothing uploaded), a path string, or an object exposing
            the path as ``.name``.

    Returns:
        Path of a temporary ``.xml`` file containing the extracted text. The
        file is created with ``delete=False`` so it still exists when Gradio
        serves it for download.

    Raises:
        gr.Error: If no file was uploaded or the conversion failed. The output
            component is a ``gr.File``, so a plain message string returned
            from here would be interpreted as a file path; raising lets the
            UI display the message instead.
    """
    if pdf_file is None:
        raise gr.Error("No file uploaded.")

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Only the unique name is needed here; pdf_to_xml reopens the path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
        output_path = tmp.name

    try:
        pdf_to_xml(pdf_path, output_path)
    except Exception as e:
        # Do not leave the orphaned temp file behind on failure.
        try:
            os.remove(output_path)
        except OSError:
            logger.debug("Could not remove temporary file %s", output_path)
        raise gr.Error(f"An error occurred during conversion: {e}") from e

    return output_path


# Define the Gradio interface.
iface = gr.Interface(
    fn=pdf_to_xml_interface,
    inputs=gr.File(label="Upload PDF File"),
    outputs=gr.File(label="Download XML File"),
    title="PDF to XML Extractor",
    description="Upload a PDF file to extract its text into a structured XML format.",
)

if __name__ == "__main__":
    # Launch the Gradio app.
    iface.launch()