File size: 2,067 Bytes
bf4ed2a
 
 
 
e550436
bf4ed2a
 
 
 
 
 
 
 
 
 
e550436
 
 
 
 
bf4ed2a
 
 
 
 
 
 
 
 
 
 
 
 
 
e550436
 
 
 
 
bf4ed2a
 
e550436
 
 
 
 
 
 
bf4ed2a
 
e550436
bf4ed2a
 
e550436
 
bf4ed2a
 
 
 
 
e550436
bf4ed2a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import logging
import os
import re
import tempfile
import xml.etree.ElementTree as ET

import gradio as gr
import PyPDF2

# Matches every code point outside the XML 1.0 ``Char`` production (most C0
# control characters, surrogates, U+FFFE/U+FFFF). ElementTree serialises such
# characters verbatim, which produces a file that XML parsers reject.
_INVALID_XML_CHARS = re.compile(
    r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"
)

_logger = logging.getLogger(__name__)


def _strip_invalid_xml_chars(text):
    """Return *text* with characters that are illegal in XML 1.0 removed."""
    return _INVALID_XML_CHARS.sub("", text)


def pdf_to_xml(pdf_path, xml_path):
    """Convert a PDF file to an XML file by extracting text from each page.

    The output has a ``<document>`` root containing one
    ``<page number="N">`` element per PDF page (numbered from 1). Each page
    holds one ``<line>`` element per newline-separated line of extracted
    text. A page whose text is empty, or whose extraction fails, yields an
    empty ``<page>`` element.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Path of the XML file to write (UTF-8, with an XML
            declaration).

    Raises:
        OSError: If ``pdf_path`` cannot be opened or ``xml_path`` cannot be
            written.
        Exception: Any error raised by ``PyPDF2.PdfReader`` while opening the
            document or enumerating its pages is propagated unchanged.
    """
    root = ET.Element("document")

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_num, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
            except Exception:
                # Deliberately best-effort: one unreadable page must not abort
                # the whole document, but the failure is recorded in the log
                # instead of being dropped silently.
                _logger.warning(
                    "Text extraction failed for page %d of %s",
                    page_num, pdf_path, exc_info=True,
                )
                page_text = ""

            page_element = ET.SubElement(root, "page", number=str(page_num))

            # ``page_text`` may be empty (or None); such pages stay empty.
            if not page_text:
                continue

            for line in page_text.split('\n'):
                line_element = ET.SubElement(page_element, "line")
                # Drop code points XML 1.0 forbids so the written file is
                # always well-formed and encodable as UTF-8.
                line_element.text = _strip_invalid_xml_chars(line)

    tree = ET.ElementTree(root)
    tree.write(xml_path, encoding="utf-8", xml_declaration=True)

def pdf_to_xml_interface(pdf_file):
    """Gradio callback: convert an uploaded PDF and return the XML file path.

    Args:
        pdf_file: The value supplied by the ``gr.File`` input. Either a path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``, or ``None`` when nothing was uploaded.

    Returns:
        Path of a newly created temporary ``.xml`` file holding the
        conversion result. The file is created with ``delete=False`` and is
        not removed by this function on success.

    Raises:
        gr.Error: If no file was uploaded or the conversion failed. The
            output component is a file, so a plain message string returned
            from here would be treated as a file path; raising ``gr.Error``
            is how the message is surfaced to the user instead.
    """
    if pdf_file is None:
        raise gr.Error("No file uploaded.")

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Reserve a unique output path; the handle is closed immediately so the
    # converter can reopen the path for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp:
        output_path = tmp.name

    try:
        pdf_to_xml(pdf_path, output_path)
    except Exception as e:
        # The temp file was created with delete=False; remove it so failed
        # conversions do not accumulate stray files.
        try:
            os.remove(output_path)
        except OSError:
            pass  # Cleanup is best-effort; the conversion error matters more.
        raise gr.Error(f"An error occurred during conversion: {e}") from e

    return output_path

# Web UI: one PDF upload component wired through the converter callback to
# one downloadable XML file component.
iface = gr.Interface(
    title="PDF to XML Extractor",
    description="Upload a PDF file to extract its text into a structured XML format.",
    fn=pdf_to_xml_interface,
    inputs=gr.File(label="Upload PDF File"),
    outputs=gr.File(label="Download XML File"),
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()