Spaces:

jkorstad
/

PDF-Parser

Runtime error

jkorstad committed on Feb 26

Commit

7fc9b26

verified ·

1 Parent(s): 7f7f7cb

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,34 +16,32 @@ def process_pdf(pdf_file):
  """
  # Check if a PDF file was uploaded
  if pdf_file is None:
-    return "<p>Please upload a PDF file.</p>"
  # Convert PDF to images
  try:
-    pages = convert_from_path(pdf_file.name)
-    except Exception as e:
-    return f"<p>Error converting PDF to images: {str(e)}</p>"
-    # Start building the HTML output
-    html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
-    # Process each page
-    for i, page in enumerate(pages):
-    # Convert the page image to base64 for embedding in HTML
-        buffered = io.BytesIO()
-        page.save(buffered, format="PNG")
-        img_str = base64.b64encode(buffered.getvalue()).decode()
-        img_data = f"data:image/png;base64,{img_str}"
  # Extract text from the page using the OCR model
  try:
-    inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
-    outputs = model.generate(**inputs)
-    text = processor.decode(outputs[0], skip_special_tokens=True)
-    except Exception as e:
-        text = f"Error extracting text: {str(e)}"
  # Generate HTML for this page's section
  textarea_id = f"text{i+1}"
@@ -89,4 +87,5 @@ with gr.Blocks(title="PDF Text Extractor") as demo:
  submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html)
 # Launch the interface
-demo.launch()

  """
  # Check if a PDF file was uploaded
  if pdf_file is None:
+ return "<p>Please upload a PDF file.</p>"
  # Convert PDF to images
  try:
+ pages = convert_from_path(pdf_file.name)
+ except Exception as e:
+ return f"<p>Error converting PDF to images: {str(e)}</p>"
+ # Start building the HTML output
+ html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
+ # Process each page
+ for i, page in enumerate(pages):
+ # Convert the page image to base64 for embedding in HTML
+ buffered = io.BytesIO()
+ page.save(buffered, format="PNG")
+ img_str = base64.b64encode(buffered.getvalue()).decode()
+ img_data = f"data:image/png;base64,{img_str}"
  # Extract text from the page using the OCR model
  try:
+ inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
+ outputs = model.generate(**inputs)
+ text = processor.decode(outputs[0], skip_special_tokens=True)
+ except Exception as e:
+ text = f"Error extracting text: {str(e)}"
  # Generate HTML for this page's section
  textarea_id = f"text{i+1}"
  submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html)
 # Launch the interface
+demo.launch()