broadfield-dev committed on
Commit
90650e1
·
verified ·
1 Parent(s): 47dc1da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -15
app.py CHANGED
@@ -205,27 +205,20 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
205
 
206
  page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
207
 
208
- page_tables_md = ""
209
- tables = page.extract_tables()
210
- if tables:
211
- for table_idx, table_data in enumerate(tables):
212
- if table_data and len(table_data) > 0 and table_data[0] is not None and len(table_data[0]) > 0 :
213
- yield yield_message("status", {"message": f" Processing table {table_idx+1} on page {i+1}..."})
214
- header_cells = table_data[0]
215
- header = [" | ".join(str(cell) if cell is not None else "" for cell in header_cells)]
216
- separator = [" | ".join(["---"] * len(header_cells))]
217
- body = [" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data[1:]]
218
- table_md_lines = header + separator + body
219
- page_tables_md += f"**Table (Page {i+1}):**\n" + "\n".join(table_md_lines) + "\n\n"
220
 
221
  formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)
222
 
223
  yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
224
- if page_tables_md:
225
- yield yield_message("markdown_chunk", {"content": page_tables_md})
226
  time.sleep(0.01)
227
  except Exception as e:
228
- logger.error(f"Error during PDF text/table extraction: {str(e)}", exc_info=True)
229
  yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
230
 
231
  if not check_poppler():
 
205
 
206
  page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
207
 
208
+ # Removed table extraction logic here
209
+ # page_tables_md = "" # No longer needed
210
+ # tables = page.extract_tables() # No longer needed
211
+ # if tables: # No longer needed
212
+ # ... (table processing code removed) ...
 
 
 
 
 
 
 
213
 
214
  formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)
215
 
216
  yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
217
+ # if page_tables_md: # No longer needed, as page_tables_md is not created
218
+ # yield yield_message("markdown_chunk", {"content": page_tables_md})
219
  time.sleep(0.01)
220
  except Exception as e:
221
+ logger.error(f"Error during PDF text extraction: {str(e)}", exc_info=True) # Updated log message
222
  yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
223
 
224
  if not check_poppler():