masadonline committed on
Commit
de2271c
·
verified ·
1 Parent(s): ea00c43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -11
app.py CHANGED
@@ -16,22 +16,42 @@ from pdfminer.layout import LAParams
16
  from twilio.base.exceptions import TwilioRestException # Add this at the top
17
  import pdfplumber
18
  import datetime
 
19
 
20
  APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
21
 
22
  os.environ["PYTORCH_JIT"] = "0"
 
23
  # --- PDF Extraction ---
24
- # --- PDF Extraction ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def extract_text_from_pdf(pdf_path):
26
  text_output = StringIO()
27
- tables = []
28
  try:
29
  with pdfplumber.open(pdf_path) as pdf:
30
  for page in pdf.pages:
31
  # Extract tables
32
- page_tables = page.extract_tables()
33
  if page_tables:
34
- tables.extend(page_tables)
35
  # Extract text
36
  text = page.extract_text()
37
  if text:
@@ -42,8 +62,7 @@ def extract_text_from_pdf(pdf_path):
42
  with open(pdf_path, 'rb') as file:
43
  extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text', codec=None)
44
  extracted_text = text_output.getvalue()
45
- formatted_tables = _format_tables_internal(tables)
46
- return f"{extracted_text}\n\n{formatted_tables}"
47
 
48
  def clean_extracted_text(text):
49
  lines = text.splitlines()
@@ -56,12 +75,16 @@ def clean_extracted_text(text):
56
  return '\n'.join(cleaned)
57
 
def _format_tables_internal(tables):
    """Render extracted tables as ``<table data>`` text blocks.

    Args:
        tables: Iterable of tables; each table is an iterable of rows and
            each row an iterable of cell values.

    Returns:
        One string in which every table is wrapped as
        ``<table data>\\n...\\n</table>``, rows are separated by newlines,
        cells by ``|``, and tables by a blank line. Returns ``""`` when
        *tables* is empty.
    """
    blocks = []
    for table in tables:
        # Cells (or whole rows) can be None when the extractor finds nothing;
        # str.join() would raise TypeError on them, so render them as empty.
        lines = [
            "|".join("" if cell is None else str(cell) for cell in (row or []))
            for row in table
        ]
        body = "\n".join(lines)
        blocks.append(f"<table data>\n{body}\n</table>")
    return "\n\n".join(blocks)
 
 
65
 
66
  # --- DOCX Extraction ---
67
  def extract_text_from_docx(docx_path):
 
16
  from twilio.base.exceptions import TwilioRestException # Add this at the top
17
  import pdfplumber
18
  import datetime
19
+ import csv
20
 
21
  APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
22
 
23
  os.environ["PYTORCH_JIT"] = "0"
24
+
25
  # --- PDF Extraction ---
def _normalise_row(row):
    """Return *row* with ``None`` cells replaced by ``""``; a falsy row becomes ``[""]``."""
    if not row:
        # Keep a placeholder instead of dropping the row so the table keeps
        # the same number of rows as the extractor reported.
        return [""]
    return ["" if cell is None else cell for cell in row]


def _extract_tables_from_page(page):
    """Return the tables found on a single PDF page with empty cells normalised.

    Args:
        page: Object exposing an ``extract_tables()`` method.

    Returns:
        A list of tables; each table is a list of rows and each row a list
        of cell values. ``None`` cells are replaced by ``""`` and a falsy
        row (``None`` or empty) is kept as the single-cell row ``[""]``.
        Returns ``[]`` when ``extract_tables()`` yields nothing.
    """
    tables = page.extract_tables()
    if not tables:
        return []
    return [[_normalise_row(row) for row in table] for table in tables]
44
+
45
  def extract_text_from_pdf(pdf_path):
46
  text_output = StringIO()
47
+ all_tables = []
48
  try:
49
  with pdfplumber.open(pdf_path) as pdf:
50
  for page in pdf.pages:
51
  # Extract tables
52
+ page_tables = _extract_tables_from_page(page)
53
  if page_tables:
54
+ all_tables.extend(page_tables)
55
  # Extract text
56
  text = page.extract_text()
57
  if text:
 
62
  with open(pdf_path, 'rb') as file:
63
  extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text', codec=None)
64
  extracted_text = text_output.getvalue()
65
+ return extracted_text, all_tables # Return text and list of tables
 
66
 
67
  def clean_extracted_text(text):
68
  lines = text.splitlines()
 
75
  return '\n'.join(cleaned)
76
 
def _format_tables_internal(tables):
    """Serialise extracted tables to CSV text, one CSV chunk per table.

    Args:
        tables: Iterable of tables; each table is an iterable of rows and
            each row an iterable of cell values.

    Returns:
        The CSV renderings of all tables joined by ``"\\n\\n"``. Every row,
        including the last one of each table, ends with ``"\\n"``. Returns
        ``""`` when *tables* is empty.
    """
    chunks = []
    for table in tables:
        with StringIO() as buffer:
            # csv.writer takes care of quoting commas and quotes inside
            # cells. Its default line terminator is "\r\n"; force "\n" so
            # the result does not mix carriage returns with the "\n\n"
            # separator used between tables.
            csv.writer(buffer, lineterminator="\n").writerows(table)
            chunks.append(buffer.getvalue())
    return "\n\n".join(chunks)
88
 
89
  # --- DOCX Extraction ---
90
  def extract_text_from_docx(docx_path):