Spaces:
Running
Running
Update chroma_operations/pdf_processing.py
Browse files
chroma_operations/pdf_processing.py
CHANGED
@@ -1,52 +1,92 @@
|
|
1 |
-
import pdfplumber
|
2 |
-
import logging
|
3 |
-
from typing import List, Union, Tuple
|
4 |
-
import os
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
logging
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
return []
|
|
|
1 |
+
import pdfplumber
|
2 |
+
import logging
|
3 |
+
from typing import List, Union, Tuple
|
4 |
+
import os
|
5 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
6 |
+
|
7 |
+
|
8 |
+
# Configure logging at import time: basicConfig requests INFO level on the
# root logger, and `logger` is this module's logger (named via __name__).
|
9 |
+
logging.basicConfig(level=logging.INFO)
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
|
13 |
+
def extract_page_content(args) -> Union[str, None]:
    """
    Extract the tables and text of a single PDF page as one string.

    The PDF is opened inside this call rather than shared, so every worker
    thread operates on its own pdfplumber document object.

    Args:
        args: A ``(pdf_path, page_number)`` tuple, where ``page_number`` is
            the zero-based index into ``pdf.pages``.

    Returns:
        The page content: each non-empty table rendered as a
        ``[TABLE]...[/TABLE]`` block (one row per line, cells separated by
        tabs), followed by the stripped page text, all joined by newlines.
        ``None`` if the page yields no content or if any error occurs; errors
        are logged, never raised.
    """
    pdf_path, page_number = args
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number]

            content = []

            # Tables come first in the output, ahead of the page text.
            for table in page.extract_tables():
                if not table:
                    continue
                # Render every row of the table (the row loop was previously
                # missing, which raised NameError for any page with a table).
                # Empty cells (None) become empty strings.
                table_str = "\n".join(
                    "\t".join("" if cell is None else str(cell) for cell in row)
                    for row in table
                )
                content.append(f"[TABLE]\n{table_str}\n[/TABLE]")

            text = page.extract_text()
            if text and text.strip():
                content.append(text.strip())

            return "\n".join(content) if content else None

    except Exception as e:
        # Best-effort: a failing page is logged and skipped by the caller.
        logger.error(f"Error processing page {page_number} of {pdf_path}: {str(e)}")
        return None
|
47 |
+
|
48 |
+
|
49 |
+
def extract_pdf_content_parallel(pdf_path: str, max_workers: int = 4) -> List[str]:
    """
    Extract all pages of a PDF in parallel using threads.

    Each page is handed to ``extract_page_content`` in a thread pool. Pages
    that produce no content (or fail) are omitted; the remaining chunks are
    returned in ascending page order regardless of which thread finished
    first.

    Args:
        pdf_path (str): Path to the PDF file.
        max_workers (int): Number of threads to use.

    Returns:
        List[str]: Extracted content chunks, one per page that yielded
        content, ordered by page index. An empty list if the file does not
        exist or an error occurs while opening or processing it (the error
        is logged).
    """
    if not os.path.exists(pdf_path):
        logger.error(f"PDF file not found: {pdf_path}")
        return []

    try:
        # Only the page count is needed here; workers reopen the file
        # themselves, so the handle can be released before fanning out.
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
        logger.info(f"Processing {total_pages} pages from {pdf_path} in parallel.")

        page_args = [(pdf_path, i) for i in range(total_pages)]

        # Keyed by page index: as_completed yields futures in completion
        # order, so results must be re-ordered before returning.
        results_by_page = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_page = {
                executor.submit(extract_page_content, args): args[1]
                for args in page_args
            }

            for future in as_completed(future_to_page):
                page_num = future_to_page[future]
                try:
                    result = future.result()
                except Exception as exc:
                    logger.error(f"Page {page_num} generated an exception: {exc}")
                    continue
                if result:
                    results_by_page[page_num] = result

        # Maintain page order based on index
        return [results_by_page[page_num] for page_num in sorted(results_by_page)]

    except Exception as e:
        logger.error(f"Error opening {pdf_path}: {str(e)}")
        return []
|