baderanas committed
Commit f7c371e · verified · 1 Parent(s): fced55b

Update chroma_operations/pdf_processing.py

Files changed (1)
  1. chroma_operations/pdf_processing.py +91 -51
chroma_operations/pdf_processing.py CHANGED
@@ -1,52 +1,92 @@
-import pdfplumber
-import logging
-from typing import List, Union, Tuple
-import os
-
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def extract_pdf_content(pdf_path: str) -> List[str]:
-    """
-    Extract text and tables from PDF in their natural reading order.
-    Simplified version without positional processing.
-
-    Args:
-        pdf_path (str): Path to the PDF file
-
-    Returns:
-        List[str]: List of extracted content chunks (text and tables)
-    """
-    if not os.path.exists(pdf_path):
-        logger.error(f"PDF file not found: {pdf_path}")
-        return []
-
-    try:
-        with pdfplumber.open(pdf_path) as pdf:
-            content = []
-
-            for page in pdf.pages:
-                # First extract tables
-                tables = page.extract_tables()
-                for table in tables:
-                    if table:
-                        # Convert table to string representation
-                        table_str = "\n".join(
-                            ["\t".join(str(cell) for cell in row) for row in table]
-                        )
-                        content.append(f"[TABLE]\n{table_str}\n[/TABLE]")
-
-                # Then extract regular text
-                text = page.extract_text()
-                if text and text.strip():
-                    content.append(text.strip())
-
-            logger.info(f"Successfully extracted content from {pdf_path}")
-            return content
-
-    except Exception as e:
-        logger.error(f"Error processing {pdf_path}: {str(e)}")
+import pdfplumber
+import logging
+from typing import List, Union, Tuple
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def extract_page_content(args) -> Union[str, None]:
+    """
+    Extract content from a specific page number of a PDF.
+    Opens the PDF file independently for thread safety.
+    """
+    pdf_path, page_number = args
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            page = pdf.pages[page_number]
+
+            # Extract tables
+            tables = page.extract_tables()
+            table_strings = []
+            for table in tables:
+                if table:
+                    table_str = "\n".join(
+                        ["\t".join(str(cell) if cell is not None else "" for cell in row) for row in table]
+                    )
+                    table_strings.append(f"[TABLE]\n{table_str}\n[/TABLE]")
+
+            # Extract text
+            text = page.extract_text()
+            content = []
+
+            if table_strings:
+                content.extend(table_strings)
+            if text and text.strip():
+                content.append(text.strip())
+
+            return "\n".join(content) if content else None
+
+    except Exception as e:
+        logger.error(f"Error processing page {page_number} of {pdf_path}: {str(e)}")
+        return None
+
+
+def extract_pdf_content_parallel(pdf_path: str, max_workers: int = 4) -> List[str]:
+    """
+    Extract all pages of a PDF in parallel using threads.
+
+    Args:
+        pdf_path (str): Path to the PDF file
+        max_workers (int): Number of threads to use
+
+    Returns:
+        List[str]: List of extracted content chunks (per page)
+    """
+    if not os.path.exists(pdf_path):
+        logger.error(f"PDF file not found: {pdf_path}")
+        return []
+
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            total_pages = len(pdf.pages)
+        logger.info(f"Processing {total_pages} pages from {pdf_path} in parallel.")
+
+        page_args = [(pdf_path, i) for i in range(total_pages)]
+
+        results = []
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_page = {
+                executor.submit(extract_page_content, args): args[1]
+                for args in page_args
+            }
+
+            for future in as_completed(future_to_page):
+                page_num = future_to_page[future]
+                try:
+                    result = future.result()
+                    if result:
+                        results.append((page_num, result))
+                except Exception as exc:
+                    logger.error(f"Page {page_num} generated an exception: {exc}")
+
+        # Maintain page order based on index
+        return [content for _, content in sorted(results)]
+
+    except Exception as e:
+        logger.error(f"Error opening {pdf_path}: {str(e)}")
         return []
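
For reference, a minimal usage sketch of the new parallel extractor as a caller might invoke it. The __main__ guard, the "sample.pdf" path, and the worker count are placeholders for illustration and are not part of this commit:

# Assumed caller of extract_pdf_content_parallel; path and worker count are placeholders
from chroma_operations.pdf_processing import extract_pdf_content_parallel

if __name__ == "__main__":
    chunks = extract_pdf_content_parallel("sample.pdf", max_workers=4)
    print(f"Extracted {len(chunks)} content chunks")
    for chunk in chunks[:3]:
        # Preview the first few chunks; tables are wrapped in [TABLE]...[/TABLE] markers
        print(chunk[:200])

Because extract_page_content reopens the PDF inside each worker, the threads never share a pdfplumber handle, which is what makes the per-page fan-out safe.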