Spaces:

baderanas
/

rag-medical

Running

App Files Files Community

baderanas committed 21 days ago

Commit

f7c371e

verified ·

1 Parent(s): fced55b

Update chroma_operations/pdf_processing.py

Browse files

Files changed (1) hide show

chroma_operations/pdf_processing.py +91 -51

chroma_operations/pdf_processing.py CHANGED Viewed

@@ -1,52 +1,92 @@
-import pdfplumber
-import logging
-from typing import List, Union, Tuple
-import os
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-def extract_pdf_content(pdf_path: str) -> List[str]:
-    """
-    Extract the tables and text of every page of a PDF, sequentially.
-
-    Pages are visited in document order. For each page, all of its tables
-    are emitted first (one chunk per table), followed by one chunk holding
-    the page's text; tables are therefore not interleaved with the text at
-    their position on the page.
-
-    Args:
-        pdf_path (str): Path to the PDF file
-    Returns:
-        List[str]: List of extracted content chunks (text and tables).
-        Empty if the file does not exist or if any error is raised while
-        processing it (chunks gathered before the error are discarded).
-    """
-    # Missing file: log and return an empty result instead of raising.
-    if not os.path.exists(pdf_path):
-        logger.error(f"PDF file not found: {pdf_path}")
-        return []
-    try:
-        with pdfplumber.open(pdf_path) as pdf:
-            content = []
-            for page in pdf.pages:
-                # First extract tables
-                tables = page.extract_tables()
-                for table in tables:
-                    if table:
-                        # Convert table to string representation:
-                        # one line per row, cells separated by tabs.
-                        # NOTE(review): str(cell) turns a None cell into the
-                        # literal text "None".
-                        table_str = "\n".join(
-                            ["\t".join(str(cell) for cell in row) for row in table]
-                        )
-                        content.append(f"[TABLE]\n{table_str}\n[/TABLE]")
-                # Then extract regular text
-                text = page.extract_text()
-                # Skip pages whose text is missing or whitespace-only.
-                if text and text.strip():
-                    content.append(text.strip())
-            logger.info(f"Successfully extracted content from {pdf_path}")
-            return content
-    except Exception as e:
-        # Any failure is logged and reported to the caller as "no content".
-        logger.error(f"Error processing {pdf_path}: {str(e)}")
         return []

+import pdfplumber
+import logging
+from typing import List, Union, Tuple
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def extract_page_content(args) -> Union[str, None]:
+    """
+    Extract the tables and text of a single PDF page as one string.
+
+    The PDF is opened anew on every call so that no pdfplumber object is
+    shared between the threads that call this function.
+
+    Args:
+        args: A ``(pdf_path, page_number)`` tuple, where ``page_number`` is
+            the 0-based index into ``pdf.pages``.
+    Returns:
+        Union[str, None]: The page's tables (each wrapped in
+        ``[TABLE]`` / ``[/TABLE]`` markers) followed by its stripped text,
+        joined with newlines. ``None`` if the page yields no content or if
+        extraction raises.
+    """
+    pdf_path, page_number = args
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            page = pdf.pages[page_number]
+            content = []
+            # Extract tables
+            for table in page.extract_tables():
+                if not table:
+                    continue
+                # One line per row, cells separated by tabs. The join must
+                # iterate over the rows of the table; a None cell becomes ""
+                # instead of the literal text "None".
+                table_str = "\n".join(
+                    "\t".join("" if cell is None else str(cell) for cell in row)
+                    for row in table
+                )
+                content.append(f"[TABLE]\n{table_str}\n[/TABLE]")
+            # Extract text
+            text = page.extract_text()
+            if text and text.strip():
+                content.append(text.strip())
+            return "\n".join(content) if content else None
+    except Exception as e:
+        # Best effort: a failing page is logged and reported as None so the
+        # remaining pages can still be processed.
+        logger.error(f"Error processing page {page_number} of {pdf_path}: {str(e)}")
+        return None
+def extract_pdf_content_parallel(pdf_path: str, max_workers: int = 4) -> List[str]:
+    """
+    Extract all pages of a PDF in parallel using threads.
+
+    Each page is handed to ``extract_page_content`` on a thread pool. The
+    returned chunks are in page order, regardless of the order in which the
+    worker threads finish.
+
+    Args:
+        pdf_path (str): Path to the PDF file
+        max_workers (int): Number of threads to use
+    Returns:
+        List[str]: List of extracted content chunks (per page), in page
+        order. Pages that yield no content or fail are omitted. Empty if
+        the file does not exist or cannot be opened.
+    """
+    if not os.path.exists(pdf_path):
+        logger.error(f"PDF file not found: {pdf_path}")
+        return []
+    try:
+        # Open the file here only to count its pages; every worker reopens
+        # it on its own.
+        with pdfplumber.open(pdf_path) as pdf:
+            total_pages = len(pdf.pages)
+        logger.info(f"Processing {total_pages} pages from {pdf_path} in parallel.")
+        # as_completed() yields futures in completion order, so results are
+        # stored under their page index and put back in order at the end.
+        results_by_page = {}
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_page = {
+                executor.submit(extract_page_content, (pdf_path, page_num)): page_num
+                for page_num in range(total_pages)
+            }
+            for future in as_completed(future_to_page):
+                page_num = future_to_page[future]
+                try:
+                    result = future.result()
+                    if result:
+                        results_by_page[page_num] = result
+                except Exception as exc:
+                    logger.error(f"Page {page_num} generated an exception: {exc}")
+        # Maintain page order based on index
+        return [results_by_page[page_num] for page_num in sorted(results_by_page)]
+    except Exception as e:
+        logger.error(f"Error opening {pdf_path}: {str(e)}")
         return []