Spaces:
Runtime error
Runtime error
Commit
·
bd0ff68
1
Parent(s):
d529654
update: ImageLoader
Browse files- .gitignore +2 -1
- medrag_multi_modal/document_loader/load_image.py +31 -16
.gitignore
CHANGED
|
@@ -6,4 +6,5 @@ cursor_prompt.txt
|
|
| 6 |
.ruff_cache/
|
| 7 |
test.py
|
| 8 |
**.pdf
|
| 9 |
-
images/
|
|
|
|
|
|
| 6 |
.ruff_cache/
|
| 7 |
test.py
|
| 8 |
**.pdf
|
| 9 |
+
images/
|
| 10 |
+
wandb/
|
medrag_multi_modal/document_loader/load_image.py
CHANGED
|
@@ -1,16 +1,19 @@
|
|
| 1 |
import asyncio
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import rich
|
| 4 |
import weave
|
| 5 |
from pdf2image.pdf2image import convert_from_path
|
| 6 |
from PIL import Image
|
| 7 |
|
|
|
|
| 8 |
from medrag_multi_modal.document_loader.load_text import TextLoader
|
| 9 |
|
| 10 |
|
| 11 |
class ImageLoader(TextLoader):
|
| 12 |
"""
|
| 13 |
-
ImageLoader is a class that extends the TextLoader class to handle the extraction and
|
| 14 |
loading of images from a PDF file.
|
| 15 |
|
| 16 |
This class provides functionality to convert specific pages of a PDF document into images
|
|
@@ -20,13 +23,13 @@ class ImageLoader(TextLoader):
|
|
| 20 |
```python
|
| 21 |
import asyncio
|
| 22 |
|
| 23 |
-
import
|
| 24 |
from dotenv import load_dotenv
|
| 25 |
|
| 26 |
from medrag_multi_modal.document_loader import ImageLoader
|
| 27 |
|
| 28 |
load_dotenv()
|
| 29 |
-
|
| 30 |
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
|
| 31 |
loader = ImageLoader(
|
| 32 |
url=url,
|
|
@@ -37,7 +40,7 @@ class ImageLoader(TextLoader):
|
|
| 37 |
loader.load_data(
|
| 38 |
start_page=31,
|
| 39 |
end_page=33,
|
| 40 |
-
|
| 41 |
)
|
| 42 |
)
|
| 43 |
```
|
|
@@ -59,7 +62,13 @@ class ImageLoader(TextLoader):
|
|
| 59 |
)[0]
|
| 60 |
return image
|
| 61 |
|
| 62 |
-
async def load_data(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
"""
|
| 64 |
Asynchronously loads images from a PDF file specified by a URL or local file path,
|
| 65 |
processes the images for the specified range of pages, and optionally publishes them
|
|
@@ -68,14 +77,15 @@ class ImageLoader(TextLoader):
|
|
| 68 |
This function reads the specified range of pages from a PDF document, converts each page
|
| 69 |
to an image using the `pdf2image` library, and returns a list of dictionaries containing
|
| 70 |
the image and metadata for each processed page. It processes pages concurrently using
|
| 71 |
-
`asyncio` for efficiency. If a
|
| 72 |
-
published to a Weave dataset
|
|
|
|
| 73 |
|
| 74 |
Args:
|
| 75 |
start_page (int): The starting page index (0-based) to process.
|
| 76 |
end_page (int): The ending page index (0-based) to process.
|
| 77 |
-
|
| 78 |
-
|
| 79 |
|
| 80 |
Returns:
|
| 81 |
list[dict]: A list of dictionaries, each containing the image and metadata for a
|
|
@@ -85,6 +95,7 @@ class ImageLoader(TextLoader):
|
|
| 85 |
ValueError: If the specified start_page or end_page is out of bounds of the document's
|
| 86 |
page count.
|
| 87 |
"""
|
|
|
|
| 88 |
start_page, end_page = self.get_page_indices(start_page, end_page)
|
| 89 |
pages = []
|
| 90 |
processed_pages_counter: int = 1
|
|
@@ -92,25 +103,29 @@ class ImageLoader(TextLoader):
|
|
| 92 |
|
| 93 |
async def process_page(page_idx):
|
| 94 |
nonlocal processed_pages_counter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
pages.append(
|
| 96 |
{
|
| 97 |
-
"image": convert_from_path(
|
| 98 |
-
self.document_file_path,
|
| 99 |
-
first_page=page_idx + 1,
|
| 100 |
-
last_page=page_idx + 1,
|
| 101 |
-
)[0],
|
| 102 |
"page_idx": page_idx,
|
| 103 |
"document_name": self.document_name,
|
| 104 |
"file_path": self.document_file_path,
|
| 105 |
"file_url": self.url,
|
| 106 |
}
|
| 107 |
)
|
|
|
|
| 108 |
rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
|
| 109 |
processed_pages_counter += 1
|
| 110 |
|
| 111 |
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
|
| 112 |
for task in asyncio.as_completed(tasks):
|
| 113 |
await task
|
| 114 |
-
if
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
| 116 |
return pages
|
|
|
|
| 1 |
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
from typing import Optional
|
| 4 |
|
| 5 |
import rich
|
| 6 |
import weave
|
| 7 |
from pdf2image.pdf2image import convert_from_path
|
| 8 |
from PIL import Image
|
| 9 |
|
| 10 |
+
import wandb
|
| 11 |
from medrag_multi_modal.document_loader.load_text import TextLoader
|
| 12 |
|
| 13 |
|
| 14 |
class ImageLoader(TextLoader):
|
| 15 |
"""
|
| 16 |
+
ImageLoader is a class that extends the `TextLoader` class to handle the extraction and
|
| 17 |
loading of images from a PDF file.
|
| 18 |
|
| 19 |
This class provides functionality to convert specific pages of a PDF document into images
|
|
|
|
| 23 |
```python
|
| 24 |
import asyncio
|
| 25 |
|
| 26 |
+
import wandb
|
| 27 |
from dotenv import load_dotenv
|
| 28 |
|
| 29 |
from medrag_multi_modal.document_loader import ImageLoader
|
| 30 |
|
| 31 |
load_dotenv()
|
| 32 |
+
wandb.init(project="medrag-multi-modal", entity="ml-colabs")
|
| 33 |
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
|
| 34 |
loader = ImageLoader(
|
| 35 |
url=url,
|
|
|
|
| 40 |
loader.load_data(
|
| 41 |
start_page=31,
|
| 42 |
end_page=33,
|
| 43 |
+
dataset_name="grays-anatomy-images",
|
| 44 |
)
|
| 45 |
)
|
| 46 |
```
|
|
|
|
| 62 |
)[0]
|
| 63 |
return image
|
| 64 |
|
| 65 |
+
async def load_data(
|
| 66 |
+
self,
|
| 67 |
+
start_page: int,
|
| 68 |
+
end_page: int,
|
| 69 |
+
image_save_dir: str = "./images",
|
| 70 |
+
dataset_name: Optional[str] = None,
|
| 71 |
+
):
|
| 72 |
"""
|
| 73 |
Asynchronously loads images from a PDF file specified by a URL or local file path,
|
| 74 |
processes the images for the specified range of pages, and optionally publishes them
|
|
|
|
| 77 |
This function reads the specified range of pages from a PDF document, converts each page
|
| 78 |
to an image using the `pdf2image` library, and returns a list of dictionaries containing
|
| 79 |
the image and metadata for each processed page. It processes pages concurrently using
|
| 80 |
+
`asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
|
| 81 |
+
published to a Weights & Biases artifact and the corresponding metadata to a Weave dataset
|
| 82 |
+
with the specified name.
|
| 83 |
|
| 84 |
Args:
|
| 85 |
start_page (int): The starting page index (0-based) to process.
|
| 86 |
end_page (int): The ending page index (0-based) to process.
|
| 87 |
+
dataset_name (Optional[str]): The name of the W&B artifact and Weave dataset to
|
| 88 |
+
publish the processed pages to. Defaults to None.
|
| 89 |
|
| 90 |
Returns:
|
| 91 |
list[dict]: A list of dictionaries, each containing the image and metadata for a
|
|
|
|
| 95 |
ValueError: If the specified start_page or end_page is out of bounds of the document's
|
| 96 |
page count.
|
| 97 |
"""
|
| 98 |
+
os.makedirs(image_save_dir, exist_ok=True)
|
| 99 |
start_page, end_page = self.get_page_indices(start_page, end_page)
|
| 100 |
pages = []
|
| 101 |
processed_pages_counter: int = 1
|
|
|
|
| 103 |
|
| 104 |
async def process_page(page_idx):
|
| 105 |
nonlocal processed_pages_counter
|
| 106 |
+
image = convert_from_path(
|
| 107 |
+
self.document_file_path,
|
| 108 |
+
first_page=page_idx + 1,
|
| 109 |
+
last_page=page_idx + 1,
|
| 110 |
+
)[0]
|
| 111 |
pages.append(
|
| 112 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
"page_idx": page_idx,
|
| 114 |
"document_name": self.document_name,
|
| 115 |
"file_path": self.document_file_path,
|
| 116 |
"file_url": self.url,
|
| 117 |
}
|
| 118 |
)
|
| 119 |
+
image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
|
| 120 |
rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
|
| 121 |
processed_pages_counter += 1
|
| 122 |
|
| 123 |
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
|
| 124 |
for task in asyncio.as_completed(tasks):
|
| 125 |
await task
|
| 126 |
+
if dataset_name:
|
| 127 |
+
artifact = wandb.Artifact(name=dataset_name, type="dataset")
|
| 128 |
+
artifact.add_dir(local_path=image_save_dir)
|
| 129 |
+
artifact.save()
|
| 130 |
+
weave.publish(weave.Dataset(name=dataset_name, rows=pages))
|
| 131 |
return pages
|