Spaces:

mckabue
/

document-similarity-matching-using-visual-layout-features-archive

Build error

App Files Files Community

Charles Kabui committed on Mar 14, 2024

Commit

0da14c5

1 Parent(s): c33e07b

replaced pdf2image with PyMuPDF

Browse files

Files changed (6) hide show

analysis.ipynb +0 -0
app.py +0 -12
main.py +92 -65
requirements.txt +10 -0
utils/get_RGB_image.py +2 -1
utils/visualize_bboxes_on_image.py +129 -100

analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

app.py CHANGED Viewed

@@ -1,15 +1,3 @@
-import os
-os.system("apt install -y poppler-utils")
-os.system("python -m pip install --upgrade pip")
-os.system("python -m pip install pdf2image==1.16.3")
-os.system("python -m pip install torch==2.1.0")
-os.system("python -m pip install 'git+https://github.com/facebookresearch/detectron2.git@898507047cf441a1e4be7a729270961c401c4354'")
-os.system("python -m pip install layoutparser==0.3.4 layoutparser[layoutmodels] layoutparser[ocr]")
-os.system("python -m pip install Pillow==9.5.0")
-os.system("python -m pip install imagehash==4.3.1")
-os.system("python -m pip install tensorflow==2.15.0 tensorflow-estimator==2.15.0")
-os.system("python -m pip install scikit-learn==1.3.2")
 from main import app
 model_path = './model/trained_model/model_final.pth'














1	from main import app
2
3	model_path = './model/trained_model/model_final.pth'

main.py CHANGED Viewed

@@ -1,17 +1,19 @@
 import traceback
 import gradio as gr
 from utils.get_RGB_image import get_RGB_image, is_online_file, steam_online_file
-from pdf2image import convert_from_path, convert_from_bytes
 import layoutparser as lp
 from PIL import Image
 from utils.get_features import get_features
 from imagehash import average_hash
 from sklearn.metrics.pairwise import cosine_similarity
 from utils.visualize_bboxes_on_image import visualize_bboxes_on_image
-label_map = {0: 'Caption', 1: 'Footnote', 2: 'Formula', 3: 'List-item', 4: 'Page-footer', 5: 'Page-header', 6: 'Picture', 7: 'Section-header', 8: 'Table', 9: 'Text', 10: 'Title'}
 label_names = list(label_map.values())
-color_map = {'Caption': '#acc2d9', 'Footnote': '#56ae57', 'Formula': '#b2996e', 'List-item': '#a8ff04', 'Page-footer': '#69d84f', 'Page-header': '#894585', 'Picture': '#70b23f', 'Section-header': '#d4ffff', 'Table': '#65ab7c', 'Text': '#952e8f', 'Title': '#fcfc81'}
 cache = {
     'output_document_image_1_hash': None,
     'output_document_image_2_hash': None,
@@ -29,14 +31,16 @@ visualize_bboxes_on_image_kwargs = {
     'label_rectangle_left_margin': 0,
     'label_rectangle_top_margin': 0
 }
-vectors_types = ['vectors', 'weighted_vectors', 'reduced_vectors', 'weighted_reduced_vectors']
 def similarity_fn(model: lp.Detectron2LayoutModel, document_image_1: Image.Image, document_image_2: Image.Image, vectors_type: str):
     message = None
     annotations = {
-      'predicted_bboxes': 'predicted_bboxes' if vectors_type in ['vectors', 'weighted_vectors'] else 'reduced_predicted_bboxes',
-      'predicted_scores': 'predicted_scores' if vectors_type in ['vectors', 'weighted_vectors'] else 'reduced_predicted_scores',
-      'predicted_labels': 'predicted_labels' if vectors_type in ['vectors', 'weighted_vectors'] else 'reduced_predicted_labels',
     }
     show_vectors_type = False
     try:
@@ -50,7 +54,8 @@ def similarity_fn(model: lp.Detectron2LayoutModel, document_image_1: Image.Image
                 document_image_1_features = cache['document_image_1_features']
                 document_image_1 = cache['original_document_image_1']
             else:
-                document_image_1_features = get_features(document_image_1, model, label_names)
                 cache['document_image_1_features'] = document_image_1_features
                 cache['original_document_image_1'] = document_image_1
@@ -58,105 +63,126 @@ def similarity_fn(model: lp.Detectron2LayoutModel, document_image_1: Image.Image
                 document_image_2_features = cache['document_image_2_features']
                 document_image_2 = cache['original_document_image_2']
             else:
-                document_image_2_features = get_features(document_image_2, model, label_names)
                 cache['document_image_2_features'] = document_image_2_features
                 cache['original_document_image_2'] = document_image_2
             [[similarity]] = cosine_similarity(
                 [
                     cache['document_image_1_features'][vectors_type]
-                ],
                 [
                     cache['document_image_2_features'][vectors_type]
                 ])
             message = f'<pre style="{pre_message_style}">Similarity between the two documents is: {round(similarity, 4)}<pre>'
             document_image_1 = visualize_bboxes_on_image(
-                image = document_image_1,
-                bboxes = cache['document_image_1_features'][annotations['predicted_bboxes']],
-                labels = [f'{label}, score:{round(score, 2)}' for label, score in zip(
-                    cache['document_image_1_features'][annotations['predicted_labels']],
                     cache['document_image_1_features'][annotations['predicted_scores']])],
-                bbox_outline_color = [color_map[label] for label in cache['document_image_1_features'][annotations['predicted_labels']]],
                 **visualize_bboxes_on_image_kwargs)
             document_image_2 = visualize_bboxes_on_image(
-                image = document_image_2,
-                bboxes = cache['document_image_2_features'][annotations['predicted_bboxes']],
-                labels = [f'{label}, score:{score}' for label, score in zip(
-                    cache['document_image_2_features'][annotations['predicted_labels']],
                     cache['document_image_2_features'][annotations['predicted_scores']])],
-                bbox_outline_color = [color_map[label] for label in cache['document_image_2_features'][annotations['predicted_labels']]],
                 **visualize_bboxes_on_image_kwargs)
-            cache['output_document_image_1_hash'] = str(average_hash(document_image_1))
-            cache['output_document_image_2_hash'] = str(average_hash(document_image_2))
             show_vectors_type = True
     except Exception as e:
         message = f'<pre style="{pre_message_style}">{traceback.format_exc()}<pre>'
     return [
-        gr.HTML(message, visible=True),
-        document_image_1,
         document_image_2,
         gr.Dropdown(visible=show_vectors_type)
     ]
-def load_image(filename, page = 0):
     try:
         image = None
         try:
             if (is_online_file(filename)):
-                image = get_RGB_image(convert_from_bytes(steam_online_file(filename))[page])
             else:
-                image = get_RGB_image(convert_from_path(filename)[page])
-        except:
             image = get_RGB_image(filename)
         return [
-            gr.Image(value=image, visible=True),
             None
         ]
-    except:
-        error = traceback.format_exc()
         return [None, gr.HTML(value=error, visible=True)]
-def preview_url(url, page = 0):
-    [image, error] = load_image(url, page = page)
     if image:
         return [gr.Tabs(selected=0), image, error]
     else:
-        return [gr.Tabs(selected=1), image, error]
 def document_view(document_number: int):
-    gr.HTML(value=f'<h4>Load the {"first" if document_number == 1 else "second"} PDF or Document Image<h4>', elem_classes=['center'])
     with gr.Tabs() as document_tabs:
         with gr.Tab("From Image", id=0):
-            document = gr.Image(type="pil", label=f"Document {document_number}", visible=False, interactive=False, show_download_button=True)
-            document_error_message = gr.HTML(label="Error Message", visible=False)
             document_preview = gr.UploadButton(
-                "Upload PDF or Document Image",
-                file_types=["image", ".pdf"],
                 file_count="single")
         with gr.Tab("From URL", id=1):
             document_url = gr.Textbox(
                 label=f"Document {document_number} URL",
                 info="Paste a Link/URL to PDF or Document Image",
                 placeholder="https://datasets-server.huggingface.co/.../image.jpg")
-            document_url_error_message = gr.HTML(label="Error Message", visible=False)
-            document_url_preview = gr.Button(value="Preview", variant="primary")
     document_preview.upload(
-         fn = lambda file: load_image(file.name),
-         inputs = [document_preview],
-         outputs = [document, document_error_message])
     document_url_preview.click(
-        fn = preview_url,
-        inputs = [document_url],
-        outputs = [document_tabs, document, document_url_error_message])
     return document
-def app(*, model_path, config_path, debug = False):
     model: lp.Detectron2LayoutModel = lp.Detectron2LayoutModel(
-        config_path = config_path,
-        model_path = model_path,
-        label_map = label_map)
     title = 'Document Similarity Search Using Visual Layout Features'
     description = f"<h2>{title}<h2>"
     css = '''
@@ -167,7 +193,7 @@ def app(*, model_path, config_path, debug = False):
     with gr.Blocks(title=title, css=css) as app:
         with gr.Row():
             gr.HTML(value=description, elem_classes=['center'])
-        with gr.Row(equal_height = False):
             with gr.Column():
                 document_1_image = document_view(1)
             with gr.Column():
@@ -178,22 +204,23 @@ def app(*, model_path, config_path, debug = False):
                 submit = gr.Button(value="Get Similarity", variant="primary")
             with gr.Column():
                 vectors_type = gr.Dropdown(
-                    choices = vectors_types,
-                    value = vectors_types[0],
-                    visible = False,
-                    label = "Vectors Type",
-                    info = "Select the Vectors Type to use for Similarity Calculation")
-                similarity_output = gr.HTML(label="Similarity Score", visible=False)
         reset = gr.Button(value="Reset", variant="secondary")
         kwargs = {
             'fn': lambda document_1_image, document_2_image, vectors_type: similarity_fn(
-                model,
-                document_1_image,
-                document_2_image,
                 vectors_type),
             'inputs': [document_1_image, document_2_image, vectors_type],
             'outputs': [similarity_output, document_1_image, document_2_image, vectors_type]
         }
         submit.click(**kwargs)
         vectors_type.change(**kwargs)
-    return app.launch(debug=debug)

 import traceback
 import gradio as gr
 from utils.get_RGB_image import get_RGB_image, is_online_file, steam_online_file
 import layoutparser as lp
 from PIL import Image
 from utils.get_features import get_features
 from imagehash import average_hash
 from sklearn.metrics.pairwise import cosine_similarity
 from utils.visualize_bboxes_on_image import visualize_bboxes_on_image
+import fitz
+label_map = {0: 'Caption', 1: 'Footnote', 2: 'Formula', 3: 'List-item', 4: 'Page-footer',
+             5: 'Page-header', 6: 'Picture', 7: 'Section-header', 8: 'Table', 9: 'Text', 10: 'Title'}
 label_names = list(label_map.values())
+color_map = {'Caption': '#FF0000', 'Footnote': '#00FF00', 'Formula': '#0000FF', 'List-item': '#FF00FF', 'Page-footer': '#FFFF00',
+             'Page-header': '#000000', 'Picture': '#FFFFFF', 'Section-header': '#40E0D0', 'Table': '#F28030', 'Text': '#7F00FF', 'Title': '#C0C0C0'}
 cache = {
     'output_document_image_1_hash': None,
     'output_document_image_2_hash': None,
     'label_rectangle_left_margin': 0,
     'label_rectangle_top_margin': 0
 }
+vectors_types = ['vectors', 'weighted_vectors',
+                 'reduced_vectors', 'weighted_reduced_vectors']
 def similarity_fn(model: lp.Detectron2LayoutModel, document_image_1: Image.Image, document_image_2: Image.Image, vectors_type: str):
     message = None
     annotations = {
+        'predicted_bboxes': 'predicted_bboxes' if vectors_type in ['vectors', 'weighted_vectors'] else 'reduced_predicted_bboxes',
+        'predicted_scores': 'predicted_scores' if vectors_type in ['vectors', 'weighted_vectors'] else 'reduced_predicted_scores',
+        'predicted_labels': 'predicted_labels' if vectors_type in ['vectors', 'weighted_vectors'] else 'reduced_predicted_labels',
     }
     show_vectors_type = False
     try:
                 document_image_1_features = cache['document_image_1_features']
                 document_image_1 = cache['original_document_image_1']
             else:
+                document_image_1_features = get_features(
+                    document_image_1, model, label_names)
                 cache['document_image_1_features'] = document_image_1_features
                 cache['original_document_image_1'] = document_image_1
                 document_image_2_features = cache['document_image_2_features']
                 document_image_2 = cache['original_document_image_2']
             else:
+                document_image_2_features = get_features(
+                    document_image_2, model, label_names)
                 cache['document_image_2_features'] = document_image_2_features
                 cache['original_document_image_2'] = document_image_2
             [[similarity]] = cosine_similarity(
                 [
                     cache['document_image_1_features'][vectors_type]
+                ],
                 [
                     cache['document_image_2_features'][vectors_type]
                 ])
             message = f'<pre style="{pre_message_style}">Similarity between the two documents is: {round(similarity, 4)}<pre>'
             document_image_1 = visualize_bboxes_on_image(
+                image=document_image_1,
+                bboxes=cache['document_image_1_features'][annotations['predicted_bboxes']],
+                labels=[f'{label}, score:{round(score, 2)}' for label, score in zip(
+                    cache['document_image_1_features'][annotations['predicted_labels']],
                     cache['document_image_1_features'][annotations['predicted_scores']])],
+                bbox_outline_color=[
+                    color_map[label] for label in cache['document_image_1_features'][annotations['predicted_labels']]],
+                bbox_fill_color=[
+                    (color_map[label], 50) for label in cache['document_image_1_features'][annotations['predicted_labels']]],
                 **visualize_bboxes_on_image_kwargs)
             document_image_2 = visualize_bboxes_on_image(
+                image=document_image_2,
+                bboxes=cache['document_image_2_features'][annotations['predicted_bboxes']],
+                labels=[f'{label}, score:{round(score, 2)}' for label, score in zip(
+                    cache['document_image_2_features'][annotations['predicted_labels']],
                     cache['document_image_2_features'][annotations['predicted_scores']])],
+                bbox_outline_color=[
+                    color_map[label] for label in cache['document_image_2_features'][annotations['predicted_labels']]],
+                bbox_fill_color=[
+                    (color_map[label], 50) for label in cache['document_image_2_features'][annotations['predicted_labels']]],
                 **visualize_bboxes_on_image_kwargs)
+            cache['output_document_image_1_hash'] = str(
+                average_hash(document_image_1))
+            cache['output_document_image_2_hash'] = str(
+                average_hash(document_image_2))
             show_vectors_type = True
     except Exception as e:
         message = f'<pre style="{pre_message_style}">{traceback.format_exc()}<pre>'
     return [
+        gr.HTML(message, visible=True),
+        document_image_1,
         document_image_2,
         gr.Dropdown(visible=show_vectors_type)
     ]
+def load_image(filename, page=0):
+    '''
+    Load one page of a PDF, or an image, as a preview component
+    Args:
+      filename: Local path or http(s) URL of a PDF or document image
+      page: Index of the PDF page to render (0 is the first page)
+    Returns:
+      list: [gr.Image, None] on success, or [None, gr.HTML] carrying the
+        error text when both loading attempts fail
+    '''
     try:
         image = None
+        first_error = None
         try:
             if (is_online_file(filename)):
+                pdf_document = fitz.open("pdf", steam_online_file(filename))
             else:
+                pdf_document = fitz.open(filename)
+            # The document is only needed to rasterize a single page, so close
+            # it deterministically instead of leaking the handle on every preview.
+            with pdf_document:
+                pixmap = pdf_document[page].get_pixmap()
+                image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
+        except Exception as e:
+            # The PyMuPDF path failed; remember why and try the plain image loader
+            first_error = e
             image = get_RGB_image(filename)
         return [
+            gr.Image(value=image, visible=True),
             None
         ]
+    except Exception as second_error:
+        error = f'{traceback.format_exc()}\n\nFirst Error:\n{first_error}\n\nSecond Error:\n{second_error}'
         return [None, gr.HTML(value=error, visible=True)]
+def preview_url(url, page=0):
+    '''
+    Load a document from a URL and pick the tab to show afterwards
+    Args:
+      url: Link to a PDF or document image
+      page: Index of the PDF page forwarded to load_image
+    Returns:
+      list: [gr.Tabs, image, error]; tab 0 is selected when an image was
+        loaded, tab 1 otherwise
+    '''
+    preview, failure = load_image(url, page=page)
+    selected_tab = 0 if preview else 1
+    return [gr.Tabs(selected=selected_tab), preview, failure]
 def document_view(document_number: int):
+    '''
+    Build the upload / URL widgets for one document
+    Args:
+      document_number: 1 for the first document, any other value is
+        labelled as the second one in the heading
+    Returns:
+      gr.Image: The image component that receives the loaded document
+    '''
+    # The heading must be closed with </h4>; a second <h4> leaves the element open
+    gr.HTML(value=f'<h4>Load the {"first" if document_number == 1 else "second"} PDF or Document Image</h4>', elem_classes=[
+            'center'])
     with gr.Tabs() as document_tabs:
         with gr.Tab("From Image", id=0):
+            document = gr.Image(
+                type="pil", label=f"Document {document_number}", visible=False, interactive=False, show_download_button=True)
+            document_error_message = gr.HTML(
+                label="Error Message", visible=False)
             document_preview = gr.UploadButton(
+                "Upload PDF or Document Image",
+                file_types=["image", ".pdf"],
                 file_count="single")
         with gr.Tab("From URL", id=1):
             document_url = gr.Textbox(
                 label=f"Document {document_number} URL",
                 info="Paste a Link/URL to PDF or Document Image",
                 placeholder="https://datasets-server.huggingface.co/.../image.jpg")
+            document_url_error_message = gr.HTML(
+                label="Error Message", visible=False)
+            document_url_preview = gr.Button(
+                value="Preview", variant="primary")
+    # Uploading a file loads it straight into the image component
     document_preview.upload(
+        fn=lambda file: load_image(file.name),
+        inputs=[document_preview],
+        outputs=[document, document_error_message])
+    # Previewing a URL also switches tabs, see preview_url
     document_url_preview.click(
+        fn=preview_url,
+        inputs=[document_url],
+        outputs=[document_tabs, document, document_url_error_message])
     return document
+def app(*, model_path, config_path, debug=False):
     model: lp.Detectron2LayoutModel = lp.Detectron2LayoutModel(
+        config_path=config_path,
+        model_path=model_path,
+        label_map=label_map)
     title = 'Document Similarity Search Using Visual Layout Features'
     description = f"<h2>{title}<h2>"
     css = '''
     with gr.Blocks(title=title, css=css) as app:
         with gr.Row():
             gr.HTML(value=description, elem_classes=['center'])
+        with gr.Row(equal_height=False):
             with gr.Column():
                 document_1_image = document_view(1)
             with gr.Column():
                 submit = gr.Button(value="Get Similarity", variant="primary")
             with gr.Column():
                 vectors_type = gr.Dropdown(
+                    choices=vectors_types,
+                    value=vectors_types[0],
+                    visible=False,
+                    label="Vectors Type",
+                    info="Select the Vectors Type to use for Similarity Calculation")
+                similarity_output = gr.HTML(
+                    label="Similarity Score", visible=False)
         reset = gr.Button(value="Reset", variant="secondary")
         kwargs = {
             'fn': lambda document_1_image, document_2_image, vectors_type: similarity_fn(
+                model,
+                document_1_image,
+                document_2_image,
                 vectors_type),
             'inputs': [document_1_image, document_2_image, vectors_type],
             'outputs': [similarity_output, document_1_image, document_2_image, vectors_type]
         }
         submit.click(**kwargs)
         vectors_type.change(**kwargs)
+    return app.launch(debug=debug)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+PyMuPDF==1.23.26
+scikit-learn==1.3.2
+torch==2.1.0
+torchvision==0.16.0
+tensorflow==2.15.0
+ImageHash==4.3.1
+Pillow==9.5.0
+layoutparser[layoutmodels,ocr]==0.3.4
+detectron2 @ git+https://github.com/facebookresearch/detectron2.git@898507047cf441a1e4be7a729270961c401c4354

utils/get_RGB_image.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from PIL import Image
 from urllib.parse import urlparse
 import requests
@@ -6,7 +7,7 @@ def is_online_file(url: str) -> bool:
     return urlparse(url).scheme in ["http", "https"]
 def steam_online_file(url: str) -> bytes:
-    return requests.get(url, stream=True).raw
 def get_RGB_image(image_or_path: str | Image.Image) -> bytes:
     if isinstance(image_or_path, str):

+import io
 from PIL import Image
 from urllib.parse import urlparse
 import requests
     return urlparse(url).scheme in ["http", "https"]
 def steam_online_file(url: str) -> io.BytesIO:
+    '''
+    Download a URL and return its body as an in-memory binary stream
+    Args:
+      url: http(s) address of the file to fetch
+    Returns:
+      io.BytesIO: Seekable buffer holding the full response body
+    Raises:
+      requests.HTTPError: If the server answers with a 4xx/5xx status
+    '''
+    # The whole body is buffered anyway, so streaming mode would add nothing
+    response = requests.get(url)
+    # Fail here rather than handing an HTML error page to the PDF/image decoders
+    response.raise_for_status()
+    return io.BytesIO(response.content)
 def get_RGB_image(image_or_path: str | Image.Image) -> bytes:
     if isinstance(image_or_path, str):

utils/visualize_bboxes_on_image.py CHANGED Viewed

@@ -5,115 +5,144 @@ import numpy as np
 import requests
 from typing import List
 from functools import cache
 DEFAULTS = {
-  'bbox_outline_width': 2,
-  'bbox_outline_color': (0, 0, 256, 123), # alpha runs from 0 to 127
-  'bbox_fill_color': (256, 0, 0, 50), # alpha runs from 0 to 127
-  'label_text_color': "black",
-  'label_fill_color': "red",
-  'label_text_padding': 0,
-  'label_rectangle_left_margin': 0,
-  'label_rectangle_top_margin': 0,
-  'label_text_size': 12,
 }
 @cache
 def get_font(path_or_url: str = 'https://github.com/googlefonts/roboto/raw/main/src/hinted/Roboto-Regular.ttf', size: int = DEFAULTS['label_text_size']):
-  if urlparse(path_or_url).scheme in ["http", "https"]: # Online
-      return ImageFont.truetype(requests.get(path_or_url, stream=True).raw, size=size)
-  else: # Local
-    return ImageFont.truetype(path_or_url, size=size)
 def visualize_bboxes_on_image(
-    image: Image.Image,
-    bboxes: List[List[int]],
-    labels: List[str] = None,
-    bbox_outline_width = DEFAULTS["bbox_outline_width"],
-    bbox_outline_color = DEFAULTS["bbox_outline_color"],
-    bbox_fill_color: str | list[tuple | str] = DEFAULTS["bbox_fill_color"],
-    label_text_color: str | list[tuple | str] = DEFAULTS["label_text_color"],
-    label_fill_color = DEFAULTS["label_fill_color"],
-    label_text_padding = DEFAULTS["label_text_padding"],
-    label_rectangle_left_margin = DEFAULTS["label_rectangle_left_margin"],
-    label_rectangle_top_margin = DEFAULTS['label_rectangle_top_margin'],
-    label_text_size = DEFAULTS["label_text_size"],
-    convert_to_x0y0x1y1 = None) -> Image.Image:
-  '''
-  Visualize bounding boxes on an image
-  Args:
-    image: Image to visualize
-    bboxes: List of bounding boxes
-    labels: Titles of the bounding boxes
-    bbox_outline_width: Width of the bounding box
-    bbox_outline_color: Color of the bounding box
-    bbox_fill_color: Fill color of the bounding box
-    label_text_color: Color of the label text
-    label_fill_color: Color of the label rectangle
-    label_text_padding: Padding of the label text
-    label_rectangle_left_margin: Left padding of the label rectangle
-    label_rectangle_top_margin: Top padding of the label rectangle
-    label_text_size: Font size of the label text
-    convert_to_x0y0x1y1: Function to convert bounding box to x0y0x1y1 format
-  Returns:
-    Image: Image annotated with bounding boxes
-  '''
-  image = image.copy().convert("RGB")
-  draw = ImageDraw.Draw(image)
-  font = get_font(size = label_text_size)
-  labels = (labels or []) + np.full(len(bboxes) - len(labels or []), None).tolist()
-  bbox_fill_colors = bbox_fill_color if isinstance(bbox_fill_color, list) else [bbox_fill_color] * len(bboxes)
-  bbox_outline_colors = bbox_outline_color if isinstance(bbox_outline_color, list) else [bbox_outline_color] * len(bboxes)
-  for bbox, label, _bbox_fill_color, _bbox_outline_color in zip(bboxes, labels, bbox_fill_colors, bbox_outline_colors):
-    x0, y0, x1, y1 = convert_to_x0y0x1y1(bbox) if convert_to_x0y0x1y1 is not None else bbox
-    rectangle_image = Image.new('RGBA', image.size)
-    rectangle_image_draw = ImageDraw.Draw(rectangle_image)
-    rectangle_image_draw.rectangle(
-      xy = [x0, y0, x1, y1],
-      fill = _bbox_fill_color,
-      outline = _bbox_outline_color,
-      width = bbox_outline_width)
-    image.paste(im = rectangle_image, mask = rectangle_image)
-    if label is not None:
-      draw_text_on_image(
-        draw,
-        [x0, y0],
-        label,
-        label_text_color,
-        label_fill_color,
-        label_text_padding,
-        label_rectangle_left_margin,
-        label_rectangle_top_margin,
-        label_text_size,
-        font)
-  return image
 def draw_text_on_image(
-    image_or_draw: Image.Image | ImageDraw.ImageDraw,
-    text_position_xy: List[int],
-    label: str,
-    label_text_color = DEFAULTS["label_text_color"],
-    label_fill_color = DEFAULTS["label_fill_color"],
-    label_text_padding = DEFAULTS["label_text_padding"],
-    label_rectangle_left_margin = DEFAULTS["label_rectangle_left_margin"],
-    label_rectangle_top_margin = DEFAULTS['label_rectangle_top_margin'],
-    label_text_size = DEFAULTS["label_text_size"],
-    font: ImageFont.FreeTypeFont = None) -> Image.Image:
-  is_image = isinstance(image_or_draw, Image.Image)
-  image = image_or_draw.copy().convert("RGB") if is_image else None
-  font = font or get_font(size = label_text_size)
-  x0, y0 = text_position_xy
-  text_position = (x0 - label_rectangle_left_margin + label_text_padding, y0 - label_rectangle_top_margin + label_text_padding)
-  draw = ImageDraw.Draw(image) if is_image else image_or_draw
-  _, _, text_bbox_right, text_bbox_bottom = draw.textbbox(text_position, label, font=font)
-  xy = [
-    text_position[0] - label_text_padding,
-    text_position[1] - label_text_padding,
-    text_bbox_right + label_text_padding + label_text_padding,
-    text_bbox_bottom + label_text_padding + label_text_padding
-  ]
-  draw.rectangle(xy, fill = label_fill_color)
-  draw.text(text_position, label, font=font, fill=label_text_color)
-  return image

 import requests
 from typing import List
 from functools import cache
+import matplotlib.colors as colors
+# Fallback styling values: they are the keyword defaults of get_font,
+# visualize_bboxes_on_image and draw_text_on_image in this module.
 DEFAULTS = {
+    'bbox_outline_width': 2,
+    # color name or hex code or tuple of RGBA or tuple of RGB or tuple (color_name, alpha)
+    # between 0 (fully transparent) and 255 (fully opaque)
+    'bbox_outline_color': ('blue', 123),
+    # color name or hex code or tuple of RGBA or tuple of RGB or tuple (color_name, alpha)
+    # between 0 (fully transparent) and 255 (fully opaque)
+    'bbox_fill_color': ('red', 50),
+    'label_text_color': "black",
+    'label_fill_color': "red",
+    'label_text_padding': 0,
+    'label_rectangle_left_margin': 0,
+    'label_rectangle_top_margin': 0,
+    'label_text_size': 12,
 }
 @cache
 def get_font(path_or_url: str = 'https://github.com/googlefonts/roboto/raw/main/src/hinted/Roboto-Regular.ttf', size: int = DEFAULTS['label_text_size']):
+    '''
+    Load a TrueType font from a local path or an http(s) URL
+    Results are memoized per (path_or_url, size) by functools.cache
+    Args:
+      path_or_url: Font file location; http/https URLs are downloaded
+      size: Font size passed to ImageFont.truetype
+    Returns:
+      The font object created by ImageFont.truetype
+    Raises:
+      requests.HTTPError: If the font download answers with a 4xx/5xx status
+    '''
+    from io import BytesIO
+    if urlparse(path_or_url).scheme in ["http", "https"]:  # Online
+        response = requests.get(path_or_url)
+        response.raise_for_status()
+        # Buffer the decoded body: response.raw is the undecoded socket stream,
+        # which is not valid font data when the server compresses the transfer.
+        return ImageFont.truetype(BytesIO(response.content), size=size)
+    else:  # Local
+        return ImageFont.truetype(path_or_url, size=size)
+named_colors_mapping = colors.get_named_colors_mapping()
+@cache
+def get_color(color: str | tuple) -> tuple | str:
+    '''
+    Normalize a color specification before it is handed to PIL
+    Args:
+      color: Color name, hex code, RGB/RGBA tuple, or a (color, alpha)
+        pair where alpha runs from 0 (transparent) to 255 (opaque)
+    Returns:
+      tuple | str: An (R, G, B, A) tuple of ints in 0..255 for a
+        (color, alpha) pair whose color matplotlib recognizes; any other
+        input is returned unchanged
+    '''
+    if isinstance(color, tuple) and len(color) == 2:
+        real_color, alpha = (color[0], int(color[1]))
+        if colors.is_color_like(real_color):
+            # to_rgb yields floats in 0..1. Scale BEFORE converting to int:
+            # casting first truncates every channel below 1.0 to 0, which
+            # turned colors such as '#40E0D0' or '#C0C0C0' into black.
+            red, green, blue = (
+                int(round(channel * 255)) for channel in colors.to_rgb(real_color))
+            return (red, green, blue, alpha)
+    return color
 def visualize_bboxes_on_image(
+        image: Image.Image,
+        bboxes: List[List[int]],
+        labels: List[str] = None,
+        bbox_outline_width=DEFAULTS["bbox_outline_width"],
+        bbox_outline_color=DEFAULTS["bbox_outline_color"],
+        bbox_fill_color: str | list[tuple | str] = DEFAULTS["bbox_fill_color"],
+        label_text_color: str | list[tuple |
+                                     str] = DEFAULTS["label_text_color"],
+        label_fill_color=DEFAULTS["label_fill_color"],
+        label_text_padding=DEFAULTS["label_text_padding"],
+        label_rectangle_left_margin=DEFAULTS["label_rectangle_left_margin"],
+        label_rectangle_top_margin=DEFAULTS['label_rectangle_top_margin'],
+        label_text_size=DEFAULTS["label_text_size"],
+        convert_to_x0y0x1y1=None) -> Image.Image:
+    '''
+    Visualize bounding boxes on an image
+    The input image is not modified; drawing happens on an RGB copy.
+    Args:
+      image: Image to visualize
+      bboxes: List of bounding boxes
+      labels: Titles of the bounding boxes; boxes without a label get none
+      bbox_outline_width: Width of the bounding box
+      bbox_outline_color: Color of the bounding box, either one value for
+        all boxes or a list with one value per box; each value goes
+        through get_color, so (color, alpha) pairs are accepted
+      bbox_fill_color: Fill color of the bounding box, same forms as
+        bbox_outline_color
+      label_text_color: Color of the label text
+      label_fill_color: Color of the label rectangle
+      label_text_padding: Padding of the label text
+      label_rectangle_left_margin: Left padding of the label rectangle
+      label_rectangle_top_margin: Top padding of the label rectangle
+      label_text_size: Font size of the label text
+      convert_to_x0y0x1y1: Function to convert bounding box to x0y0x1y1 format
+    Returns:
+      Image: Image annotated with bounding boxes
+    '''
+    image = image.copy().convert("RGB")
+    draw = ImageDraw.Draw(image)
+    font = get_font(size=label_text_size)
+    # Pad the labels with None up to the number of boxes. The pad length is
+    # clamped at 0 so that surplus labels are simply ignored by zip() below
+    # instead of raising on a negative size.
+    labels = list(labels or [])
+    labels += [None] * max(0, len(bboxes) - len(labels))
+    bbox_fill_colors = bbox_fill_color if isinstance(bbox_fill_color, list) else [
+        bbox_fill_color] * len(bboxes)
+    bbox_outline_colors = bbox_outline_color if isinstance(
+        bbox_outline_color, list) else [bbox_outline_color] * len(bboxes)
+    for bbox, label, _bbox_fill_color, _bbox_outline_color in zip(bboxes, labels, bbox_fill_colors, bbox_outline_colors):
+        x0, y0, x1, y1 = convert_to_x0y0x1y1(
+            bbox) if convert_to_x0y0x1y1 is not None else bbox
+        _bbox_fill_color = get_color(_bbox_fill_color)
+        _bbox_outline_color = get_color(_bbox_outline_color)
+        # Draw the box on a transparent RGBA layer and paste it with itself
+        # as mask, so the alpha of the fill/outline colors is honoured.
+        rectangle_image = Image.new('RGBA', image.size)
+        rectangle_image_draw = ImageDraw.Draw(rectangle_image)
+        rectangle_image_draw.rectangle(
+            xy=[x0, y0, x1, y1],
+            fill=_bbox_fill_color,
+            outline=_bbox_outline_color,
+            width=bbox_outline_width)
+        image.paste(im=rectangle_image, mask=rectangle_image)
+        if label is not None:
+            draw_text_on_image(
+                draw,
+                [x0, y0],
+                label,
+                label_text_color,
+                label_fill_color,
+                label_text_padding,
+                label_rectangle_left_margin,
+                label_rectangle_top_margin,
+                label_text_size,
+                font)
+    return image
 def draw_text_on_image(
+        image_or_draw: Image.Image | ImageDraw.ImageDraw,
+        text_position_xy: List[int],
+        label: str,
+        label_text_color=DEFAULTS["label_text_color"],
+        label_fill_color=DEFAULTS["label_fill_color"],
+        label_text_padding=DEFAULTS["label_text_padding"],
+        label_rectangle_left_margin=DEFAULTS["label_rectangle_left_margin"],
+        label_rectangle_top_margin=DEFAULTS['label_rectangle_top_margin'],
+        label_text_size=DEFAULTS["label_text_size"],
+        font: ImageFont.FreeTypeFont = None) -> Image.Image | None:
+    '''
+    Draw a label on a filled rectangle
+    Args:
+      image_or_draw: An image (an RGB copy is drawn on and returned) or an
+        ImageDraw object (drawn on in place)
+      text_position_xy: [x, y] anchor of the label rectangle
+      label: Text to draw
+      label_text_color: Color of the label text
+      label_fill_color: Color of the label rectangle
+      label_text_padding: Padding of the label text
+      label_rectangle_left_margin: Subtracted from x before drawing
+      label_rectangle_top_margin: Subtracted from y before drawing
+      label_text_size: Font size used when no font is given
+      font: Font to use; loaded through get_font when omitted
+    Returns:
+      Image | None: The annotated copy when an image was passed, None when
+        an ImageDraw object was passed
+    '''
+    is_image = isinstance(image_or_draw, Image.Image)
+    image = image_or_draw.copy().convert("RGB") if is_image else None
+    font = font or get_font(size=label_text_size)
+    x0, y0 = text_position_xy
+    text_position = (x0 - label_rectangle_left_margin + label_text_padding,
+                     y0 - label_rectangle_top_margin + label_text_padding)
+    draw = ImageDraw.Draw(image) if is_image else image_or_draw
+    _, _, text_bbox_right, text_bbox_bottom = draw.textbbox(
+        text_position, label, font=font)
+    # NOTE(review): the padding is added twice on the right/bottom edges but
+    # only once on the left/top ones - confirm whether this is intended.
+    xy = [
+        text_position[0] - label_text_padding,
+        text_position[1] - label_text_padding,
+        text_bbox_right + label_text_padding + label_text_padding,
+        text_bbox_bottom + label_text_padding + label_text_padding
+    ]
+    draw.rectangle(xy, fill=label_fill_color)
+    draw.text(text_position, label, font=font, fill=label_text_color)
+    return image