Tom Aarsen committed
Commit 3921dd6 · 1 Parent(s): eded98b

Add SparseEncoder & CrossEncoder support to backend-export
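In practice, this commit lets the Space export SparseEncoder and CrossEncoder models the same way it already exports SentenceTransformer models. A minimal sketch of the export path it automates (the model and repo IDs are placeholders, and the `onnx` extra of sentence_transformers must be installed):

```python
# Hypothetical example mirroring the export flow added below: load a
# CrossEncoder with the ONNX backend, then push the exported model to the Hub.
from sentence_transformers import CrossEncoder

model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2", backend="onnx")
model.push_to_hub("my-username/ms-marco-MiniLM-L6-v2-onnx")
```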
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: ⚙️
  colorFrom: indigo
  colorTo: indigo
  sdk: gradio
- sdk_version: 5.5.0
+ sdk_version: 5.42.0
  app_file: app.py
  pinned: false
  license: apache-2.0
app.py CHANGED
@@ -1,18 +1,30 @@
  from enum import Enum
- from functools import partial
+ from functools import lru_cache, partial
+ import json
  from pathlib import Path
  from typing import Optional, Tuple
  import gradio as gr
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
  import huggingface_hub
- from sentence_transformers import SentenceTransformer
+ from sentence_transformers import CrossEncoder, SentenceTransformer, SparseEncoder
  from sentence_transformers import (
      export_dynamic_quantized_onnx_model as st_export_dynamic_quantized_onnx_model,
      export_optimized_onnx_model as st_export_optimized_onnx_model,
      export_static_quantized_openvino_model as st_export_static_quantized_openvino_model,
  )
- from huggingface_hub import model_info, upload_folder, get_repo_discussions, list_repo_commits, HfFileSystem
- from huggingface_hub.errors import RepositoryNotFoundError
+ from huggingface_hub import (
+     model_info,
+     upload_folder,
+     get_repo_discussions,
+     list_repo_commits,
+     HfFileSystem,
+     hf_hub_download,
+ )
+ from huggingface_hub.errors import (
+     RepositoryNotFoundError,
+     HFValidationError,
+     EntryNotFoundError,
+ )
  from optimum.intel import OVQuantizationConfig
  from tempfile import TemporaryDirectory

@@ -29,9 +41,20 @@ class Backend(Enum):
          return self.value


+ class Archetype(Enum):
+     SENTENCE_TRANSFORMER = "SentenceTransformer"
+     SPARSE_ENCODER = "SparseEncoder"
+     CROSS_ENCODER = "CrossEncoder"
+     OTHER = "Other"
+
+     def __str__(self):
+         return self.value
+
+
  backends = [str(backend) for backend in Backend]
  FILE_SYSTEM = HfFileSystem()

+
  def is_new_model(model_id: str) -> bool:
      """
      Check if the model ID exists on the Hugging Face Hub. If we get a request error, then we
@@ -50,12 +73,59 @@ def is_sentence_transformer_model(model_id: str) -> bool:
      return "sentence-transformers" in model_info(model_id).tags


+ @lru_cache()
+ def get_archetype(model_id: str) -> Archetype:
+     if "/" not in model_id:
+         return Archetype.OTHER
+
+     try:
+         config_sentence_transformers_path = hf_hub_download(
+             model_id, filename="config_sentence_transformers.json"
+         )
+     except (RepositoryNotFoundError, HFValidationError):
+         return Archetype.OTHER
+     except EntryNotFoundError:
+         config_sentence_transformers_path = None
+
+     try:
+         config_path = hf_hub_download(model_id, filename="config.json")
+     except (RepositoryNotFoundError, HFValidationError):
+         return Archetype.OTHER
+     except EntryNotFoundError:
+         config_path = None
+
+     if config_sentence_transformers_path is None and config_path is None:
+         return Archetype.OTHER
+
+     if config_sentence_transformers_path is not None:
+         with open(config_sentence_transformers_path, "r", encoding="utf8") as f:
+             st_config = json.load(f)
+         model_type = st_config.get("model_type", "SentenceTransformer")
+         if model_type == "SentenceTransformer":
+             return Archetype.SENTENCE_TRANSFORMER
+         elif model_type == "SparseEncoder":
+             return Archetype.SPARSE_ENCODER
+         else:
+             return Archetype.OTHER
+
+     if config_path is not None:
+         with open(config_path, "r", encoding="utf8") as f:
+             config = json.load(f)
+         if "sentence_transformers" in config or config["architectures"][0].endswith(
+             "ForSequenceClassification"
+         ):
+             return Archetype.CROSS_ENCODER
+
+     return Archetype.OTHER
+
+
  def get_last_commit(model_id: str) -> str:
      """
      Get the last commit hash of the model ID.
      """
      return f"https://huggingface.co/{model_id}/commit/{list_repo_commits(model_id)[0].commit_id}"

+
  def get_last_pr(model_id: str) -> Tuple[str, int]:
      last_pr = next(get_repo_discussions(model_id))
      return last_pr.url, last_pr.num
@@ -80,12 +150,25 @@ def export_to_torch(model_id, create_pr, output_model_id):
      )


- def export_to_onnx(model_id: str, create_pr: bool, output_model_id: str, token: Optional[str] = None) -> None:
+ def export_to_onnx(
+     model_id: str,
+     archetype: Archetype,
+     create_pr: bool,
+     output_model_id: str,
+     token: Optional[str] = None,
+ ) -> None:
      if does_file_glob_exist(output_model_id, "**/model.onnx"):
          raise FileExistsError("An ONNX model already exists in the repository")

-     model = SentenceTransformer(model_id, backend="onnx")
-
+     if archetype == Archetype.SENTENCE_TRANSFORMER:
+         model = SentenceTransformer(model_id, backend="onnx")
+     elif archetype == Archetype.SPARSE_ENCODER:
+         model = SparseEncoder(model_id, backend="onnx")
+     elif archetype == Archetype.CROSS_ENCODER:
+         model = CrossEncoder(model_id, backend="onnx")
+     else:
+         return
+
      commit_message = "Add exported onnx model 'model.onnx'"

      if is_new_model(output_model_id):
@@ -110,22 +193,27 @@ Hello!
  ## Tip:
  Consider testing this pull request before merging by loading the model from this PR with the `revision` argument:
  ```python
- from sentence_transformers import SentenceTransformer
+ from sentence_transformers import {archetype}

  # TODO: Fill in the PR number
  pr_number = 2
- model = SentenceTransformer(
+ model = {archetype}(
      "{output_model_id}",
      revision=f"refs/pr/{{pr_number}}",
      backend="onnx",
  )

  # Verify that everything works as expected
- embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
+ {'''embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
  print(embeddings.shape)

  similarities = model.similarity(embeddings, embeddings)
- print(similarities)
+ print(similarities)''' if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER} else
+ '''predictions = model.predict([
+     ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
+     ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
+ ])
+ print(predictions)'''}
  ```
  """

@@ -139,16 +227,24 @@ print(similarities)
          token=token,
      )

- def export_to_onnx_snippet(model_id: str, create_pr: bool, output_model_id: str) -> str:
-     return """\
+
+ def export_to_onnx_snippet(
+     model_id: str, archetype: Archetype, create_pr: bool, output_model_id: str
+ ) -> Tuple[str, str, str]:
+     if archetype == Archetype.OTHER:
+         return "", "", ""
+
+     return (
+         """\
  pip install sentence_transformers[onnx-gpu]
  # or
  pip install sentence_transformers[onnx]
- """, f"""\
- from sentence_transformers import SentenceTransformer
+ """,
+         f"""\
+ from sentence_transformers import {archetype}

  # 1. Load the model to be exported with the ONNX backend
- model = SentenceTransformer(
+ model = {archetype}(
      "{model_id}",
      backend="onnx",
  )
@@ -160,31 +256,60 @@ model = SentenceTransformer(
      "{output_model_id}",
      create_pr=True,
  )'''}
- """, f"""\
- from sentence_transformers import SentenceTransformer
+ """,
+         f"""\
+ from sentence_transformers import {archetype}

  # 1. Load the model from the Hugging Face Hub
  # (until merged) Use the `revision` argument to load the model from the PR
  pr_number = 2
- model = SentenceTransformer(
+ model = {archetype}(
      "{output_model_id}",
      revision=f"refs/pr/{{pr_number}}",
      backend="onnx",
  )
-
+ """
+         + (
+             """
  # 2. Inference works as normal
  embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
  similarities = model.similarity(embeddings, embeddings)
  """
+             if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
+             else """
+ # 2. Inference works as normal
+ predictions = model.predict([
+     ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
+     ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
+ ])
+ """
+         ),
+     )


  def export_to_onnx_dynamic_quantization(
-     model_id: str, create_pr: bool, output_model_id: str, onnx_quantization_config: str, token: Optional[str] = None
+     model_id: str,
+     archetype: Archetype,
+     create_pr: bool,
+     output_model_id: str,
+     onnx_quantization_config: str,
+     token: Optional[str] = None,
  ) -> None:
-     if does_file_glob_exist(output_model_id, f"onnx/model_qint8_{onnx_quantization_config}.onnx"):
-         raise FileExistsError("The quantized ONNX model already exists in the repository")
-
-     model = SentenceTransformer(model_id, backend="onnx")
+     if does_file_glob_exist(
+         output_model_id, f"onnx/model_qint8_{onnx_quantization_config}.onnx"
+     ):
+         raise FileExistsError(
+             "The quantized ONNX model already exists in the repository"
+         )
+
+     if archetype == Archetype.SENTENCE_TRANSFORMER:
+         model = SentenceTransformer(model_id, backend="onnx")
+     elif archetype == Archetype.SPARSE_ENCODER:
+         model = SparseEncoder(model_id, backend="onnx")
+     elif archetype == Archetype.CROSS_ENCODER:
+         model = CrossEncoder(model_id, backend="onnx")
+     else:
+         return

      if not create_pr and is_new_model(output_model_id):
          model.push_to_hub(repo_id=output_model_id, token=token)
@@ -202,7 +327,20 @@ def export_to_onnx_dynamic_quantization(
          )
      except ValueError:
          # Currently, quantization with optimum has some issues if there's already an ONNX model in a subfolder
-         model = SentenceTransformer(model_id, backend="onnx", model_kwargs={"export": True})
+         if archetype == Archetype.SENTENCE_TRANSFORMER:
+             model = SentenceTransformer(
+                 model_id, backend="onnx", model_kwargs={"export": True}
+             )
+         elif archetype == Archetype.SPARSE_ENCODER:
+             model = SparseEncoder(
+                 model_id, backend="onnx", model_kwargs={"export": True}
+             )
+         elif archetype == Archetype.CROSS_ENCODER:
+             model = CrossEncoder(
+                 model_id, backend="onnx", model_kwargs={"export": True}
+             )
+         else:
+             return
          st_export_dynamic_quantized_onnx_model(
              model,
              quantization_config=onnx_quantization_config,
@@ -213,21 +351,31 @@ def export_to_onnx_dynamic_quantization(
      finally:
          huggingface_hub.upload_folder = original_upload_folder

+
  def export_to_onnx_dynamic_quantization_snippet(
-     model_id: str, create_pr: bool, output_model_id: str, onnx_quantization_config: str
- ) -> str:
-     return """\
+     model_id: str,
+     archetype: Archetype,
+     create_pr: bool,
+     output_model_id: str,
+     onnx_quantization_config: str,
+ ) -> Tuple[str, str, str]:
+     if archetype == Archetype.OTHER:
+         return "", "", ""
+
+     return (
+         """\
  pip install sentence_transformers[onnx-gpu]
  # or
  pip install sentence_transformers[onnx]
- """, f"""\
+ """,
+         f"""\
  from sentence_transformers import (
-     SentenceTransformer,
+     {archetype},
      export_dynamic_quantized_onnx_model,
  )

- # 1. Load the model to be quantized with the ONNX backend
- model = SentenceTransformer(
+ # 1. Load the model to be exported with the ONNX backend
+ model = {archetype}(
      "{model_id}",
      backend="onnx",
  )
@@ -240,29 +388,61 @@ export_dynamic_quantized_onnx_model(
      push_to_hub=True,
  {''' create_pr=True,
  ''' if create_pr else ''})
- """, f"""\
- from sentence_transformers import SentenceTransformer
+ """,
+         f"""\
+ from sentence_transformers import {archetype}

  # 1. Load the model from the Hugging Face Hub
  # (until merged) Use the `revision` argument to load the model from the PR
  pr_number = 2
- model = SentenceTransformer(
+ model = {archetype}(
      "{output_model_id}",
      revision=f"refs/pr/{{pr_number}}",
      backend="onnx",
      model_kwargs={{"file_name": "model_qint8_{onnx_quantization_config}.onnx"}},
  )
-
+ """
+         + (
+             """
  # 2. Inference works as normal
  embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
  similarities = model.similarity(embeddings, embeddings)
  """
+             if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
+             else """
+ # 2. Inference works as normal
+ predictions = model.predict([
+     ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
+     ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
+ ])
+ """
+         ),
+     )

- def export_to_onnx_optimization(model_id: str, create_pr: bool, output_model_id: str, onnx_optimization_config: str, token: Optional[str] = None) -> None:
-     if does_file_glob_exist(output_model_id, f"onnx/model_{onnx_optimization_config}.onnx"):
-         raise FileExistsError("The optimized ONNX model already exists in the repository")
-
-     model = SentenceTransformer(model_id, backend="onnx")
+
+ def export_to_onnx_optimization(
+     model_id: str,
+     archetype: Archetype,
+     create_pr: bool,
+     output_model_id: str,
+     onnx_optimization_config: str,
+     token: Optional[str] = None,
+ ) -> None:
+     if does_file_glob_exist(
+         output_model_id, f"onnx/model_{onnx_optimization_config}.onnx"
+     ):
+         raise FileExistsError(
+             "The optimized ONNX model already exists in the repository"
+         )
+
+     if archetype == Archetype.SENTENCE_TRANSFORMER:
+         model = SentenceTransformer(model_id, backend="onnx")
+     elif archetype == Archetype.SPARSE_ENCODER:
+         model = SparseEncoder(model_id, backend="onnx")
+     elif archetype == Archetype.CROSS_ENCODER:
+         model = CrossEncoder(model_id, backend="onnx")
+     else:
+         return

      if not create_pr and is_new_model(output_model_id):
          model.push_to_hub(repo_id=output_model_id, token=token)
@@ -281,19 +461,31 @@ def export_to_onnx_optimization(model_id: str, create_pr: bool, output_model_id:
      finally:
          huggingface_hub.upload_folder = original_upload_folder

- def export_to_onnx_optimization_snippet(model_id: str, create_pr: bool, output_model_id: str, onnx_optimization_config: str) -> str:
-     return """\
+
+ def export_to_onnx_optimization_snippet(
+     model_id: str,
+     archetype: Archetype,
+     create_pr: bool,
+     output_model_id: str,
+     onnx_optimization_config: str,
+ ) -> Tuple[str, str, str]:
+     if archetype == Archetype.OTHER:
+         return "", "", ""
+
+     return (
+         """\
  pip install sentence_transformers[onnx-gpu]
  # or
  pip install sentence_transformers[onnx]
- """, f"""\
+ """,
+         f"""\
  from sentence_transformers import (
-     SentenceTransformer,
+     {archetype},
      export_optimized_onnx_model,
  )

  # 1. Load the model to be optimized with the ONNX backend
- model = SentenceTransformer(
+ model = {archetype}(
      "{model_id}",
      backend="onnx",
  )
@@ -306,30 +498,56 @@ export_optimized_onnx_model(
      push_to_hub=True,
  {''' create_pr=True,
  ''' if create_pr else ''})
- """, f"""\
- from sentence_transformers import SentenceTransformer
+ """,
+         f"""\
+ from sentence_transformers import {archetype}

  # 1. Load the model from the Hugging Face Hub
  # (until merged) Use the `revision` argument to load the model from the PR
  pr_number = 2
- model = SentenceTransformer(
+ model = {archetype}(
      "{output_model_id}",
      revision=f"refs/pr/{{pr_number}}",
      backend="onnx",
      model_kwargs={{"file_name": "model_{onnx_optimization_config}.onnx"}},
  )
-
+ """
+         + (
+             """
  # 2. Inference works as normal
  embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
  similarities = model.similarity(embeddings, embeddings)
  """
+             if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
+             else """
+ # 2. Inference works as normal
+ predictions = model.predict([
+     ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
+     ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
+ ])
+ """
+         ),
+     )


- def export_to_openvino(model_id: str, create_pr: bool, output_model_id: str, token: Optional[str] = None) -> None:
+ def export_to_openvino(
+     model_id: str,
+     archetype: Archetype,
+     create_pr: bool,
+     output_model_id: str,
+     token: Optional[str] = None,
+ ) -> None:
      if does_file_glob_exist(output_model_id, "**/openvino_model.xml"):
          raise FileExistsError("The OpenVINO model already exists in the repository")

-     model = SentenceTransformer(model_id, backend="openvino")
+     if archetype == Archetype.SENTENCE_TRANSFORMER:
+         model = SentenceTransformer(model_id, backend="openvino")
+     elif archetype == Archetype.SPARSE_ENCODER:
+         model = SparseEncoder(model_id, backend="openvino")
+     elif archetype == Archetype.CROSS_ENCODER:
+         model = CrossEncoder(model_id, backend="openvino")
+     else:
+         return

      commit_message = "Add exported openvino model 'openvino_model.xml'"

@@ -355,22 +573,27 @@ Hello!
  ## Tip:
  Consider testing this pull request before merging by loading the model from this PR with the `revision` argument:
  ```python
- from sentence_transformers import SentenceTransformer
+ from sentence_transformers import {archetype}

  # TODO: Fill in the PR number
  pr_number = 2
- model = SentenceTransformer(
+ model = {archetype}(
      "{output_model_id}",
      revision=f"refs/pr/{{pr_number}}",
      backend="openvino",
  )

  # Verify that everything works as expected
- embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
+ {'''embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
  print(embeddings.shape)

  similarities = model.similarity(embeddings, embeddings)
- print(similarities)
+ print(similarities)''' if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER} else
+ '''predictions = model.predict([
+     ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
+     ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
+ ])
+ print(predictions)'''}
  ```
  """

@@ -384,14 +607,22 @@ print(similarities)
          token=token,
      )

- def export_to_openvino_snippet(model_id: str, create_pr: bool, output_model_id: str) -> str:
-     return """\
+
+ def export_to_openvino_snippet(
+     model_id: str, archetype: Archetype, create_pr: bool, output_model_id: str
+ ) -> Tuple[str, str, str]:
+     if archetype == Archetype.OTHER:
+         return "", "", ""
+
+     return (
+         """\
  pip install sentence_transformers[openvino]
- """, f"""\
- from sentence_transformers import SentenceTransformer
+ """,
+         f"""\
+ from sentence_transformers import {archetype}

  # 1. Load the model to be exported with the OpenVINO backend
- model = SentenceTransformer(
+ model = {archetype}(
      "{model_id}",
      backend="openvino",
  )
@@ -403,25 +634,40 @@ model = SentenceTransformer(
      "{output_model_id}",
      create_pr=True,
  )'''}
- """, f"""\
- from sentence_transformers import SentenceTransformer
+ """,
+         f"""\
+ from sentence_transformers import {archetype}

  # 1. Load the model from the Hugging Face Hub
  # (until merged) Use the `revision` argument to load the model from the PR
  pr_number = 2
- model = SentenceTransformer(
+ model = {archetype}(
      "{output_model_id}",
      revision=f"refs/pr/{{pr_number}}",
      backend="openvino",
  )
-
+ """
+         + (
+             """
  # 2. Inference works as normal
  embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
  similarities = model.similarity(embeddings, embeddings)
  """
+             if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
+             else """
+ # 2. Inference works as normal
+ predictions = model.predict([
+     ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
+     ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
+ ])
+ """
+         ),
+     )
+

  def export_to_openvino_static_quantization(
      model_id: str,
+     archetype: Archetype,
      create_pr: bool,
      output_model_id: str,
      ov_quant_dataset_name: str,
@@ -431,10 +677,21 @@ def export_to_openvino_static_quantization(
      ov_quant_dataset_num_samples: int,
      token: Optional[str] = None,
  ) -> None:
-     if does_file_glob_exist(output_model_id, "openvino/openvino_model_qint8_quantized.xml"):
-         raise FileExistsError("The quantized OpenVINO model already exists in the repository")
+     if does_file_glob_exist(
+         output_model_id, "openvino/openvino_model_qint8_quantized.xml"
+     ):
+         raise FileExistsError(
+             "The quantized OpenVINO model already exists in the repository"
+         )

-     model = SentenceTransformer(model_id, backend="openvino")
+     if archetype == Archetype.SENTENCE_TRANSFORMER:
+         model = SentenceTransformer(model_id, backend="openvino")
+     elif archetype == Archetype.SPARSE_ENCODER:
+         model = SparseEncoder(model_id, backend="openvino")
+     elif archetype == Archetype.CROSS_ENCODER:
+         model = CrossEncoder(model_id, backend="openvino")
+     else:
+         return

      if not create_pr and is_new_model(output_model_id):
          model.push_to_hub(repo_id=output_model_id, token=token)
@@ -459,8 +716,10 @@ def export_to_openvino_static_quantization(
      finally:
          huggingface_hub.upload_folder = original_upload_folder

+
  def export_to_openvino_static_quantization_snippet(
      model_id: str,
+     archetype: Archetype,
      create_pr: bool,
      output_model_id: str,
      ov_quant_dataset_name: str,
@@ -468,18 +727,23 @@ def export_to_openvino_static_quantization_snippet(
      ov_quant_dataset_split: str,
      ov_quant_dataset_column_name: str,
      ov_quant_dataset_num_samples: int,
- ) -> str:
-     return """\
+ ) -> Tuple[str, str, str]:
+     if archetype == Archetype.OTHER:
+         return "", "", ""
+
+     return (
+         """\
  pip install sentence_transformers[openvino]
- """, f"""\
+ """,
+         f"""\
  from sentence_transformers import (
-     SentenceTransformer,
+     {archetype},
      export_static_quantized_openvino_model,
  )
  from optimum.intel import OVQuantizationConfig

  # 1. Load the model to be quantized with the OpenVINO backend
- model = SentenceTransformer(
+ model = {archetype}(
      "{model_id}",
      backend="openvino",
  )
@@ -498,23 +762,37 @@ export_static_quantized_openvino_model(
      push_to_hub=True,
  {''' create_pr=True,
  ''' if create_pr else ''})
- """, f"""\
- from sentence_transformers import SentenceTransformer
+ """,
+         f"""\
+ from sentence_transformers import {archetype}

  # 1. Load the model from the Hugging Face Hub
  # (until merged) Use the `revision` argument to load the model from the PR
  pr_number = 2
- model = SentenceTransformer(
+ model = {archetype}(
      "{output_model_id}",
      revision=f"refs/pr/{{pr_number}}",
      backend="openvino",
      model_kwargs={{"file_name": "openvino_model_qint8_quantized.xml"}},
  )
-
+ """
+         + (
+             """
  # 2. Inference works as normal
  embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
  similarities = model.similarity(embeddings, embeddings)
  """
+             if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
+             else """
+ # 2. Inference works as normal
+ predictions = model.predict([
+     ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
+     ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
+ ])
+ """
+         ),
+     )
+

  def on_submit(
      model_id,
@@ -533,35 +811,67 @@ def on_submit(
      profile: Optional[gr.OAuthProfile] = None,
  ):
      if oauth_token is None or profile is None:
-         return "Commit or PR url:<br>...", inference_snippet, gr.Textbox("Please sign in with Hugging Face to use this Space", visible=True)
+         return (
+             "Commit or PR url:<br>...",
+             inference_snippet,
+             gr.Textbox(
+                 "Please sign in with Hugging Face to use this Space", visible=True
+             ),
+         )

      if not model_id:
-         return "Commit or PR url:<br>...", inference_snippet, gr.Textbox("Please enter a model ID", visible=True)
+         return (
+             "Commit or PR url:<br>...",
+             inference_snippet,
+             gr.Textbox("Please enter a model ID", visible=True),
+         )

      if not is_sentence_transformer_model(model_id):
-         return "Commit or PR url:<br>...", inference_snippet, gr.Textbox("The source model must have a Sentence Transformers tag", visible=True)
+         return (
+             "Commit or PR url:<br>...",
+             inference_snippet,
+             gr.Textbox(
+                 "The source model must have a Sentence Transformers tag", visible=True
+             ),
+         )

      if output_model_id and "/" not in output_model_id:
          output_model_id = f"{profile.name}/{output_model_id}"

      output_model_id = output_model_id if not create_pr else model_id
+     archetype = get_archetype(model_id)

      try:
          if backend == Backend.ONNX.value:
-             export_to_onnx(model_id, create_pr, output_model_id, token=oauth_token.token)
+             export_to_onnx(
+                 model_id, archetype, create_pr, output_model_id, token=oauth_token.token
+             )
          elif backend == Backend.ONNX_DYNAMIC_QUANTIZATION.value:
              export_to_onnx_dynamic_quantization(
-                 model_id, create_pr, output_model_id, onnx_quantization_config, token=oauth_token.token
+                 model_id,
+                 archetype,
+                 create_pr,
+                 output_model_id,
+                 onnx_quantization_config,
+                 token=oauth_token.token,
              )
          elif backend == Backend.ONNX_OPTIMIZATION.value:
              export_to_onnx_optimization(
-                 model_id, create_pr, output_model_id, onnx_optimization_config, token=oauth_token.token
+                 model_id,
+                 archetype,
+                 create_pr,
+                 output_model_id,
+                 onnx_optimization_config,
+                 token=oauth_token.token,
              )
          elif backend == Backend.OPENVINO.value:
-             export_to_openvino(model_id, create_pr, output_model_id, token=oauth_token.token)
+             export_to_openvino(
+                 model_id, archetype, create_pr, output_model_id, token=oauth_token.token
+             )
          elif backend == Backend.OPENVINO_STATIC_QUANTIZATION.value:
              export_to_openvino_static_quantization(
                  model_id,
+                 archetype,
                  create_pr,
                  output_model_id,
                  ov_quant_dataset_name,
@@ -572,19 +882,32 @@ def on_submit(
                  token=oauth_token.token,
              )
      except FileExistsError as exc:
-         return "Commit or PR url:<br>...", inference_snippet, gr.Textbox(str(exc), visible=True)
+         return (
+             "Commit or PR url:<br>...",
+             inference_snippet,
+             gr.Textbox(str(exc), visible=True),
+         )

      if create_pr:
          url, num = get_last_pr(output_model_id)
-         return f"PR url:<br>{url}", inference_snippet.replace("pr_number = 2", f"pr_number = {num}"), gr.Textbox(visible=False)
-
+         return (
+             f"PR url:<br>{url}",
+             inference_snippet.replace("pr_number = 2", f"pr_number = {num}"),
+             gr.Textbox(visible=False),
+         )
+
      # Remove the lines that refer to the revision argument
      lines = inference_snippet.splitlines()
      del lines[7]
      del lines[4]
      del lines[3]
      inference_snippet = "\n".join(lines)
-     return f"Commit url:<br>{get_last_commit(output_model_id)}", inference_snippet, gr.Textbox(visible=False)
+     return (
+         f"Commit url:<br>{get_last_commit(output_model_id)}",
+         inference_snippet,
+         gr.Textbox(visible=False),
+     )
+

  def on_change(
      model_id,
@@ -602,31 +925,44 @@ def on_change(
      profile: Optional[gr.OAuthProfile] = None,
  ) -> str:
      if oauth_token is None or profile is None:
-         return "", "", "", gr.Textbox("Please sign in with Hugging Face to use this Space", visible=True)
+         return (
+             "",
+             "",
+             "",
+             gr.Textbox(
+                 "Please sign in with Hugging Face to use this Space", visible=True
+             ),
+         )

      if not model_id:
          return "", "", "", gr.Textbox("Please enter a model ID", visible=True)
-
+
      if output_model_id and "/" not in output_model_id:
          output_model_id = f"{profile.username}/{output_model_id}"

      output_model_id = output_model_id if not create_pr else model_id
+     archetype = get_archetype(model_id)

      if backend == Backend.ONNX.value:
-         snippets = export_to_onnx_snippet(model_id, create_pr, output_model_id)
+         snippets = export_to_onnx_snippet(
+             model_id, archetype, create_pr, output_model_id
+         )
      elif backend == Backend.ONNX_DYNAMIC_QUANTIZATION.value:
          snippets = export_to_onnx_dynamic_quantization_snippet(
-             model_id, create_pr, output_model_id, onnx_quantization_config
+             model_id, archetype, create_pr, output_model_id, onnx_quantization_config
          )
      elif backend == Backend.ONNX_OPTIMIZATION.value:
          snippets = export_to_onnx_optimization_snippet(
-             model_id, create_pr, output_model_id, onnx_optimization_config
+             model_id, archetype, create_pr, output_model_id, onnx_optimization_config
          )
      elif backend == Backend.OPENVINO.value:
-         snippets = export_to_openvino_snippet(model_id, create_pr, output_model_id)
+         snippets = export_to_openvino_snippet(
+             model_id, archetype, create_pr, output_model_id
+         )
      elif backend == Backend.OPENVINO_STATIC_QUANTIZATION.value:
          snippets = export_to_openvino_static_quantization_snippet(
              model_id,
+             archetype,
              create_pr,
              output_model_id,
              ov_quant_dataset_name,
@@ -637,7 +973,7 @@ def on_change(
          )
      else:
          return "", "", "", gr.Textbox("Unexpected backend!", visible=True)
-
+
      return *snippets, gr.Textbox(visible=False)


@@ -664,34 +1000,75 @@ with gr.Blocks(
      with gr.Row():
          # Left Input Column
          with gr.Column(scale=2):
-
              gr.Markdown(
                  value="""\
- ### Export a Sentence Transformer model to accelerated backends
+ ### Export a SentenceTransformer, SparseEncoder, or CrossEncoder model to accelerated backends

- Sentence Transformers embedding models can be optimized for **faster inference** on CPU and GPU devices by exporting, quantizing, and optimizing them in ONNX and OpenVINO formats.
- Observe the [Speeding up Inference](https://sbert.net/docs/sentence_transformer/usage/efficiency.html) documentation for more information.
+ Sentence Transformers models can be optimized for **faster inference** on CPU and GPU devices by exporting, quantizing, and optimizing them in ONNX and OpenVINO formats.
+ Observe the Speeding up Inference documentation for more information:
+ * [SentenceTransformer > Speeding up Inference](https://sbert.net/docs/sentence_transformer/usage/efficiency.html)
+ * [SparseEncoder > Speeding up Inference](https://sbert.net/docs/sparse_encoder/usage/efficiency.html)
+ * [CrossEncoder > Speeding up Inference](https://sbert.net/docs/cross_encoder/usage/efficiency.html)
  """,
                  label="",
                  container=True,
              )
-             gr.HTML(value="""\
+             gr.HTML(
+                 value="""\
  <details><summary>Click to see performance benchmarks</summary>

  <table>
      <thead>
          <tr>
-             <th>GPU</th>
-             <th>CPU</th>
+             <th>SentenceTransformer GPU</th>
+             <th>SentenceTransformer CPU</th>
          </tr>
      </thead>
      <tbody>
          <tr>
              <td>
-                 <img src="https://huggingface.co/spaces/tomaarsen/backend-export/resolve/main/images/backends_benchmark_gpu.png" alt="">
+                 <img src="https://sbert.net/_images/backends_benchmark_gpu.png" alt="">
              </td>
              <td>
-                 <img src="https://huggingface.co/spaces/tomaarsen/backend-export/resolve/main/images/backends_benchmark_cpu.png" alt="">
+                 <img src="https://sbert.net/_images/backends_benchmark_cpu.png" alt="">
+             </td>
+         </tr>
+     </tbody>
+ </table>
+
+ <table>
+     <thead>
+         <tr>
+             <th>SparseEncoder GPU</th>
+             <th>SparseEncoder CPU</th>
+         </tr>
+     </thead>
+     <tbody>
+         <tr>
+             <td>
+                 <img src="https://sbert.net/_images/se_backends_benchmark_gpu.png" alt="">
+             </td>
+             <td>
+                 <img src="https://sbert.net/_images/se_backends_benchmark_cpu.png" alt="">
+             </td>
+         </tr>
+     </tbody>
+ </table>
+
+ <table>
+     <thead>
+         <tr>
+             <th>CrossEncoder GPU</th>
+             <th>CrossEncoder CPU</th>
+         </tr>
+     </thead>
+     <tbody>
+         <tr>
+             <td>
+                 <img src="https://sbert.net/_images/ce_backends_benchmark_gpu.png" alt="">
+             </td>
+             <td>
+                 <img src="https://sbert.net/_images/ce_backends_benchmark_cpu.png" alt="">
              </td>
          </tr>
      </tbody>
@@ -706,11 +1083,12 @@ Observe the [Speeding up Inference](https://sbert.net/docs/sentence_transformer/
  </ul>

  </details>
- """)
+ """
+             )

              model_id = HuggingfaceHubSearch(
-                 label="Sentence Transformer model to export",
-                 placeholder="Search for Sentence Transformer models on Hugging Face",
+                 label="SentenceTransformer, SparseEncoder, or CrossEncoder model to export",
+                 placeholder="Search for SentenceTransformer, SparseEncoder, or CrossEncoder models on Hugging Face",
                  search_type="model",
              )
              create_pr = gr.Checkbox(
@@ -741,33 +1119,33 @@ Observe the [Speeding up Inference](https://sbert.net/docs/sentence_transformer/
                  gr.Markdown(
                      value="[ONNX Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#onnx)",
                      container=True,
-                     elem_classes=["small-text"]
+                     elem_classes=["small-text"],
                  )
              with gr.Group(visible=False) as onnx_dynamic_quantization_group:
                  onnx_quantization_config = gr.Radio(
                      choices=["arm64", "avx2", "avx512", "avx512_vnni"],
                      value="avx512_vnni",
                      label="Quantization config",
-                     info="[ONNX Quantization Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#quantizing-onnx-models)"
+                     info="[ONNX Quantization Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#quantizing-onnx-models)",
                  )
              with gr.Group(visible=False) as onnx_optimization_group:
                  onnx_optimization_config = gr.Radio(
                      choices=["O1", "O2", "O3", "O4"],
                      value="O4",
                      label="Optimization config",
-                     info="[ONNX Optimization Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#optimizing-onnx-models)"
+                     info="[ONNX Optimization Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#optimizing-onnx-models)",
                  )
              with gr.Group(visible=False) as openvino_group:
                  gr.Markdown(
                      value="[OpenVINO Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#openvino)",
                      container=True,
-                     elem_classes=["small-text"]
+                     elem_classes=["small-text"],
                  )
              with gr.Group(visible=False) as openvino_static_quantization_group:
                  gr.Markdown(
                      value="[OpenVINO Quantization Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#quantizing-openvino-models)",
                      container=True,
-                     elem_classes=["small-text"]
+                     elem_classes=["small-text"],
                  )
              ov_quant_dataset_name = HuggingfaceHubSearch(
                  value="nyu-mll/glue",
 
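Every export function above gains the same dispatch: detect the model's archetype from its config files, then load it with the matching class before exporting. Condensed as a sketch (app.py itself uses the if/elif chains shown in the diff; the mapping form here is only illustrative):

```python
# Illustrative condensation of the archetype dispatch added in this commit.
from sentence_transformers import CrossEncoder, SentenceTransformer, SparseEncoder

ARCHETYPE_TO_CLASS = {
    "SentenceTransformer": SentenceTransformer,
    "SparseEncoder": SparseEncoder,
    "CrossEncoder": CrossEncoder,
}


def load_for_export(model_id: str, archetype: str, backend: str):
    # Returns None for the "Other" archetype, mirroring the early `return`
    # in each export function above.
    cls = ARCHETYPE_TO_CLASS.get(archetype)
    if cls is None:
        return None
    return cls(model_id, backend=backend)
```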
images/backends_benchmark_cpu.png DELETED
Binary file (63.2 kB)
 
images/backends_benchmark_gpu.png DELETED
Binary file (59.9 kB)
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
- sentence_transformers[onnx-gpu,openvino]==3.3.0
+ sentence_transformers[onnx-gpu,openvino]==5.1.0
  onnx==1.16.1
  https://huggingface.co/spaces/CISCai/chat-template-editor/resolve/08c8e90c53677ae70c66b3d90bf4e63a173b5505/gradio_huggingfacehub_search-0.0.8-py3-none-any.whl
- gradio[oauth]==5.5.0
- huggingface_hub==0.26.2
+ gradio[oauth]==5.42.0
+ huggingface_hub==0.34.4
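The sentence_transformers pin jumps from 3.3.0 to 5.1.0 because SparseEncoder was only introduced in the 5.x line. A quick smoke test under the new pins might look like this (the model ID is a placeholder for any sparse embedding model on the Hub):

```python
# Hypothetical smoke test: SparseEncoder requires sentence_transformers >= 5.0.
from sentence_transformers import SparseEncoder

model = SparseEncoder("naver/splade-cocondenser-ensembledistil", backend="onnx")
embeddings = model.encode(["The weather is lovely today."])
print(embeddings.shape)
```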