Morgan Funtowicz
committed on
Commit
·
4dad7a7
1
Parent(s):
a6ada40
fix: wrong usage computation
Browse files - handler.py +5 -5
handler.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import platform
|
2 |
from typing import Tuple, List
|
3 |
|
4 |
-
import numpy as np
|
5 |
import torch
|
6 |
from hfendpoints.http import Context, run
|
7 |
from hfendpoints.tasks import Usage
|
@@ -48,13 +47,13 @@ def get_cores_pinning_strategy() -> "CPUPool":
|
|
48 |
return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
|
49 |
|
50 |
|
51 |
-
def get_usage(mask: torch.IntTensor) -> Usage:
|
52 |
"""
|
53 |
Compute the number of processed tokens and return as Usage object matching OpenAI
|
54 |
:param mask: Attention mask tensor, as returned by the model
|
55 |
:return: Usage object matching OpenAI specifications
|
56 |
"""
|
57 |
-
num_tokens =
|
58 |
return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
|
59 |
|
60 |
|
@@ -68,7 +67,7 @@ class SentenceTransformerWithUsage(Module):
|
|
68 |
def forward(self, sentences: list[str]) -> Tuple[List[List[int]], List[List[int]]]:
|
69 |
vectors = self._model.encode(sentences, output_value=None)
|
70 |
return (
|
71 |
-
[vector['attention_mask']
|
72 |
[vector['sentence_embedding'].tolist() for vector in vectors]
|
73 |
)
|
74 |
|
@@ -124,6 +123,7 @@ class SentenceTransformerHandler(Handler):
|
|
124 |
|
125 |
# TODO: Change the way we return usage
|
126 |
usage = get_usage(mask)
|
|
|
127 |
return EmbeddingResponse(embeddings=vectors, num_tokens=usage.total_tokens)
|
128 |
|
129 |
|
@@ -137,7 +137,7 @@ def entrypoint():
|
|
137 |
handler = SentenceTransformerHandler(config)
|
138 |
|
139 |
# Allocate endpoint
|
140 |
-
from hfendpoints.
|
141 |
endpoint = EmbeddingEndpoint(handler)
|
142 |
run(endpoint, config.interface, config.port)
|
143 |
|
|
|
1 |
import platform
|
2 |
from typing import Tuple, List
|
3 |
|
|
|
4 |
import torch
|
5 |
from hfendpoints.http import Context, run
|
6 |
from hfendpoints.tasks import Usage
|
|
|
47 |
return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
|
48 |
|
49 |
|
50 |
+
def get_usage(mask: List[torch.IntTensor]) -> Usage:
|
51 |
"""
|
52 |
Compute the number of processed tokens and return as Usage object matching OpenAI
|
53 |
:param mask: Attention mask tensor, as returned by the model
|
54 |
:return: Usage object matching OpenAI specifications
|
55 |
"""
|
56 |
+
num_tokens = sum(x.sum().detach().item() for x in mask)
|
57 |
return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
|
58 |
|
59 |
|
|
|
67 |
def forward(self, sentences: list[str]) -> Tuple[List[List[int]], List[List[int]]]:
|
68 |
vectors = self._model.encode(sentences, output_value=None)
|
69 |
return (
|
70 |
+
[vector['attention_mask'] for vector in vectors],
|
71 |
[vector['sentence_embedding'].tolist() for vector in vectors]
|
72 |
)
|
73 |
|
|
|
123 |
|
124 |
# TODO: Change the way we return usage
|
125 |
usage = get_usage(mask)
|
126 |
+
vectors = vectors if request.is_batched else vectors[0]
|
127 |
return EmbeddingResponse(embeddings=vectors, num_tokens=usage.total_tokens)
|
128 |
|
129 |
|
|
|
137 |
handler = SentenceTransformerHandler(config)
|
138 |
|
139 |
# Allocate endpoint
|
140 |
+
from hfendpoints.openai.embedding import EmbeddingEndpoint
|
141 |
endpoint = EmbeddingEndpoint(handler)
|
142 |
run(endpoint, config.interface, config.port)
|
143 |
|