Morgan Funtowicz committed on
Commit
4dad7a7
·
1 Parent(s): a6ada40

fix: wrong usage computation

Browse files
Files changed (1) hide show
  1. handler.py +5 -5
handler.py CHANGED
@@ -1,7 +1,6 @@
1
  import platform
2
  from typing import Tuple, List
3
 
4
- import numpy as np
5
  import torch
6
  from hfendpoints.http import Context, run
7
  from hfendpoints.tasks import Usage
@@ -48,13 +47,13 @@ def get_cores_pinning_strategy() -> "CPUPool":
48
  return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
49
 
50
 
51
- def get_usage(mask: torch.IntTensor) -> Usage:
52
  """
53
  Compute the number of processed tokens and return as Usage object matching OpenAI
54
  :param mask: Attention mask tensor, as returned by the model
55
  :return: Usage object matching OpenAI specifications
56
  """
57
- num_tokens = np.sum(mask)
58
  return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
59
 
60
 
@@ -68,7 +67,7 @@ class SentenceTransformerWithUsage(Module):
68
  def forward(self, sentences: list[str]) -> Tuple[List[List[int]], List[List[int]]]:
69
  vectors = self._model.encode(sentences, output_value=None)
70
  return (
71
- [vector['attention_mask'].tolist() for vector in vectors],
72
  [vector['sentence_embedding'].tolist() for vector in vectors]
73
  )
74
 
@@ -124,6 +123,7 @@ class SentenceTransformerHandler(Handler):
124
 
125
  # TODO: Change the way we return usage
126
  usage = get_usage(mask)
 
127
  return EmbeddingResponse(embeddings=vectors, num_tokens=usage.total_tokens)
128
 
129
 
@@ -137,7 +137,7 @@ def entrypoint():
137
  handler = SentenceTransformerHandler(config)
138
 
139
  # Allocate endpoint
140
- from hfendpoints.hfinference.embedding import EmbeddingEndpoint
141
  endpoint = EmbeddingEndpoint(handler)
142
  run(endpoint, config.interface, config.port)
143
 
 
1
  import platform
2
  from typing import Tuple, List
3
 
 
4
  import torch
5
  from hfendpoints.http import Context, run
6
  from hfendpoints.tasks import Usage
 
47
  return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
48
 
49
 
50
+ def get_usage(mask: List[torch.IntTensor]) -> Usage:
51
  """
52
  Compute the number of processed tokens and return as Usage object matching OpenAI
53
  :param mask: Attention mask tensor, as returned by the model
54
  :return: Usage object matching OpenAI specifications
55
  """
56
+ num_tokens = sum(x.sum().detach().item() for x in mask)
57
  return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
58
 
59
 
 
67
  def forward(self, sentences: list[str]) -> Tuple[List[List[int]], List[List[int]]]:
68
  vectors = self._model.encode(sentences, output_value=None)
69
  return (
70
+ [vector['attention_mask'] for vector in vectors],
71
  [vector['sentence_embedding'].tolist() for vector in vectors]
72
  )
73
 
 
123
 
124
  # TODO: Change the way we return usage
125
  usage = get_usage(mask)
126
+ vectors = vectors if request.is_batched else vectors[0]
127
  return EmbeddingResponse(embeddings=vectors, num_tokens=usage.total_tokens)
128
 
129
 
 
137
  handler = SentenceTransformerHandler(config)
138
 
139
  # Allocate endpoint
140
+ from hfendpoints.openai.embedding import EmbeddingEndpoint
141
  endpoint = EmbeddingEndpoint(handler)
142
  run(endpoint, config.interface, config.port)
143