ariG23498 HF Staff commited on
Commit
20169cb
·
1 Parent(s): a3e6d78
Files changed (2) hide show
  1. app.py +26 -16
  2. requirements.txt +2 -1
app.py CHANGED
@@ -13,27 +13,37 @@ model = AutoModelForImageTextToText.from_pretrained(
13
 
14
  @spaces.GPU
15
  def process_inputs(image, audio):
16
- # Prepare inputs for the model
17
- inputs = processor(
18
- images=image,
19
- audio=audio,
20
- return_tensors="pt"
21
- ).to(model.device, dtype=model.dtype)
 
 
22
 
23
- # Generate text output
 
 
 
 
 
 
 
 
 
24
  with torch.inference_mode:
25
- outputs = model.generate(
26
- **inputs,
27
- max_new_tokens=256
 
28
  )
29
-
30
- # Decode and return text
31
- text = processor.batch_decode(
32
- outputs,
33
  skip_special_tokens=True,
34
  clean_up_tokenization_spaces=True
35
- )[0]
36
- return text
37
 
38
  # Gradio interface
39
  iface = gr.Interface(
 
13
 
14
@spaces.GPU
def process_inputs(image, audio, max_tokens=256):
    """Run the image+audio chat model and return the generated text.

    Args:
        image: Image input as accepted by the processor's chat template
            (e.g. a PIL image or path) — assumed from the Gradio inputs;
            TODO confirm against the ``gr.Interface`` definition below.
        audio: Audio input as accepted by the processor's chat template.
        max_tokens: Maximum number of new tokens to generate (default 256,
            matching the pre-refactor behavior).

    Returns:
        The decoded model response for the single prompt, with special
        tokens stripped.
    """
    # Single-turn chat message carrying both modalities.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "audio", "audio": audio},
            ],
        },
    ]

    # NOTE: `processor` and `model` are module-level globals defined at the
    # top of app.py — this is a plain function, so the original `self.`
    # references were a NameError and have been removed.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Remember the prompt length so we can slice it off the generated ids.
    input_len = inputs["input_ids"].shape[-1]

    inputs = inputs.to(model.device, dtype=model.dtype)

    # inference_mode must be *called* to obtain the context manager;
    # bare `torch.inference_mode` raised at runtime.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            disable_compile=True,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    text = processor.batch_decode(
        outputs[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return text[0]
47
 
48
  # Gradio interface
49
  iface = gr.Interface(
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  spaces
2
  gradio
3
- transformers==4.53.0
 
 
1
  spaces
2
  gradio
3
+ transformers==4.53.0
4
+ timm==1.0.16