from .payload_model import SingleInferencePayload, VideoInferencePayload
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch


class Qwen2_5:
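    """Wrapper around Qwen2.5-VL exposing single-image and video Q&A inference."""
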
    def __init__(self, model_path: str):
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_path, torch_dtype="auto", device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.processor = AutoProcessor.from_pretrained(model_path)

    def prepare_single_inference(self, image: str, question: str):
        """Build processor inputs for a base64-encoded image and a question."""
        # qwen_vl_utils accepts base64 images via the data-URI form below.
        image = f"data:image;base64,{image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        # The content type must be "image" (not "text") for
                        # process_vision_info to pick up the image.
                        "type": "image",
                        "image": image,
                    },
                    {
                        "type": "text",
                        "text": question
                    },
                ],
            }
        ]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Move inputs to wherever device_map="auto" placed the model.
        inputs = inputs.to(self.model.device)

        return inputs
    
    def prepare_video_inference(self, video: list[str], question: str):
        """Build processor inputs for a list of base64-encoded frames and a question."""
        # Each frame arrives as a raw base64 string; wrap each in the data-URI form.
        base64_videos = [f"data:image;base64,{frame}" for frame in video]
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": base64_videos,
                    },
                    {
                        "type": "text",
                        "text": question
                    },
                ],
            }
        ]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # With return_video_kwargs=True, process_vision_info already returns the
        # video kwargs (including fps); hardcoding fps=1.0 here as well would
        # raise "got multiple values for keyword argument 'fps'".
        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
            **video_kwargs,
        )
        # Move inputs to wherever device_map="auto" placed the model.
        inputs = inputs.to(self.model.device)
        return inputs

    def get_single_inference(self, payload: SingleInferencePayload):
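        """Run single-image inference and return a message/status dict."""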
        try:
            processed_inputs = self.prepare_single_inference(payload.image_path, payload.question)
            generated_ids = self.model.generate(**processed_inputs, max_new_tokens=128)
            # Trim the echoed prompt tokens so only newly generated text is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(processed_inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            print(f"Model generated text: {output_text}")
            return {
                "message": output_text,
                "status": 200
            }
        except Exception as e:
            return {
                "message": str(e),
                "status": 500
            }

    def get_video_inference(self, payload: VideoInferencePayload):
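        """Run video (frame-list) inference and return a message/status dict."""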
        try:
            processed_inputs = self.prepare_video_inference(payload.video_path, payload.question)
            generated_ids = self.model.generate(**processed_inputs, max_new_tokens=128)
            # Trim the echoed prompt tokens, as in get_single_inference.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(processed_inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            print(f"Model generated text: {output_text}")
            return {
                "message": output_text,
                "status": 200
            }
        except Exception as e:
            return {
                "message": str(e),
                "status": 500
            }
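

# --- Minimal usage sketch (not part of the original module) ------------------
# Assumptions, labeled hypothetical: the checkpoint id and "example.jpg" are
# placeholders, and SingleInferencePayload is a model whose `image_path` field
# carries the base64-encoded image bytes (as the attribute access in
# get_single_inference implies). Run with `python -m <package>.<this_module>`
# so the relative import at the top resolves.
if __name__ == "__main__":
    import base64

    # Base64-encode a local image; prepare_single_inference adds the data-URI prefix.
    with open("example.jpg", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    qwen = Qwen2_5("Qwen/Qwen2.5-VL-3B-Instruct")  # placeholder checkpoint path
    payload = SingleInferencePayload(image_path=image_b64, question="Describe this image.")
    print(qwen.get_single_inference(payload))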