Coool2 committed
Commit 979a803 · 1 Parent(s): ec692d1

Update agent.py

Files changed (1):
  1. agent.py +63 -10
agent.py CHANGED
@@ -26,7 +26,6 @@ from llama_index.core.schema import ImageNode, TextNode
 # LlamaIndex specialized imports
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from llama_index.llms.huggingface import HuggingFaceLLM
-from llama_index.multi_modal_llms.huggingface import HuggingFaceMultiModal
 from llama_index.readers.assemblyai import AssemblyAIAudioTranscriptReader
 from llama_index.readers.json import JSONReader
 from llama_index.readers.web import BeautifulSoupWebReader
@@ -121,16 +120,70 @@ def initialize_models(use_api_mode=False):
     print("Initializing models in non-API mode with local models...")
 
     try:
-        proj_llm = HuggingFaceMultiModal.from_model_name(
-            "Qwen/Qwen2.5-VL-7B-Instruct",
-            temperature=0.7,
-            top_p=0.9,
-            top_k=40,
-            max_new_tokens=5120,
-            device_map="auto",
-            model_kwargs={"torch_dtype": "auto"}
+        from typing import Any, Optional, List, Mapping
+        from llama_index.core.llms import CustomLLM, CompletionResponse, CompletionResponseGen, LLMMetadata
+        from llama_index.core.llms.callbacks import llm_completion_callback
+        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+        from qwen_vl_utils import process_vision_info
+        import torch
+
+        class QwenVL7BCustomLLM(CustomLLM):
+            context_window: int = 32768
+            num_output: int = 256
+            model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"
+
+            def __init__(self, device: str = "cuda"):
+                self.device = device
+                self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                    self.model_name, torch_dtype=torch.bfloat16, device_map="auto"
                 )
-
+                self.processor = AutoProcessor.from_pretrained(self.model_name)
+
+            @property
+            def metadata(self) -> LLMMetadata:
+                return LLMMetadata(
+                    context_window=self.context_window,
+                    num_output=self.num_output,
+                    model_name=self.model_name,
+                )
+
+            @llm_completion_callback()
+            def complete(self, prompt: str, image_paths: Optional[List[str]] = None, **kwargs: Any) -> CompletionResponse:
+                # Prepare messages for multimodal input
+                messages = [{"role": "user", "content": []}]
+                if image_paths:
+                    for path in image_paths:
+                        messages[0]["content"].append({"type": "image", "image": path})
+                messages[0]["content"].append({"type": "text", "text": prompt})
+
+                # Process inputs
+                text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                image_inputs, video_inputs = process_vision_info(messages)
+                inputs = self.processor(
+                    text=[text],
+                    images=image_inputs,
+                    videos=video_inputs,
+                    padding=True,
+                    return_tensors="pt",
+                )
+                inputs = inputs.to(self.model.device)
+
+                # Generate output
+                generated_ids = self.model.generate(**inputs, max_new_tokens=self.num_output)
+                generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+                output_text = self.processor.batch_decode(
+                    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+                )[0]
+                return CompletionResponse(text=output_text)
+
+            @llm_completion_callback()
+            def stream_complete(self, prompt: str, image_paths: Optional[List[str]] = None, **kwargs: Any) -> CompletionResponseGen:
+                response = self.complete(prompt, image_paths)
+                for token in response.text:
+                    yield CompletionResponse(text=token, delta=token)
+
+
+        proj_llm = QwenVL7BCustomLLM()
 
         # Code LLM
         code_llm = HuggingFaceLLM(
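
For context, the new wrapper is meant to drop in wherever LlamaIndex expects an LLM: `complete` handles both text-only and image-grounded prompts, and `stream_complete` replays the finished completion character by character. Below is a minimal usage sketch, not part of the commit; the prompts and "figure.png" path are placeholders. One caveat worth hedging on: `CustomLLM` subclasses are Pydantic models, so depending on the LlamaIndex version the plain attribute assignments in `__init__` above may need to be declared as `PrivateAttr` fields for instantiation to succeed.

# Sketch only: exercises the QwenVL7BCustomLLM interface added in this commit.
llm = QwenVL7BCustomLLM()

# Text-only completion
print(llm.complete("Summarize this repository in one sentence.").text)

# Multimodal completion; image_paths is the keyword this class defines,
# and "figure.png" is a placeholder path.
print(llm.complete("What does this figure show?", image_paths=["figure.png"]).text)

# Character-level streaming, as implemented by stream_complete
for chunk in llm.stream_complete("Hello"):
    print(chunk.delta, end="")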