Spaces:

sunbv56
/

demo-qwen2.5-vl-vqa-vibook

Sleeping

App Files Files Community

sunbv56 committed on Jun 22

Commit

7936364

verified ·

1 Parent(s): b8ddda5

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -102

app.py CHANGED Viewed

@@ -1,133 +1,124 @@
-# app.py
 import gradio as gr
 import torch
 from PIL import Image
-from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
 from gradio.events import SelectData
 import warnings
 import os
 import requests
 warnings.filterwarnings("ignore", category=UserWarning, message="Overriding torch_dtype=None")
-# --- 1. Load the model and processor (optimized) ---
 MODEL_ID = "sunbv56/qwen2.5-vl-vqa-vibook"
-print(f"🚀 Đang tải model '{MODEL_ID}' và processor với các tối ưu hóa...")
-# ### OPTIMIZATION 1: 4-bit quantization config ###
-# Use 4-bit quantization to speed up inference and reduce VRAM usage.
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
 try:
-    # ### OPTIMIZATION 2: load the model with quantization (Flash Attention 2 optional) ###
     model = AutoModelForImageTextToText.from_pretrained(
-        MODEL_ID,
-        device_map="auto",
         trust_remote_code=True,
-        quantization_config=quantization_config,
-        # attn_implementation="flash_attention_2" # Uncomment this line if you have a compatible GPU (NVIDIA 30xx/40xx) and flash-attn installed
     )
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=True)
-    # ### OPTIMIZATION 3: compile the model with torch.compile ###
-    # The first run takes a while to compile; later runs are expected to be faster.
-    # Author's note: only works on Linux/macOS with PyTorch 2.0+ and a GPU.
-    # NOTE(review): torch.compile() defers compilation to the first call, so this
-    # inner try/except presumably only catches setup errors - confirm.
-    try:
-        print("🚀 Đang cố gắng biên dịch model với torch.compile()...")
-        model = torch.compile(model, mode="reduce-overhead", fullgraph=True)
-        print("✅ Biên dịch model thành công!")
-    except Exception as e:
-        print(f"⚠️ Không thể biên dịch model: {e}. Chạy ở chế độ thông thường.")
     model.eval()
-    print(f"✅ Model và processor đã được tải và tối ưu thành công!")
 except Exception as e:
-    print(f"❌ Lỗi khi tải model/processor: {e}")
     # NOTE(review): exit() with no argument ends the process with status 0 even
     # though loading failed.
     exit()
-# --- 2. Hàm Inference Cốt lõi (Không cần thay đổi) ---
-# Các tối ưu đã được áp dụng ở tầng model, nên hàm này sẽ tự động chạy nhanh hơn.
-def process_vqa(image: Image.Image, question: str):
-    """Answer one question about one image with the module-level model.
-
-    Converts the image to RGB when needed, builds a single-turn chat prompt
-    holding an image slot plus the question, runs non-sampling generation
-    (do_sample=False, at most 128 new tokens) and returns the decoded
-    continuation with surrounding whitespace stripped.
-    """
-    if image.mode != "RGB":
-        image = image.convert("RGB")
-    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]}]
-    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    model_inputs = processor(text=[prompt_text], images=[image], return_tensors="pt").to(model.device)
-    # Use torch.no_grad() to disable gradient tracking, saving memory and time.
-    with torch.no_grad():
-        generated_ids = model.generate(
-            **model_inputs,
-            max_new_tokens=128,
-            do_sample=False,
-            temperature=1.0,
-            eos_token_id=processor.tokenizer.eos_token_id,
-            pad_token_id=processor.tokenizer.pad_token_id
-        )
-    # Keep only the tokens that come after the prompt length.
     generated_ids = generated_ids[:, model_inputs['input_ids'].shape[1]:]
-    response = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
-    return response
-# --- 3. Logic Chatbot (Giữ nguyên) ---
-THINKING_HTML = """
-<div class="typing-indicator">
-    <span></span>
-    <span></span>
-    <span></span>
-</div>
-"""
-CUSTOM_CSS = """
-@keyframes blink {
-    0% { opacity: .2; }
-    20% { opacity: 1; }
-    100% { opacity: .2; }
-}
-.typing-indicator {
-    display: flex;
-    align-items: center;
-    justify-content: flex-start;
-    padding: 8px 0;
-}
-.typing-indicator span {
-    height: 10px;
-    width: 10px;
-    margin: 0 2px;
-    background-color: #9E9E9E;
-    border-radius: 50%;
-    animation: blink 1.4s infinite both;
-}
-.typing-indicator span:nth-child(2) {
-    animation-delay: .2s;
-}
-.typing-indicator span:nth-child(3) {
-    animation-delay: .4s;
-}
-"""
-def manual_chat_responder(user_question: str, chat_history: list, uploaded_image: Image.Image):
-    """Answer a typed question about the uploaded image, updating the chat in two steps.
-
-    This is a generator: it first yields the history with a typing-indicator
-    placeholder as the assistant message, then yields it again once the
-    placeholder has been replaced by the model's answer. Every yielded value
-    is the pair ("", chat_history). `chat_history` is mutated in place.
-
-    NOTE(review): because the body contains `yield`, the early
-    `return "", chat_history` statements only stop the generator; the returned
-    pair is never yielded, so those paths emit no output update.
-    """
-    # Guard: an image must be uploaded before a question can be asked.
     if uploaded_image is None:
         gr.Warning("Vui lòng tải ảnh lên trước để đặt câu hỏi về nó.")
-        return "", chat_history
-    # Guard: reject empty or whitespace-only questions.
     if not user_question or not user_question.strip():
         gr.Warning("Vui lòng nhập một câu hỏi.")
-        return "", chat_history
     chat_history.append({"role": "user", "content": user_question})
     chat_history.append({"role": "assistant", "content": THINKING_HTML})
-    # First update: show the question and the typing indicator.
-    yield "", chat_history
-    bot_response = process_vqa(uploaded_image, user_question)
-    # Second update: swap the placeholder for the real answer.
     chat_history[-1]["content"] = bot_response
-    yield "", chat_history
 def run_example(evt: SelectData):
     selected_example = example_list[evt.index]
     image_path, question = selected_example
@@ -140,7 +131,7 @@ def run_example(evt: SelectData):
     ]
     yield image, question, chat_history
-    bot_response = process_vqa(image, question)
     chat_history[-1]["content"] = bot_response
     yield image, question, chat_history
@@ -148,9 +139,9 @@ def run_example(evt: SelectData):
 def clear_chat():
-    """Return an empty list, used as the new (empty) chatbot value."""
     return []
-# --- 4. Giao diện Người dùng Gradio (Giữ nguyên) ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="Vibook VQA Chatbot", css=CUSTOM_CSS) as demo:
-    gr.Markdown("# 🤖 Vibook VQA Chatbot")
     example_list = [
         ["./assets/book_example_1.jpg", "Đâu là tên đúng của cuốn sách này?"],
@@ -169,13 +160,20 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), ti
             chatbot = gr.Chatbot(label="Cuộc trò chuyện", height=600, avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"), type="messages", value=[])
             question_input = gr.Textbox(label="Hoặc nhập câu hỏi về ảnh đã tải lên", placeholder="Nhập câu hỏi và nhấn Enter...", container=False, scale=7)
-    question_input.submit(fn=manual_chat_responder, inputs=[question_input, chatbot, image_input], outputs=[question_input, chatbot])
     example_dataset.select(fn=run_example, inputs=None, outputs=[image_input, question_input, chatbot], show_progress="full")
     image_input.upload(fn=clear_chat, inputs=None, outputs=[chatbot])
     image_input.clear(fn=clear_chat, inputs=None, outputs=[chatbot])
-# --- Phần cuối (Giữ nguyên) ---
-if __name__ == "__main__":
     ASSETS_DIR = "assets"
     if not os.path.exists(ASSETS_DIR):
         os.makedirs(ASSETS_DIR)
@@ -200,4 +198,6 @@ if __name__ == "__main__":
             except requests.exceptions.RequestException as e:
                 print(f" Lỗi khi tải {filename}: {e}")
     demo.launch(debug=True)

+# app_optimized.py
 import gradio as gr
 import torch
 from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
 from gradio.events import SelectData
 import warnings
 import os
 import requests
+from typing import List
 warnings.filterwarnings("ignore", category=UserWarning, message="Overriding torch_dtype=None")
+# --- 1. Load the model and processor ---
 MODEL_ID = "sunbv56/qwen2.5-vl-vqa-vibook"
+print(f"🚀 Đang tải model '{MODEL_ID}' và processor...")
+# Pick the dtype from the hardware that is actually present.
+# bitsandbytes 8-bit loading is built around CUDA kernels, so it must not be
+# switched on precisely when no GPU exists, and float16 is poorly supported by
+# CPU kernels. CPU hosts therefore load plain float32 weights; GPU hosts use
+# bfloat16 when supported and float16 otherwise.
+# `device_map="auto"` lets accelerate place the weights on the available device.
+use_gpu = torch.cuda.is_available()
+if use_gpu:
+    model_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+else:
+    model_dtype = torch.float32
 try:
     model = AutoModelForImageTextToText.from_pretrained(
+        MODEL_ID,
+        device_map="auto",
         trust_remote_code=True,
+        torch_dtype=model_dtype,
     )
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=True)
     model.eval()
+    # torch.compile() is best-effort: a failure here must not stop the app, so
+    # it gets its own try/except and falls back to the eager model.
+    # `fullgraph=True` is intentionally not used: it turns every graph break
+    # into a hard error, and compilation is deferred to the first forward
+    # pass, i.e. the error would surface inside a user request.
+    try:
+        print("🚀 Đang biên dịch model với torch.compile()...")
+        model = torch.compile(model, mode="reduce-overhead")
+        print("✅ Model đã được biên dịch.")
+    except Exception as e:
+        print(f"⚠️ Không thể biên dịch model: {e}. Chạy ở chế độ thông thường.")
+    print(f"✅ Model và processor đã được tải và tối ưu hóa thành công!")
 except Exception as e:
+    print(f"❌ Lỗi khi tải/tối ưu hóa model/processor: {e}")
+    # Exit with a non-zero status so the hosting platform sees the failure;
+    # the bare exit() helper reported success (status 0).
+    raise SystemExit(1)
+# --- 2. Core inference function, batched ---
+def process_vqa_batch(images: List[Image.Image], questions: List[str]):
+    """Answer a batch of (image, question) pairs with one generate() call.
+
+    Args:
+        images: One PIL image per request; non-RGB images are converted to RGB.
+        questions: The question for the image at the same index.
+
+    Returns:
+        A list of answer strings in input order, each stripped of surrounding
+        whitespace. An empty batch returns an empty list.
+
+    Raises:
+        ValueError: If `images` and `questions` differ in length.
+    """
+    if len(images) != len(questions):
+        # zip() would otherwise silently drop the unmatched tail.
+        raise ValueError(
+            f"images and questions must have the same length, got {len(images)} and {len(questions)}"
+        )
+    if not images:
+        return []
+    prompts = []
+    processed_images = []
+    # Build one chat prompt (image slot + question) per request.
+    for image, question in zip(images, questions):
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]}]
+        prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        prompts.append(prompt_text)
+        processed_images.append(image)
+    # Batched generation needs left padding: every prompt must end at the same
+    # column so that new tokens directly follow the prompt and the slice below
+    # removes exactly the prompt for every row.
+    processor.tokenizer.padding_side = "left"
+    model_inputs = processor(text=prompts, images=processed_images, return_tensors="pt", padding=True).to(model.device)
+    # Inference only: make the no-gradient guard explicit, as the previous
+    # single-item implementation did.
+    with torch.no_grad():
+        generated_ids = model.generate(
+            **model_inputs,
+            max_new_tokens=128,
+            do_sample=False,
+            temperature=1.0,
+            eos_token_id=processor.tokenizer.eos_token_id,
+            pad_token_id=processor.tokenizer.pad_token_id
+        )
+    # Keep only the tokens generated after the (padded) prompt length.
     generated_ids = generated_ids[:, model_inputs['input_ids'].shape[1]:]
+    responses = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    return [res.strip() for res in responses]
+# --- 3. Logic Chatbot ---
+THINKING_HTML = """<div class="typing-indicator"><span></span><span></span><span></span></div>"""
+CUSTOM_CSS = """@keyframes blink{0%{opacity:.2}20%{opacity:1}100%{opacity:.2}}.typing-indicator{display:flex;align-items:center;justify-content:flex-start;padding:8px 0}.typing-indicator span{height:10px;width:10px;margin:0 2px;background-color:#9E9E9E;border-radius:50%;animation:blink 1.4s infinite both}.typing-indicator span:nth-child(2){animation-delay:.2s}.typing-indicator span:nth-child(3){animation-delay:.4s}"""
+# *** Batched chat handler, compatible with Gradio's `batch=True` listeners ***
+def manual_chat_responder(user_questions: List[str], chat_histories: List[list], uploaded_images: List[Image.Image]):
+    """Answer every queued question in the batch and return per-output lists.
+
+    A Gradio batch function receives one list per input component and must
+    return one list per output component, each with one entry per request.
+    The previous body handled only element 0 and returned a single
+    `[("", chat_history)]`, which has the wrong shape and dropped every other
+    request in the batch.
+
+    Args:
+        user_questions: The textbox value of each request.
+        chat_histories: The chatbot message list of each request.
+        uploaded_images: The uploaded image of each request (None if missing).
+
+    Returns:
+        A pair `(cleared_inputs, updated_histories)`: an empty string per
+        request (clears the textbox) and the updated message list per request.
+        Requests without an image or with a blank question get a warning and
+        keep their history unchanged.
+    """
+    cleared_inputs = [""] * len(user_questions)
+    # Work on copies so a request's stored history is not mutated in place.
+    updated_histories = [list(history or []) for history in chat_histories]
+    pending = []
+    for index, (question, image) in enumerate(zip(user_questions, uploaded_images)):
+        if image is None:
+            gr.Warning("Vui lòng tải ảnh lên trước để đặt câu hỏi về nó.")
+            continue
+        if not question or not question.strip():
+            gr.Warning("Vui lòng nhập một câu hỏi.")
+            continue
+        pending.append(index)
+    if pending:
+        # One model call for all valid requests of this batch.
+        answers = process_vqa_batch(
+            [uploaded_images[i] for i in pending],
+            [user_questions[i] for i in pending],
+        )
+        # Batch functions cannot yield intermediate updates, so the final
+        # answer is appended directly instead of a typing-indicator placeholder.
+        for i, answer in zip(pending, answers):
+            updated_histories[i].append({"role": "user", "content": user_questions[i]})
+            updated_histories[i].append({"role": "assistant", "content": answer})
+    return cleared_inputs, updated_histories
+# Hàm chạy ví dụ không cần batching vì nó chỉ là một hành động đơn lẻ
 def run_example(evt: SelectData):
     selected_example = example_list[evt.index]
     image_path, question = selected_example
     ]
     yield image, question, chat_history
+    bot_response = process_vqa_batch([image], [question])[0]
     chat_history[-1]["content"] = bot_response
     yield image, question, chat_history
 def clear_chat():
+    """Return an empty list, used as the new (empty) chatbot value."""
     return []
+# --- 4. Định nghĩa Giao diện Người dùng Gradio ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="Vibook VQA Chatbot", css=CUSTOM_CSS) as demo:
+    gr.Markdown("# 🤖 Vibook VQA Chatbot (Optimized)")
     example_list = [
         ["./assets/book_example_1.jpg", "Đâu là tên đúng của cuốn sách này?"],
             chatbot = gr.Chatbot(label="Cuộc trò chuyện", height=600, avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"), type="messages", value=[])
             question_input = gr.Textbox(label="Hoặc nhập câu hỏi về ảnh đã tải lên", placeholder="Nhập câu hỏi và nhấn Enter...", container=False, scale=7)
+    # --- 5. Event handling with request batching ---
+    # Batching is configured on the listener itself: `batch=True` makes Gradio
+    # pass a list per input (up to `max_batch_size` queued requests) and expect
+    # a list per output in return. The value returned by `.submit()` has no
+    # `.batch()` method, so chaining one fails while the UI is being built.
+    question_input.submit(
+        fn=manual_chat_responder,
+        inputs=[question_input, chatbot, image_input],
+        outputs=[question_input, chatbot],
+        batch=True,
+        max_batch_size=4,
+    )
     example_dataset.select(fn=run_example, inputs=None, outputs=[image_input, question_input, chatbot], show_progress="full")
     image_input.upload(fn=clear_chat, inputs=None, outputs=[chatbot])
     image_input.clear(fn=clear_chat, inputs=None, outputs=[chatbot])
+# --- Phần cuối ---
+def setup_examples():
     ASSETS_DIR = "assets"
     if not os.path.exists(ASSETS_DIR):
         os.makedirs(ASSETS_DIR)
             except requests.exceptions.RequestException as e:
                 print(f" Lỗi khi tải {filename}: {e}")
+# Script entry point: run setup_examples() first, then start the Gradio app.
+if __name__ == "__main__":
+    setup_examples()
     demo.launch(debug=True)