Spaces:

sunbv56
/

demo-qwen2.5-vl-vqa-vibook

Sleeping

App Files Files Community

sunbv56 committed on Jun 22

Commit

21219d5

verified ·

1 Parent(s): 8f3e0b6

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -84

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py
 import gradio as gr
 import torch
@@ -8,66 +8,29 @@ from gradio.events import SelectData
 import warnings
 import os
 import requests
-from typing import List
 warnings.filterwarnings("ignore", category=UserWarning, message="Overriding torch_dtype=None")
-# --- 1. Tải Model và Processor với TỐI ƯU HÓA (KHÔNG LƯỢNG TỬ HÓA) ---
 MODEL_ID = "sunbv56/qwen2.5-vl-vqa-vibook"
 print(f"🚀 Đang tải model '{MODEL_ID}' và processor...")
-use_gpu = torch.cuda.is_available()
-# *** THAY ĐỔI: Không sử dụng lượng tử hóa. Chọn dtype phù hợp. ***
-if use_gpu:
-    # Nếu có GPU, sử dụng bfloat16/float16 để có hiệu năng tốt nhất
-    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-    print(f"Sử dụng GPU với dtype: {dtype}")
-else:
-    # Trên CPU, chạy với độ chính xác đầy đủ float32.
-    # Lượng tử hóa đã bị loại bỏ theo yêu cầu.
-    print("⚠️  Cảnh báo: Chạy model ở độ chính xác float32 trên CPU.")
-    print("     -> Tốc độ sẽ chậm hơn so với phiên bản lượng tử hóa.")
-    print("     -> Các tối ưu hóa khác (torch.compile, batching) vẫn được áp dụng.")
-    dtype = torch.float32
 try:
-    model = AutoModelForImageTextToText.from_pretrained(
-        MODEL_ID,
-        torch_dtype=dtype,         # Sử dụng dtype đã chọn
-        device_map="auto",         # `accelerate` sẽ xử lý việc đặt model lên thiết bị
-        trust_remote_code=True,
-    )
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=True)
-    # *** TỐI ƯU HÓA 1: Sử dụng torch.compile() (cho PyTorch 2.0+) ***
-    # Biên dịch model để tăng tốc độ inference sau lần chạy đầu tiên.
-    print("🚀 Đang biên dịch model với torch.compile()... (có thể mất một chút thời gian cho lần đầu)")
-    model = torch.compile(model, mode="reduce-overhead", fullgraph=True)
-    print("✅ Model đã được biên dịch.")
     model.eval()
-    print(f"✅ Model và processor đã được tải và tối ưu hóa thành công!")
 except Exception as e:
-    print(f"❌ Lỗi khi tải/tối ưu hóa model/processor: {e}")
     exit()
-# --- 2. Hàm Inference Cốt lõi đã được sửa đổi để xử lý BATCH ---
-def process_vqa_batch(images: List[Image.Image], questions: List[str]):
-    prompts = []
-    processed_images = []
-    # Chuẩn bị prompt và ảnh cho từng item trong batch
-    for image, question in zip(images, questions):
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]}]
-        prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        prompts.append(prompt_text)
-        processed_images.append(image)
-    # Xử lý cả batch cùng một lúc
-    model_inputs = processor(text=prompts, images=processed_images, return_tensors="pt", padding=True).to(model.device)
     generated_ids = model.generate(
         **model_inputs,
@@ -78,54 +41,83 @@ def process_vqa_batch(images: List[Image.Image], questions: List[str]):
         pad_token_id=processor.tokenizer.pad_token_id
     )
-    # Decode kết quả cho cả batch
     generated_ids = generated_ids[:, model_inputs['input_ids'].shape[1]:]
-    responses = processor.batch_decode(generated_ids, skip_special_tokens=True)
-    # Strip() cho mỗi response trong list
-    return [res.strip() for res in responses]
 # --- 3. Logic Chatbot ---
-THINKING_HTML = """<div class="typing-indicator"><span></span><span></span><span></span></div>"""
-CUSTOM_CSS = """@keyframes blink{0%{opacity:.2}20%{opacity:1}100%{opacity:.2}}.typing-indicator{display:flex;align-items:center;justify-content:flex-start;padding:8px 0}.typing-indicator span{height:10px;width:10px;margin:0 2px;background-color:#9E9E9E;border-radius:50%;animation:blink 1.4s infinite both}.typing-indicator span:nth-child(2){animation-delay:.2s}.typing-indicator span:nth-child(3){animation-delay:.4s}"""
-# *** TỐI ƯU HÓA 2: Sửa đổi hàm để tương thích với BATCHING của Gradio ***
-def manual_chat_responder(user_questions: List[str], chat_histories: List[list], uploaded_images: List[Image.Image]):
-    user_question = user_questions[0]
-    chat_history = chat_histories[0]
-    uploaded_image = uploaded_images[0]
     if uploaded_image is None:
         gr.Warning("Vui lòng tải ảnh lên trước để đặt câu hỏi về nó.")
-        return [("", chat_history)]
     if not user_question or not user_question.strip():
         gr.Warning("Vui lòng nhập một câu hỏi.")
-        return [("", chat_history)]
     chat_history.append({"role": "user", "content": user_question})
     chat_history.append({"role": "assistant", "content": THINKING_HTML})
-    # Gọi hàm xử lý batch (dù chỉ có 1 item trong trường hợp này)
-    bot_response = process_vqa_batch([uploaded_image], [user_question])[0]
     chat_history[-1]["content"] = bot_response
-    return [("", chat_history)]
 def run_example(evt: SelectData):
     selected_example = example_list[evt.index]
     image_path, question = selected_example
     gr.Info(f"Đang chạy ví dụ: \"{question}\"")
     image = Image.open(image_path).convert("RGB")
     chat_history = [
         {"role": "user", "content": question},
         {"role": "assistant", "content": THINKING_HTML}
     ]
     yield image, question, chat_history
-    bot_response = process_vqa_batch([image], [question])[0]
     chat_history[-1]["content"] = bot_response
     yield image, question, chat_history
@@ -134,8 +126,9 @@ def clear_chat():
     return []
 # --- 4. Định nghĩa Giao diện Người dùng Gradio ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="Vibook VQA Chatbot", css=CUSTOM_CSS) as demo:
-    gr.Markdown("# 🤖 Vibook VQA Chatbot (Optimized - No Quantization)")
     example_list = [
         ["./assets/book_example_1.jpg", "Đâu là tên đúng của cuốn sách này?"],
@@ -154,12 +147,8 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), ti
             chatbot = gr.Chatbot(label="Cuộc trò chuyện", height=600, avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"), type="messages", value=[])
             question_input = gr.Textbox(label="Hoặc nhập câu hỏi về ảnh đã tải lên", placeholder="Nhập câu hỏi và nhấn Enter...", container=False, scale=7)
-    # --- 5. Xử lý Sự kiện với TỐI ƯU HÓA BATCHING ---
-    question_input.submit(
-        fn=manual_chat_responder,
-        inputs=[question_input, chatbot, image_input],
-        outputs=[question_input, chatbot]
-    ).batch(batch_size=4, max_latency=0.1)
     example_dataset.select(fn=run_example, inputs=None, outputs=[image_input, question_input, chatbot], show_progress="full")
@@ -167,7 +156,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), ti
     image_input.clear(fn=clear_chat, inputs=None, outputs=[chatbot])
 # --- Phần cuối ---
-def setup_examples():
     ASSETS_DIR = "assets"
     if not os.path.exists(ASSETS_DIR):
         os.makedirs(ASSETS_DIR)
@@ -192,6 +181,4 @@ def setup_examples():
             except requests.exceptions.RequestException as e:
                 print(f" Lỗi khi tải {filename}: {e}")
-if __name__ == "__main__":
-    setup_examples()
     demo.launch(debug=True)

+# app.py
 import gradio as gr
 import torch
 import warnings
 import os
 import requests
 warnings.filterwarnings("ignore", category=UserWarning, message="Overriding torch_dtype=None")
# --- 1. Load the model and processor ---
MODEL_ID = "sunbv56/qwen2.5-vl-vqa-vibook"
print(f"🚀 Đang tải model '{MODEL_ID}' và processor...")
try:
    # Choose the precision from the available hardware. Half precision is only
    # selected when a GPU is present; on CPU the model is loaded in float32,
    # matching what the previous revision of this file did for CPU-only hosts.
    # NOTE(review): float32 doubles the memory footprint compared with float16;
    # confirm the target host has enough RAM for this checkpoint.
    if torch.cuda.is_available():
        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    else:
        dtype = torch.float32
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        device_map="auto",
        trust_remote_code=True,
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=True)
    model.eval()
    print("✅ Model và processor đã được tải thành công!")
except Exception as e:
    print(f"❌ Lỗi khi tải model/processor: {e}")
    # `exit()` is a helper injected by the `site` module and exits with status 0.
    # Raising SystemExit directly always works and reports the failure to the
    # process supervisor through a non-zero exit code.
    raise SystemExit(1)
+# --- 2. Hàm Inference Cốt lõi ---
+def process_vqa(image: Image.Image, question: str):
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]}]
+    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    model_inputs = processor(text=[prompt_text], images=[image], return_tensors="pt").to(model.device)
     generated_ids = model.generate(
         **model_inputs,
         pad_token_id=processor.tokenizer.pad_token_id
     )
     generated_ids = generated_ids[:, model_inputs['input_ids'].shape[1]:]
+    response = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
+    return response
 # --- 3. Logic Chatbot ---
+# ### THAY ĐỔI MỚI 1: Định nghĩa HTML và CSS cho hiệu ứng động ###
+# HTML cho hiệu ứng "đang gõ"
+THINKING_HTML = """
+<div class="typing-indicator">
+    <span></span>
+    <span></span>
+    <span></span>
+</div>
+"""
+# CSS để tạo hiệu ứng
+CUSTOM_CSS = """
+@keyframes blink {
+    0% { opacity: .2; }
+    20% { opacity: 1; }
+    100% { opacity: .2; }
+}
+.typing-indicator {
+    display: flex;
+    align-items: center;
+    justify-content: flex-start; /* Căn trái */
+    padding: 8px 0; /* Thêm chút khoảng đệm */
+}
+.typing-indicator span {
+    height: 10px;
+    width: 10px;
+    margin: 0 2px;
+    background-color: #9E9E9E; /* Màu xám */
+    border-radius: 50%;
+    animation: blink 1.4s infinite both;
+}
+.typing-indicator span:nth-child(2) {
+    animation-delay: .2s;
+}
+.typing-indicator span:nth-child(3) {
+    animation-delay: .4s;
+}
+"""
# Handler for questions the user types in manually.
def manual_chat_responder(user_question: str, chat_history: list, uploaded_image: Image.Image):
    """Answer a typed question about the uploaded image, streaming UI updates.

    This function is a generator. Every yielded value is a
    ``(textbox_value, chat_history)`` pair; the textbox value is always the
    empty string so the input box is cleared.

    Args:
        user_question: Text entered by the user.
        chat_history: Message dicts (``{"role": ..., "content": ...}``) shown
            so far. The list is extended in place. ``None`` is treated as an
            empty history.
        uploaded_image: Image the question refers to, or ``None`` when nothing
            has been uploaded yet.

    Yields:
        ``("", chat_history)`` once when validation fails, otherwise twice:
        first with a "thinking" placeholder as the assistant message, then
        with the placeholder replaced by the model's answer.
    """
    if chat_history is None:
        chat_history = []

    # Because this function contains `yield`, a `return <value>` would only set
    # StopIteration.value and never reach the UI. The validation branches
    # therefore yield the unchanged state explicitly and then stop.
    if uploaded_image is None:
        gr.Warning("Vui lòng tải ảnh lên trước để đặt câu hỏi về nó.")
        yield "", chat_history
        return
    if not user_question or not user_question.strip():
        gr.Warning("Vui lòng nhập một câu hỏi.")
        yield "", chat_history
        return

    chat_history.append({"role": "user", "content": user_question})
    # Show the animated typing indicator while the model is running.
    chat_history.append({"role": "assistant", "content": THINKING_HTML})
    yield "", chat_history

    bot_response = process_vqa(uploaded_image, user_question)
    # Swap the placeholder for the real answer.
    chat_history[-1]["content"] = bot_response
    yield "", chat_history
# Handler dedicated to clicks on one of the example entries.
def run_example(evt: SelectData):
    """Run the example the user selected and stream the result to the UI.

    This function is a generator. It looks up the ``(image_path, question)``
    pair at ``evt.index`` in the module-level ``example_list``.

    Args:
        evt: Selection event; ``evt.index`` is used as the index into
            ``example_list``.

    Yields:
        ``(image, question, chat_history)`` twice: first with a "thinking"
        placeholder as the assistant message, then with the placeholder
        replaced by the model's answer.
    """
    image_path, question = example_list[evt.index]
    gr.Info(f"Đang chạy ví dụ: \"{question}\"")

    # Image.open() keeps the underlying file open. convert() returns a separate,
    # fully loaded image, so the file can be closed as soon as the copy exists.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Show the animated typing indicator while the model is running.
    chat_history = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": THINKING_HTML},
    ]
    yield image, question, chat_history

    bot_response = process_vqa(image, question)
    chat_history[-1]["content"] = bot_response
    yield image, question, chat_history
     return []
 # --- 4. Định nghĩa Giao diện Người dùng Gradio ---
+# ### THAY ĐỔI MỚI 4: Thêm CSS vào Blocks ###
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), title="Vibook VQA Chatbot", css=CUSTOM_CSS) as demo:
+    gr.Markdown("# 🤖 Vibook VQA Chatbot")
     example_list = [
         ["./assets/book_example_1.jpg", "Đâu là tên đúng của cuốn sách này?"],
             chatbot = gr.Chatbot(label="Cuộc trò chuyện", height=600, avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"), type="messages", value=[])
             question_input = gr.Textbox(label="Hoặc nhập câu hỏi về ảnh đã tải lên", placeholder="Nhập câu hỏi và nhấn Enter...", container=False, scale=7)
+    # --- 5. Xử lý Sự kiện ---
+    question_input.submit(fn=manual_chat_responder, inputs=[question_input, chatbot, image_input], outputs=[question_input, chatbot])
     example_dataset.select(fn=run_example, inputs=None, outputs=[image_input, question_input, chatbot], show_progress="full")
     image_input.clear(fn=clear_chat, inputs=None, outputs=[chatbot])
 # --- Phần cuối ---
+if __name__ == "__main__":
     ASSETS_DIR = "assets"
     if not os.path.exists(ASSETS_DIR):
         os.makedirs(ASSETS_DIR)
             except requests.exceptions.RequestException as e:
                 print(f" Lỗi khi tải {filename}: {e}")
     demo.launch(debug=True)