Lemorra committed on
Commit
16dca74
·
1 Parent(s): ec7fb79

🎉 Added support for Qwen2_5 model

Dockerfile ADDED
@@ -0,0 +1,16 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "src.app:app", "--host", "0.0.0.0", "--port", "7860"]
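
This is the standard Hugging Face Docker-Space template; locally the image would be built with "docker build -t qwen-api ." and run with "docker run -p 7860:7860 --env-file .env qwen-api" (image name hypothetical). Without Docker, the same entrypoint can be started programmatically; a minimal sketch, assuming it runs from the repository root with MODEL_PATH and SECRET_KEY set in .env:

# run_local.py -- hypothetical helper, not part of this commit
import uvicorn

if __name__ == "__main__":
    # Mirrors the container CMD: serve the FastAPI app from src/app.py on port 7860
    uvicorn.run("src.app:app", host="0.0.0.0", port=7860)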
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ fastapi
+ uvicorn[standard]
+ git+https://github.com/huggingface/transformers
+ accelerate
+ qwen-vl-utils[decord]==0.0.8
+ python-dotenv
+ PyJWT
+ pydantic
+ torch
src/__pycache__/app.cpython-310.pyc ADDED
Binary file (1.19 kB).
src/app.py ADDED
@@ -0,0 +1,31 @@
+ from typing import Annotated
+ from fastapi import FastAPI, Depends
+ from .utils.authentication import verify_token
+ from .utils.payload_model import SingleInferencePayload, VideoInferencePayload
+ from .utils.qwen_inference import Qwen2_5
+
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ model_path = os.getenv("MODEL_PATH")
+
+ model_object = Qwen2_5(model_path)
+
+ app = FastAPI()
+
+ @app.get("/")
+ def greet_json():
+     return {
+         "message": "Welcome! The backend API for Qwen2.5-VL-3B-Instruct model is running.",
+         "status": "active"
+     }
+
+ @app.post("/single_inference")
+ def single_inference(payload: SingleInferencePayload, _token: Annotated[dict, Depends(verify_token)]):
+     return model_object.get_single_inference(payload)
+
+ @app.post("/video_inference")
+ def video_inference(payload: VideoInferencePayload, _token: Annotated[dict, Depends(verify_token)]):
+     return model_object.get_video_inference(payload)
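
Calling these routes requires a Bearer token signed with the server's SECRET_KEY and a JSON body matching the payload models. A minimal client sketch, assuming the requests package (not pinned in requirements.txt) and a token minted as in the PyJWT example under authentication.py below:

# client_example.py -- hypothetical, not part of this commit
import base64
import requests  # assumption: not listed in requirements.txt

token = "<JWT signed with SECRET_KEY>"  # see the minting sketch under authentication.py
with open("example.jpg", "rb") as f:    # hypothetical local test image
    image_b64 = base64.b64encode(f.read()).decode()

resp = requests.post(
    "http://localhost:7860/single_inference",
    headers={"Authorization": f"Bearer {token}"},
    json={"image_path": image_b64, "question": "Describe this image."},
)
print(resp.json())  # {"message": [...], "status": 200} on success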
src/utils/__init__.py ADDED
File without changes
src/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (236 Bytes).
src/utils/__pycache__/authentication.cpython-310.pyc ADDED
Binary file (1.02 kB).
src/utils/__pycache__/payload_model.cpython-310.pyc ADDED
Binary file (728 Bytes).
src/utils/__pycache__/qwen_inference.cpython-310.pyc ADDED
Binary file (712 Bytes).
src/utils/authentication.py ADDED
@@ -0,0 +1,20 @@
+ from fastapi import HTTPException, Header
+ import jwt
+ from dotenv import load_dotenv
+ import os
+
+ load_dotenv()
+
+ def get_secret_key():
+     return os.getenv("SECRET_KEY")
+
+ async def verify_token(authorization: str = Header(...)):
+     try:
+         token_type, token = authorization.split()
+         if token_type.lower() != "bearer":
+             raise HTTPException(status_code=401, detail="Invalid token type")
+         return jwt.decode(token, get_secret_key(), algorithms=["HS256"])
+     except jwt.ExpiredSignatureError:
+         raise HTTPException(status_code=401, detail="Token has expired")
+     except (jwt.InvalidTokenError, ValueError):
+         raise HTTPException(status_code=401, detail="Invalid token")
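
verify_token accepts any HS256 token signed with SECRET_KEY, so a client can mint one with PyJWT. A sketch, assuming SECRET_KEY is shared via the same .env:

# mint_token.py -- hypothetical helper, not part of this commit
import datetime
import os

import jwt
from dotenv import load_dotenv

load_dotenv()

# Algorithm and key must match what verify_token() passes to jwt.decode
token = jwt.encode(
    {"sub": "api-client", "exp": datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=1)},
    os.getenv("SECRET_KEY"),
    algorithm="HS256",
)
print(token)  # send as: Authorization: Bearer <token>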
src/utils/payload_model.py ADDED
@@ -0,0 +1,9 @@
+ from pydantic import BaseModel
+
+ class SingleInferencePayload(BaseModel):
+     image_path: str
+     question: str
+
+ class VideoInferencePayload(BaseModel):
+     video_path: list[str]
+     question: str
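
For reference, request bodies that validate against these models look like the following (illustrative values; note that both *_path fields actually carry base64-encoded image data rather than filesystem paths, and a video is sent as a list of frames):

# Hypothetical request bodies for the two endpoints
single_payload = {
    "image_path": "<base64-encoded image>",
    "question": "What is shown in the image?",
}
video_payload = {
    "video_path": ["<base64 frame 1>", "<base64 frame 2>"],
    "question": "What happens across these frames?",
}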
src/utils/qwen_inference.py ADDED
@@ -0,0 +1,121 @@
+ from .payload_model import SingleInferencePayload, VideoInferencePayload
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import torch
+
+
+ class Qwen2_5:
+     def __init__(self, model_path: str):
+         self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+             model_path, torch_dtype="auto", device_map="auto"
+         )
+         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+         self.processor = AutoProcessor.from_pretrained(model_path)
+
+     def prepare_single_inference(self, image: str, question: str):
+         image = f"data:image;base64,{image}"
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image",
+                         "image": image,
+                     },
+                     {
+                         "type": "text",
+                         "text": question
+                     },
+                 ],
+             }
+         ]
+         text = self.processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = self.processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = inputs.to(self.model.device)
+
+         return inputs
+
+     def prepare_video_inference(self, video: list[str], question: str):
+         base64_videos = []
+         for frame in video:
+             base64_videos.append(f"data:image;base64,{frame}")
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "video",
+                         "video": base64_videos,
+                     },
+                     {
+                         "type": "text",
+                         "text": question
+                     },
+                 ],
+             }
+         ]
+         text = self.processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
+         inputs = self.processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+             **video_kwargs,
+         )
+         inputs = inputs.to(self.model.device)
+         return inputs
+
+     def get_single_inference(self, payload: SingleInferencePayload):
+         try:
+             processed_inputs = self.prepare_single_inference(payload.image_path, payload.question)
+             generated_ids = self.model.generate(**processed_inputs, max_new_tokens=128)
+             generated_ids_trimmed = [
+                 out_ids[len(in_ids):] for in_ids, out_ids in zip(processed_inputs.input_ids, generated_ids)
+             ]
+             output_text = self.processor.batch_decode(
+                 generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+             )
+             print(f"Model generated text: {output_text}")
+             return {
+                 "message": output_text,
+                 "status": 200
+             }
+         except Exception as e:
+             return {
+                 "message": str(e),
+                 "status": 500
+             }
+
+     def get_video_inference(self, payload: VideoInferencePayload):
+         try:
+             processed_inputs = self.prepare_video_inference(payload.video_path, payload.question)
+             generated_ids = self.model.generate(**processed_inputs, max_new_tokens=128)
+             generated_ids_trimmed = [
+                 out_ids[len(in_ids):] for in_ids, out_ids in zip(processed_inputs.input_ids, generated_ids)
+             ]
+             output_text = self.processor.batch_decode(
+                 generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+             )
+             print(f"Model generated text: {output_text}")
+             return {
+                 "message": output_text,
+                 "status": 200
+             }
+         except Exception as e:
+             return {
+                 "message": str(e),
+                 "status": 500
+             }
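
The class can also be smoke-tested outside FastAPI. A sketch, assuming a run from the repository root, the Qwen/Qwen2.5-VL-3B-Instruct checkpoint as MODEL_PATH, and a GPU with enough memory to hold it:

# smoke_test.py -- hypothetical, not part of this commit
import base64

from src.utils.payload_model import SingleInferencePayload
from src.utils.qwen_inference import Qwen2_5

model = Qwen2_5("Qwen/Qwen2.5-VL-3B-Instruct")  # assumed MODEL_PATH value
with open("example.jpg", "rb") as f:            # hypothetical test image
    image_b64 = base64.b64encode(f.read()).decode()

result = model.get_single_inference(
    SingleInferencePayload(image_path=image_b64, question="Describe this image.")
)
print(result)  # {"message": ["..."], "status": 200} on success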