# Provenance: downloaded from a Hugging Face repository page
# (upload "Upload 114 files", commit c8448bc, file size 4.05 kB).
# The original page chrome was stripped so this file is valid Python.
import argparse
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import gradio as gr
from torch.cuda.amp import autocast
from sonique.Video_LLaMA.video_llama.common.config import Config
from sonique.Video_LLaMA.video_llama.common.dist_utils import get_rank
from sonique.Video_LLaMA.video_llama.common.registry import registry
from sonique.Video_LLaMA.video_llama.conversation.conversation_video import Chat, Conversation, default_conversation,SeparatorStyle,conv_llava_llama_2
import decord
import gc
# decord must hand decoded video frames over as PyTorch tensors, so the
# bridge is configured once before any Video_LLaMA sub-modules (which may
# decode video at import/registration time) are pulled in.
decord.bridge.set_bridge('torch')
from sonique.Video_LLaMA.video_llama.datasets.builders import *
from sonique.Video_LLaMA.video_llama.models import *
from sonique.Video_LLaMA.video_llama.processors import *
from sonique.Video_LLaMA.video_llama.runners import *
from sonique.Video_LLaMA.video_llama.tasks import *
def generate_prompt_from_video_description(cfg_path, gpu_id, model_type, input_file, num_beams=1, temperature=1.0, low_resource=False):
    """Describe an image or video with Video-LLaMA and return the response text.

    Loads the Video-LLaMA model described by the config at ``cfg_path`` onto
    ``cuda:{gpu_id}``, uploads ``input_file`` (image: .jpg/.png, video: .mp4),
    asks the model to "Describe the scene in detail", and returns the
    generated answer. GPU memory is released before returning, including on
    the unsupported-file path and on errors.

    Parameters
    ----------
    cfg_path : str
        Path to the Video-LLaMA config file.
    gpu_id : int
        CUDA device index the model is placed on.
    model_type : str
        ``'vicuna'`` selects ``default_conversation``; any other value uses
        the ``conv_llava_llama_2`` template.
    input_file : str
        Path to a ``.jpg``/``.png`` image or ``.mp4`` video.
    num_beams : int, optional
        Beam-search width forwarded to generation (default 1).
    temperature : float, optional
        Sampling temperature forwarded to generation (default 1.0).
    low_resource : bool, optional
        Forwarded to the model config (presumably enables 8-bit/low-memory
        loading — confirm against the Video-LLaMA config schema).

    Returns
    -------
    str or None
        The chatbot's description, or ``None`` when ``input_file`` has an
        unsupported extension.
    """
    # --- initialize model ---
    args = argparse.Namespace(cfg_path=cfg_path, gpu_id=gpu_id,
                              model_type=model_type, options=[])
    cfg = Config(args)
    model_config = cfg.model_cfg
    model_config.device_8bit = args.gpu_id
    model_config.low_resource = low_resource
    model_cls = registry.get_model_class(model_config.arch)
    model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
    model.eval()

    try:
        vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
        vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
        if args.model_type == 'vicuna':
            chat_state = default_conversation.copy()
        else:
            chat_state = conv_llava_llama_2.copy()
        chat = Chat(model, vis_processor, device=f'cuda:{args.gpu_id}')

        # --- process input ---
        # str.endswith accepts a tuple, so one call covers both image types.
        if input_file.endswith(('.jpg', '.png')):
            print(input_file)
            chat_state.system = "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail."
            img_list = []
            chat.upload_img(input_file, chat_state, img_list)
        elif input_file.endswith('.mp4'):
            print(input_file)
            chat_state.system = "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail."
            img_list = []
            chat.upload_video_without_audio(input_file, chat_state, img_list)
        else:
            # Early return still releases the model via the finally block
            # below (the original leaked the loaded model here).
            print("Unsupported file type")
            return None

        question = "Describe the scene in detail"
        with autocast():
            chat.ask(question, chat_state)
            llm_response = chat.answer(conv=chat_state,
                                       img_list=img_list,
                                       num_beams=num_beams,
                                       temperature=temperature,
                                       max_new_tokens=512,
                                       max_length=2000)[0]
        print("Chatbot response:", llm_response)
        return llm_response
    finally:
        # Clean up GPU cache on every exit path, not just the success path.
        del model
        gc.collect()
        torch.cuda.empty_cache()