import gradio as gr
import torch
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration

# Load the model in 4-bit so it fits on a single consumer GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)


def image_to_text(image, prompt):
    # LLaVA 1.5 expects the <image> placeholder token in the prompt;
    # the processor replaces it with the encoded image features.
    prompt = f"USER: <image>\n{prompt}\nASSISTANT:"
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
    # The decoded string contains the full conversation; keep only the answer
    # so the Gradio textbox shows just the model's reply.
    return generated_text.split("ASSISTANT:")[-1].strip()


demo = gr.Interface(
    fn=image_to_text,
    inputs=[
        gr.Image(label="Select an image to analyze", type="pil"),
        gr.Textbox(label="Enter Prompt"),
    ],
    outputs=gr.Textbox(label="Maurice says:"),
)

if __name__ == "__main__":
    demo.launch(show_api=False)
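
# A quick way to sanity-check the handler without launching the UI
# (a minimal sketch; "example.jpg" is a hypothetical local file path):
#
#   from PIL import Image
#   print(image_to_text(Image.open("example.jpg"), "What is in this picture?"))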