import os

from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import AzureChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")

# Public OpenAI alternative (requires: from langchain.chat_models import ChatOpenAI):
# llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0,
#                  model_name="gpt-3.5-turbo", openai_api_base=OPENAI_API_BASE)
llm = AzureChatOpenAI(
    deployment_name="bitservice_chat_35",
    openai_api_base=OPENAI_API_BASE,
    openai_api_key=OPENAI_API_KEY,
    openai_api_version="2023-03-15-preview",
    model_name="gpt-3.5-turbo",
)


import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# BLIP image-captioning model that will serve as the agent's vision tool.
image_to_text_model = "Salesforce/blip-image-captioning-large"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(image_to_text_model)
model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)

import requests
from PIL import Image


def describe_image(image_url: str) -> str:
    """Download an image from a URL and return a BLIP-generated caption."""
    image_object = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)
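
# Optional sanity check: the captioner can be exercised on its own before it is
# wired into the agent, e.g. print(describe_image("<any public image URL>")).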


from langchain.tools import BaseTool


class DescribeImageTool(BaseTool):
    """LangChain tool that captions an image given its URL."""

    name = "Describe Image Tool"
    description = "use this tool to describe an image from its URL."

    def _run(self, url: str) -> str:
        # Synchronous entrypoint invoked by the agent executor.
        return describe_image(url)

    def _arun(self, query: str):
        raise NotImplementedError("Async operation not supported yet")

tools = [DescribeImageTool()]

# Conversational ReAct agent that keeps a rolling window of the last 5 turns.
agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method="generate",
    memory=ConversationBufferWindowMemory(
        memory_key="chat_history",
        k=5,
        return_messages=True,
    ),
)

image_url = 'https://images.unsplash.com/photo-1682228287072-5e23cbffd487?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=987&q=80'
agent(f"Please describe the following image:\n{image_url}")
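
# The ConversationBufferWindowMemory keeps the last k=5 exchanges, so a
# follow-up turn can refer back to the caption without repeating the URL, e.g.:
# agent("What is the main subject of that image?")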