# -*- coding: utf-8 -*-
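# Image-captioning Q&A demo: BLIP (Salesforce/blip-image-captioning-large) produces
# English captions, a LangChain conversational agent uses the caption tool to answer
# questions about an image, and an LLMChain translates answers between English and Chinese.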

import os

from langchain.agents import load_tools, initialize_agent, AgentType
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory


#llm_fy = OpenAI(model_name="text-davinci-003", max_tokens=1024)  # previously used for translation; replaced by the LLMChain helpers below


OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
#llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0, model_name='gpt-3.5-turbo', openai_api_base=OPENAI_API_BASE)
llm = AzureChatOpenAI(
    deployment_name="bitservice_chat_35",
    openai_api_base=OPENAI_API_BASE,
    openai_api_key=OPENAI_API_KEY,
    openai_api_version="2023-03-15-preview",
    model_name="gpt-3.5-turbo",
)

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# BLIP image-captioning model; runs on GPU when available, otherwise CPU.
image_to_text_model = "Salesforce/blip-image-captioning-large"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

processor = BlipProcessor.from_pretrained(image_to_text_model)
model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)

import requests
from PIL import Image

def describeImageByUrl(image_url):
    # Download the image from a URL and generate an English caption with BLIP.
    image_object = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    describe = processor.decode(outputs[0], skip_special_tokens=True)
    return describe

def describeImageByPath(image_path):
    # Load a local image file and generate an English caption with BLIP.
    image_object = Image.open(image_path).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    describe = processor.decode(outputs[0], skip_special_tokens=True)
    return describe


#description = describeImageByUrl('https://img0.baidu.com/it/u=4190066402,1916608022&fm=253&fmt=auto&app=120&f=JPEG?w=1280&h=800')
#print(description)

from langchain.tools import BaseTool

class DescribeImageTool(BaseTool):
    name = "Describe Image Tool"
    description = 'use this tool to describe an image.'

    def _run(self, url: str):
        #description = describeImageByUrl(url)  # alternative: caption an image fetched from a URL
        description = describeImageByPath(url)  # caption an image from a local file path
        return description
    
    def _arun(self, query: str):
        raise NotImplementedError("Async operation not supported yet")


tools = [DescribeImageTool()]


agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=ConversationBufferWindowMemory(
        memory_key='chat_history',
        k=5,
        return_messages=True
    )
)

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
def enToChinese(english):
    #ch = llm_fy("Please translate the following sentence from English to Chinese:"+english)
    #return ch
    pp = "Please translate the following sentence from English to Chinese:{english}"
    prompt = PromptTemplate(
        input_variables=["english"],
        template=pp
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(english)

def chToEnglish(chinese):
    #en = llm_fy("Please translate the following sentence from Chinese to English:"+chinese)
    #return en
    pp = "Please translate the following sentence from Chinese to English:{chinese}"
    prompt = PromptTemplate(
        input_variables=["chinese"],
        template=pp
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(chinese)
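
# Illustrative usage of the translation helpers (the input sentences are made-up
# examples; actual model output may vary):
#   print(enToChinese("A red sports car parked on the street"))
#   print(chToEnglish("这是一辆红色的跑车"))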

#image_url = 'https://img0.baidu.com/it/u=4190066402,1916608022&fm=253&fmt=auto&app=120&f=JPEG?w=1280&h=800'
#agent(f"Describe the following image:\n{image_url}")
#en_result = agent(f"描述下面这张图片:\n{image_url}")['output']  # same question asked in Chinese: "Describe the following image"
#print(enToChinese(en_result))

#agent(f"What is the brand of car in the following image:\n{image_url}")
#en_result = agent(f"下面这张图片的汽车品牌是什么:\n{image_url}")['output']  # asked in Chinese: "What is the brand of the car in the following image"
#print(enToChinese(en_result))

def imageAnalyse(image_path, question):
    # Answer a question about an image: the agent captions it in English,
    # then the English answer is translated back to Chinese.
    question = question.strip()
    if len(question) == 0:
        question = "请描述这张图片"  # default question: "Please describe this image"
    print("question:" + question)
    en_result = agent(f"{question}:\n{image_path}")['output']
    print("en_result:" + en_result)
    ch_result = enToChinese(en_result)
    print("ch_result:" + ch_result)
    return ch_result
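
# Illustrative call (the image path and question below are placeholders, not assets
# that ship with this script):
#   print(imageAnalyse("./images/car.jpg", "这张图片里的汽车是什么品牌?"))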