File size: 907 Bytes
bf1b552
30a2420
bf1b552
84cc85e
 
bf1b552
6cad4f1
30a2420
 
 
 
 
bf1b552
6cad4f1
 
bf1b552
6cad4f1
 
bf1b552
6cad4f1
 
 
bf1b552
6cad4f1
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import streamlit as st
import torch
from transformers import pipeline, T5Config

# 加载 Visual Question Answering 模型 microsoft/git-base-vqav2
vqa_pipeline = pipeline("text2text-generation", model="microsoft/git-base-vqav2")

# 加载文本到语音模型
text_to_speech_pipeline = pipeline(
    "text-to-speech",
    model="microsoft/speecht5_tts",
    config=T5Config.from_pretrained("microsoft/speecht5_tts", speaker_embeddings=True)
)

def main():
    st.title("Visual Question Answering with Text-to-Speech")

    image_path = st.text_input("Enter image path:")
    question = st.text_input("Enter your question:")

    if st.button("Get Answer"):
        answer = vqa_pipeline(question, image_path)[0]['generated_text']
        audio_data = text_to_speech_pipeline(answer)

        st.write("Answer:", answer)
        st.audio(audio_data[0]["audio"], format='audio/wav')

if __name__ == '__main__':
    main()