Spaces:
Sleeping
Sleeping
File size: 1,277 Bytes
bf1b552 477b035 589de03 b1afa79 a1e406e b1afa79 84cc85e bf1b552 393c3fa b1afa79 bf1b552 6cad4f1 bf1b552 6cad4f1 bf1b552 6cad4f1 393c3fa bf1b552 6cad4f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
import streamlit as st
from transformers import pipeline
from transformers import T5Config
from datasets import load_dataset
import torch
# Speaker x-vector embedding used to condition the SpeechT5 voice.
# Index 7306 is a commonly used US-English speaker from this dataset.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = embeddings_dataset[7306]["xvector"]
# Add a batch dimension -> shape (1, embedding_dim), as SpeechT5 expects.
speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)

# Load the Visual Question Answering model microsoft/git-base-vqav2.
# BUG FIX: the "text2text-generation" task cannot accept an image input;
# generative VQA checkpoints such as GIT are served through the
# "visual-question-answering" pipeline.
vqa_pipeline = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")

# Load the text-to-speech model.
text_to_speech_pipeline = pipeline("text-to-speech", model="microsoft/speecht5_tts")


def main():
    """Streamlit app: answer a question about an image, then speak the answer.

    Reads an image path and a question from text inputs; on button press,
    runs VQA, displays the answer, and plays it back as synthesized speech.
    """
    st.title("Visual Question Answering with Text-to-Speech")
    image_path = st.text_input("Enter image path:")
    question = st.text_input("Enter your question:")
    if st.button("Get Answer"):
        # BUG FIX: use named arguments — the original passed the question
        # where the image was expected. The VQA pipeline returns a list of
        # {"answer": ...} dicts.
        answer = vqa_pipeline(image=image_path, question=question)[0]["answer"]
        # BUG FIX: the speaker embedding must be passed to the model via
        # forward_params; interpolating its repr() into the input text (as
        # the original did) would simply make the model read it aloud.
        speech = text_to_speech_pipeline(
            answer, forward_params={"speaker_embeddings": speaker_embeddings}
        )
        st.write("Answer:", answer)
        # BUG FIX: the TTS pipeline returns a dict, not a list; supply the
        # sample rate so Streamlit plays the raw waveform at the right speed.
        st.audio(speech["audio"], format='audio/wav', sample_rate=speech["sampling_rate"])
# Script entry point. (Removed a stray trailing "|" — page-scrape residue
# that made this line a syntax error.)
if __name__ == '__main__':
    main()