File size: 3,778 Bytes
c22d94a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9622ab6
 
 
 
 
 
 
c22d94a
9622ab6
 
55e017c
9622ab6
c22d94a
 
 
 
 
 
 
 
9622ab6
c22d94a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9622ab6
 
c22d94a
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from dotenv import load_dotenv, find_dotenv
from transformers import pipeline
from langchain import LLMChain, OpenAI, PromptTemplate

import requests
import os

# UI layer
import streamlit as st

# Load environment variables from a .env file (searched upward from the CWD).
load_dotenv(find_dotenv())

# Token for the hosted Hugging Face Inference API; None if the env var is unset.
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
# Pipeline step 1 of 3: image file -> caption text.
def image_to_text(url, use_api=True):
    """Generate a caption for an image file.

    Args:
        url: Path to a local image file. NOTE(review): despite the name,
            the file is read from disk, never fetched over HTTP.
        use_api: When True, post the raw bytes to the hosted Hugging Face
            inference API. When False, run a local BLIP pipeline instead
            (slow: downloads the model on first use).

    Returns:
        The generated caption string, e.g.
        "two birds are standing next to each other".

    Raises:
        requests.HTTPError: If the inference API responds with an error status.
        OSError: If the image file cannot be read.
    """
    if use_api:
        API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
        headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}

        # os.path.basename handles every OS path separator, unlike the
        # previous url.split("/")[-1], which broke on Windows paths.
        filename = os.path.basename(url)
        with open(filename, "rb") as f:
            data = f.read()
        response = requests.post(API_URL, headers=headers, data=data)
        # Fail loudly on API errors (e.g. {"error": "model loading"}) instead
        # of crashing with an opaque KeyError on the success-shaped lookup.
        response.raise_for_status()
        return response.json()[0]['generated_text']

    # Local fallback: download and run the model, which is slow.
    # Output shape: [{'generated_text': '...'}]
    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    result = captioner(url)
    return result[0]['generated_text']

## [{'generated_text': 'two birds are standing next to each other '}]

# Pipeline step 2 of 3: caption text -> song lyrics via an LLM.
def generate_story(story_idea):
    """Generate a short English song from a one-line idea.

    Args:
        story_idea: Short narrative/context string (typically the image
            caption produced by ``image_to_text``).

    Returns:
        The generated song text (prompt targets <= 150 words).
    """
    template = """
            you are a song writer, write a song using following context:
            {story_idea}. 
            Song should not be more than 150 words. It should be in English language.
            """
    prompt = PromptTemplate(input_variables=["story_idea"], template=template)

    # temperature=1 keeps the lyrics creative; verbose=True logs the
    # rendered prompt for debugging.
    story_llm = LLMChain(llm=OpenAI(model_name='gpt-3.5-turbo-0301', temperature=1), prompt=prompt, verbose=True)
    story = story_llm.run(story_idea)
    return story

# Pipeline step 3 of 3: story text -> spoken audio written to disk.
def text_to_speech(story):
    """Synthesize speech for *story* and save it as ``story_audio.flac``.

    Args:
        story: The text to speak.

    Raises:
        requests.HTTPError: If the inference API responds with an error status.

    Side effects:
        Overwrites ``story_audio.flac`` in the current working directory.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}

    payloads = {
        "inputs": story
    }

    response = requests.post(API_URL, headers=headers, json=payloads)
    # Without this check an API error body (JSON text) would be written
    # into the .flac file and fail silently at playback time.
    response.raise_for_status()
    with open("story_audio.flac", "wb") as file:
        file.write(response.content)

# caption = image_to_text("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
# story = generate_story(story_idea="Two parrots singing a song")
# text_to_speech(story="Two parrots singing a song")

def main():
    """Streamlit app: upload an image, caption it, turn the caption into a
    song, synthesize the song to audio, and play it back."""
    st.set_page_config(page_title="Upload any image to hear a nice story")

    st.header("Listen to what your image has to tell you. JK DEMO APP")

    # Accept the common image formats, not only .jpg.
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        # Persist the upload locally: image_to_text reads the file from disk.
        bytes_data = uploaded_file.getvalue()
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)

        st.image(uploaded_file, caption="Uploaded image", use_column_width=True)

        image_description = image_to_text(uploaded_file.name, use_api=True)

        # Display image description on FE
        with st.expander("Image Description"):
            st.write(image_description)

        story = generate_story(story_idea=image_description)

        # Display story text on FE
        with st.expander("Story"):
            st.write(story)

        # Display audio player on FE; text_to_speech writes story_audio.flac.
        text_to_speech(story=story)
        st.audio("story_audio.flac")

if __name__ == '__main__':
    main()