Szeyu committed
Commit 17f0914 · verified · Parent: c047835

Update app.py

Files changed (1): app.py (+104, −47)
app.py CHANGED
@@ -1,3 +1,8 @@
 import streamlit as st
 from transformers import pipeline
 import textwrap
@@ -7,67 +12,119 @@ import tempfile
 import os
 from PIL import Image

-# Initialize pipelines
 @st.cache_resource
-def load_pipelines():
-    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
-    storyer = pipeline("text-generation", model="aspis/gpt2-genre-story-generation")
-    tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
-    return captioner, storyer, tts

-captioner, storyer, tts = load_pipelines()

-# Main logic
-def generate_content(image):
-    # Convert Streamlit uploaded image to PIL image
-    pil_image = Image.open(image)

-    # Generate caption
-    caption = captioner(pil_image)[0]["generated_text"]
-    st.write("**Caption:**", caption)

-    # Generate story
-    prompt = (
         f"Write a funny, warm children's story for ages 3-10, 50–100 words, "
-        f"in third-person narrative, that describes this scene exactly: {caption} "
-        f"mention the exact place or venue within {caption}"
     )
-    raw = storyer(
-        prompt,
         max_new_tokens=150,
-        temperature=0.7,
-        top_p=0.9,
-        no_repeat_ngram_size=2,
         return_full_text=False
     )[0]["generated_text"].strip()

-    # Trim to max 100 words
-    words = raw.split()
-    story = " ".join(words[:100])
-    st.write("**Story:**", story)

-    # Convert story to speech
-    chunks = textwrap.wrap(story, width=200)
-    audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])

-    # Save audio to temporary file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
-        sf.write(temp_file.name, audio, tts.model.config.sampling_rate)
-        temp_file_path = temp_file.name

-    return caption, story, temp_file_path

-# Streamlit UI
-st.title("Image to Children's Story and Audio")
-st.write("Upload an image to generate a caption, a children's story, and an audio narration.")

-uploaded_image = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])

-if uploaded_image is not None:
-    st.image(uploaded_image, caption="Uploaded Image", use_column_width=True)
-    if st.button("Generate Story and Audio"):
-        with st.spinner("Generating content..."):
-            caption, story, audio_path = generate_content(uploaded_image)
-            st.audio(audio_path, format="audio/wav")
-            # Clean up temporary file
-            os.remove(audio_path)
+"""
+Streamlit application that generates children's stories from images with audio narration.
+Uses Hugging Face transformers for image captioning, story generation, and text-to-speech.
+"""
+
 import streamlit as st
 from transformers import pipeline
 import textwrap
 import os
 from PIL import Image

+# Constants
+MAX_STORY_WORDS = 100
+TEXT_CHUNK_WIDTH = 200  # Characters per chunk for text-to-speech processing
+AUDIO_SAMPLE_RATE = 16000  # 16 kHz sampling rate for audio output
+
 @st.cache_resource
+def load_ml_pipelines():
+    """
+    Load and cache ML models for image captioning, story generation, and text-to-speech.
+
+    Returns:
+        tuple: Three pipeline objects for:
+            - Image-to-text (captioning)
+            - Text generation (story)
+            - Text-to-speech
+    """
+    caption_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+    story_pipeline = pipeline("text-generation", model="aspis/gpt2-genre-story-generation")
+    tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-eng")
+
+    return caption_pipeline, story_pipeline, tts_pipeline

+# Load ML pipelines once and cache them
+image_caption_pipeline, story_gen_pipeline, text_to_speech_pipeline = load_ml_pipelines()

+def generate_story_content(uploaded_image):
+    """
+    Process an image to generate caption, story, and audio narration.
+
+    Args:
+        uploaded_image (UploadedFile): Streamlit file uploader object
+
+    Returns:
+        tuple: (caption_text, story_text, temp_audio_path)
+    """
+    # Convert uploaded image to PIL format
+    pil_image = Image.open(uploaded_image)
+
+    # Generate image caption
+    caption_result = image_caption_pipeline(pil_image)[0]
+    caption_text = caption_result["generated_text"]
+    st.write("**Caption:**", caption_text)

+    # Create story generation prompt
+    story_prompt = (
         f"Write a funny, warm children's story for ages 3-10, 50–100 words, "
+        f"in third-person narrative, that describes this scene exactly: {caption_text} "
+        f"mention the exact place or venue within {caption_text}"
     )
+
+    # Generate story text
+    story_output = story_gen_pipeline(
+        story_prompt,
         max_new_tokens=150,
+        temperature=0.7,  # Controls randomness (lower = more deterministic)
+        top_p=0.9,  # Nucleus sampling probability
+        no_repeat_ngram_size=2,  # Prevent repeating word pairs
         return_full_text=False
     )[0]["generated_text"].strip()

+    # Trim story to maximum allowed words
+    story_words = story_output.split()
+    trimmed_story = " ".join(story_words[:MAX_STORY_WORDS])
+    st.write("**Story:**", trimmed_story)
+
+    # Split story into chunks for text-to-speech processing
+    story_chunks = textwrap.wrap(trimmed_story, width=TEXT_CHUNK_WIDTH)
+
+    # Generate audio for each chunk and concatenate
+    audio_segments = [
+        text_to_speech_pipeline(chunk)["audio"].squeeze()
+        for chunk in story_chunks
+    ]
+    concatenated_audio = np.concatenate(audio_segments)

+    # Create temporary audio file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
+        sf.write(temp_audio_file.name, concatenated_audio, samplerate=AUDIO_SAMPLE_RATE)
+        temp_audio_path = temp_audio_file.name

+    return caption_text, trimmed_story, temp_audio_path

+# Streamlit application interface
+def main():
+    """Main Streamlit application layout and interaction logic."""
+    st.title("📖 Image to Children's Story with Audio Narration")
+    st.markdown("""
+    Upload an image to generate:
+    1. A descriptive caption
+    2. A children's story (ages 3-10)
+    3. Audio narration of the story
+    """)

+    image_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])

+    if image_file is not None:
+        st.image(image_file, caption="Uploaded Image", use_column_width=True)
+
+        if st.button("Generate Story and Audio"):
+            with st.spinner("Creating magical story..."):
+                try:
+                    caption, story, audio_path = generate_story_content(image_file)
+                    st.success("Here's your generated story!")
+
+                    # Display audio player
+                    st.audio(audio_path, format="audio/wav")
+
+                    # Clean up temporary audio file
+                    os.remove(audio_path)
+                except Exception as e:
+                    st.error(f"Something went wrong: {str(e)}")
+                    if 'audio_path' in locals():
+                        os.remove(audio_path)

+if __name__ == "__main__":
+    main()
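One behavioral change worth noting in this commit: the old code read the output sampling rate from the loaded model (tts.model.config.sampling_rate), while the new code hard-codes AUDIO_SAMPLE_RATE = 16000. A minimal sanity check, not part of the commit, that the constant matches what facebook/mms-tts-eng reports:

from transformers import pipeline

# Sketch: run once locally (assumption: outside the Streamlit app) to confirm
# the hard-coded AUDIO_SAMPLE_RATE agrees with the model's own config.
tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
print(tts.model.config.sampling_rate)  # expected: 16000, matching AUDIO_SAMPLE_RATE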