Szeyu committed on
Commit
e35a81f
·
verified ·
1 Parent(s): 17f0914

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -104
app.py CHANGED
@@ -1,8 +1,3 @@
1
- """
2
- Streamlit application that generates children's stories from images with audio narration.
3
- Uses Hugging Face transformers for image captioning, story generation, and text-to-speech.
4
- """
5
-
6
  import streamlit as st
7
  from transformers import pipeline
8
  import textwrap
@@ -12,119 +7,67 @@ import tempfile
12
  import os
13
  from PIL import Image
14
 
15
- # Constants
16
- MAX_STORY_WORDS = 100
17
- TEXT_CHUNK_WIDTH = 200 # Characters per chunk for text-to-speech processing
18
- AUDIO_SAMPLE_RATE = 16000 # 16kHz sampling rate for audio output
19
-
20
  @st.cache_resource
21
- def load_ml_pipelines():
22
- """
23
- Load and cache ML models for image captioning, story generation, and text-to-speech.
24
-
25
- Returns:
26
- tuple: Three pipeline objects for:
27
- - Image-to-text (captioning)
28
- - Text generation (story)
29
- - Text-to-speech
30
- """
31
- caption_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
32
- story_pipeline = pipeline("text-generation", model="aspis/gpt2-genre-story-generation")
33
- tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-eng")
34
-
35
- return caption_pipeline, story_pipeline, tts_pipeline
36
 
37
- # Load ML pipelines once and cache them
38
- image_caption_pipeline, story_gen_pipeline, text_to_speech_pipeline = load_ml_pipelines()
39
 
40
- def generate_story_content(uploaded_image):
41
- """
42
- Process an image to generate caption, story, and audio narration.
 
43
 
44
- Args:
45
- uploaded_image (UploadedFile): Streamlit file uploader object
46
-
47
- Returns:
48
- tuple: (caption_text, story_text, temp_audio_path)
49
- """
50
- # Convert uploaded image to PIL format
51
- pil_image = Image.open(uploaded_image)
52
-
53
- # Generate image caption
54
- caption_result = image_caption_pipeline(pil_image)[0]
55
- caption_text = caption_result["generated_text"]
56
- st.write("**Caption:**", caption_text)
57
 
58
- # Create story generation prompt
59
- story_prompt = (
60
  f"Write a funny, warm children's story for ages 3-10, 50–100 words, "
61
- f"in third-person narrative, that describes this scene exactly: {caption_text} "
62
- f"mention the exact place or venue within {caption_text}"
63
  )
64
-
65
- # Generate story text
66
- story_output = story_gen_pipeline(
67
- story_prompt,
68
  max_new_tokens=150,
69
- temperature=0.7, # Controls randomness (lower = more deterministic)
70
- top_p=0.9, # Nucleus sampling probability
71
- no_repeat_ngram_size=2, # Prevent repeating word pairs
72
  return_full_text=False
73
  )[0]["generated_text"].strip()
74
 
75
- # Trim story to maximum allowed words
76
- story_words = story_output.split()
77
- trimmed_story = " ".join(story_words[:MAX_STORY_WORDS])
78
- st.write("**Story:**", trimmed_story)
79
-
80
- # Split story into chunks for text-to-speech processing
81
- story_chunks = textwrap.wrap(trimmed_story, width=TEXT_CHUNK_WIDTH)
82
-
83
- # Generate audio for each chunk and concatenate
84
- audio_segments = [
85
- text_to_speech_pipeline(chunk)["audio"].squeeze()
86
- for chunk in story_chunks
87
- ]
88
- concatenated_audio = np.concatenate(audio_segments)
89
 
90
- # Create temporary audio file
91
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
92
- sf.write(temp_audio_file.name, concatenated_audio, samplerate=AUDIO_SAMPLE_RATE)
93
- temp_audio_path = temp_audio_file.name
94
 
95
- return caption_text, trimmed_story, temp_audio_path
 
 
 
96
 
97
- # Streamlit application interface
98
- def main():
99
- """Main Streamlit application layout and interaction logic."""
100
- st.title("📖 Image to Children's Story with Audio Narration")
101
- st.markdown("""
102
- Upload an image to generate:
103
- 1. A descriptive caption
104
- 2. A children's story (ages 3-10)
105
- 3. Audio narration of the story
106
- """)
107
 
108
- image_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])
 
 
109
 
110
- if image_file is not None:
111
- st.image(image_file, caption="Uploaded Image", use_column_width=True)
112
-
113
- if st.button("Generate Story and Audio"):
114
- with st.spinner("Creating magical story..."):
115
- try:
116
- caption, story, audio_path = generate_story_content(image_file)
117
- st.success("Here's your generated story!")
118
-
119
- # Display audio player
120
- st.audio(audio_path, format="audio/wav")
121
-
122
- # Clean up temporary audio file
123
- os.remove(audio_path)
124
- except Exception as e:
125
- st.error(f"Something went wrong: {str(e)}")
126
- if 'audio_path' in locals():
127
- os.remove(audio_path)
128
 
129
- if __name__ == "__main__":
130
- main()
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from transformers import pipeline
3
  import textwrap
 
7
  import os
8
  from PIL import Image
9
 
10
+ # Initialize pipelines
 
 
 
 
11
  @st.cache_resource
12
+ def load_pipelines():
13
+ captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
14
+ storyer = pipeline("text-generation", model="aspis/gpt2-genre-story-generation")
15
+ tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
16
+ return captioner, storyer, tts
 
 
 
 
 
 
 
 
 
 
17
 
18
+ captioner, storyer, tts = load_pipelines()
 
19
 
20
+ # Main logic
21
+ def generate_content(image):
22
+ # Convert Streamlit uploaded image to PIL image
23
+ pil_image = Image.open(image)
24
 
25
+ # Generate caption
26
+ caption = captioner(pil_image)[0]["generated_text"]
27
+ st.write("**Caption:**", caption)
 
 
 
 
 
 
 
 
 
 
28
 
29
+ # Generate story
30
+ prompt = (
31
  f"Write a funny, warm children's story for ages 3-10, 50–100 words, "
32
+ f"in third-person narrative, that describes this scene exactly: {caption} "
33
+ f"mention the exact place or venue within {caption}"
34
  )
35
+ raw = storyer(
36
+ prompt,
 
 
37
  max_new_tokens=150,
38
+ temperature=0.7,
39
+ top_p=0.9,
40
+ no_repeat_ngram_size=2,
41
  return_full_text=False
42
  )[0]["generated_text"].strip()
43
 
44
+ # Trim to max 100 words
45
+ words = raw.split()
46
+ story = " ".join(words[:100])
47
+ st.write("**Story:**", story)
 
 
 
 
 
 
 
 
 
 
48
 
49
+ # Convert story to speech
50
+ chunks = textwrap.wrap(story, width=200)
51
+ audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])
 
52
 
53
+ # Save audio to temporary file
54
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
55
+ sf.write(temp_file.name, audio, tts.model.config.sampling_rate)
56
+ temp_file_path = temp_file.name
57
 
58
+ return caption, story, temp_file_path
 
 
 
 
 
 
 
 
 
59
 
60
+ # Streamlit UI
61
+ st.title("Image to Children's Story and Audio")
62
+ st.write("Upload an image to generate a caption, a children's story, and an audio narration.")
63
 
64
+ uploaded_image = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ if uploaded_image is not None:
67
+ st.image(uploaded_image, caption="Uploaded Image", use_column_width=True)
68
+ if st.button("Generate Story and Audio"):
69
+ with st.spinner("Generating content..."):
70
+ caption, story, audio_path = generate_content(uploaded_image)
71
+ st.audio(audio_path, format="audio/wav")
72
+ # Clean up temporary file
73
+ os.remove(audio_path)