justREE committed on
Commit
b88893e
·
verified ·
1 Parent(s): fd3e00c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -78
app.py CHANGED
@@ -4,108 +4,156 @@ import io
4
  import wave
5
  import re
6
  import streamlit as st
7
- from transformers import pipeline, SpeechT5Processor, SpeechT5HifiGan
8
- from datasets import load_dataset
9
  from PIL import Image
10
  import numpy as np
11
- import torch
12
 
13
- # ─────────────────────────────────────────────────────────────
14
- # 1) LOAD PIPELINES
15
- # ─────────────────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def load_captioner():
    """Return the BLIP image-to-text pipeline, pinned to the CPU.

    Wrapped in ``st.cache_resource`` (spinner disabled) so Streamlit can
    reuse the returned pipeline object instead of rebuilding it.
    """
    task = "image-to-text"
    checkpoint = "Salesforce/blip-image-captioning-base"
    return pipeline(task, model=checkpoint, device="cpu")
 
 
 
 
19
 
@st.cache_resource(show_spinner=False)
def load_story_generator():
    """Return the text-generation pipeline used to write stories (CPU only).

    Wrapped in ``st.cache_resource`` (spinner disabled) so Streamlit can
    reuse the returned pipeline object instead of rebuilding it.
    """
    task = "text-generation"
    checkpoint = "microsoft/Phi-4-mini-reasoning"
    return pipeline(task, model=checkpoint, device="cpu")
 
 
 
 
23
 
@st.cache_resource(show_spinner=False)
def load_tts_pipe():
    """Load every object needed for SpeechT5 text-to-speech.

    Returns:
        A 4-tuple ``(processor, tts_pipeline, vocoder, speaker_embedding)``.
        The speaker embedding is the ``"xvector"`` of row 7306 of the
        ``validation`` split of ``Matthijs/cmu-arctic-xvectors``, converted
        to a tensor with one leading dimension added.
    """
    tts_checkpoint = "microsoft/speecht5_tts"

    text_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
    tts_pipeline = pipeline("text-to-speech", model=tts_checkpoint, device="cpu")
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    # unsqueeze(0) adds the leading dimension in front of the single x-vector.
    voice = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)

    return text_processor, tts_pipeline, hifigan, voice
32
-
33
- # ─────────────────────────────────────────────────────────────
34
- # 2) PIPELINE FUNCTIONS
35
- # ─────────────────────────────────────────────────────────────
36
def get_caption(image, captioner):
    """Return the first caption produced by ``captioner`` for ``image``.

    Args:
        image: The image object handed straight to ``captioner``.
        captioner: A callable returning a sequence of dicts that carry the
            caption under the ``"generated_text"`` key.

    Returns:
        The caption text, or an empty string when the captioner returns
        nothing (or the first result has no ``"generated_text"`` entry)
        instead of raising ``IndexError`` / ``KeyError``.
    """
    results = captioner(image)
    if not results:
        return ""
    return results[0].get("generated_text", "")
38
-
39
def generate_story(caption, generator):
    """Ask ``generator`` for a short children's story about ``caption``.

    The prompt is built from the caption, sent to the generator with
    sampling enabled, and the first generation is post-processed by
    ``clean_story_output`` before being returned.
    """
    prompt = f"Write a short, magical story for children aged 3 to 10 based on this scene: {caption}. Keep it under 100 words."
    sampling_options = {
        "max_new_tokens": 120,
        "temperature": 0.8,
        "top_p": 0.95,
        "do_sample": True,
    }
    generations = generator(prompt, **sampling_options)
    first_text = generations[0]["generated_text"]
    return clean_story_output(first_text, prompt)
50
 
51
def clean_story_output(story, prompt):
    """Strip the echoed prompt and any trailing partial sentence from ``story``.

    If ``story`` begins with ``prompt`` the prompt is removed and the rest is
    stripped. When the remaining text contains a period, everything after the
    last period is dropped. The result is passed through ``sentence_case``.
    """
    if story.startswith(prompt):
        story = story[len(prompt):].strip()

    last_period = story.rfind(".")
    if last_period != -1:
        # Keep the text up to and including the final full stop.
        story = story[: last_period + 1]

    return sentence_case(story)
56
 
57
def sentence_case(text):
    """Upper-case the first letter of every sentence in ``text``.

    Sentences end at a run of ``.``, ``!`` or ``?`` that is followed by
    whitespace or the end of the text. Only the first character of each
    sentence is changed, so proper nouns and acronyms keep their casing.
    Punctuation runs such as ``!!`` or ``...`` stay together, and a decimal
    such as ``3.5`` is not treated as a sentence break.

    Args:
        text: The text to normalise; may be empty.

    Returns:
        The sentences joined by single spaces, or ``""`` for blank input.
    """
    # The capture group keeps each terminator in the split output, giving
    # [body, mark, body, mark, ..., body].
    pieces = re.split(r'([.!?]+)(?=\s|$)', text)
    bodies = pieces[0::2]
    marks = pieces[1::2] + [""]  # the trailing body has no terminator

    out = []
    for body, mark in zip(bodies, marks):
        body = body.strip()
        if not body and not mark:
            continue
        out.append(f"{body[:1].upper()}{body[1:]}{mark}")
    return " ".join(out)
68
 
69
def convert_to_audio(text, processor, tts_pipe, vocoder, speaker_embedding):
    """Synthesise ``text`` and return it as the bytes of a WAV file.

    Args:
        text: The text to speak.
        processor: Callable invoked as ``processor(text=..., return_tensors="pt")``;
            its ``"input_ids"`` entry is fed to the model.
        tts_pipe: Object whose ``.model.generate_speech`` produces the waveform.
        vocoder: Passed through to ``generate_speech`` as ``vocoder=``.
        speaker_embedding: Passed through to ``generate_speech``.

    Returns:
        A mono, 16-bit, 16 kHz WAV file as ``bytes``.
    """
    inputs = processor(text=text, return_tensors="pt")
    speech = tts_pipe.model.generate_speech(
        inputs["input_ids"], speaker_embedding, vocoder=vocoder
    )

    # Clip before scaling: a sample outside [-1, 1] would otherwise overflow
    # int16 and wrap around, which is audible as a loud click.
    samples = np.clip(speech.numpy(), -1.0, 1.0)
    pcm = (samples * 32767).astype(np.int16)

    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # two bytes per sample == 16-bit PCM
        wf.setframerate(16000)
        wf.writeframes(pcm.tobytes())
    return buffer.getvalue()
81
 
82
- # ─────────────────────────────────────────────────────────────
83
- # 3) STREAMLIT APP UI
84
- # ─────────────────────────────────────────────────────────────
85
# Page chrome.
st.set_page_config(page_title="Magic Storyteller", layout="centered")
st.title("Magic Storyteller")
st.markdown("Upload an image to generate a magical story and hear it read aloud!")

uploaded = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
if not uploaded:
    # Nothing to work on yet: just tell the user what to do.
    st.info("Please upload an image to begin.")
else:
    image = Image.open(uploaded)
    st.image(image, caption="Your uploaded image", use_container_width=True)

    generate_clicked = st.button(" Generate Story and Audio")
    if generate_clicked:
        # Step 1: describe the picture.
        st.subheader("🖼️ Step 1: Captioning")
        captioner = load_captioner()
        caption = get_caption(image, captioner)
        shown_caption = sentence_case(caption)
        st.markdown(f"**Caption:** {shown_caption}")

        # Step 2: turn the raw caption into a story.
        st.subheader("📖 Step 2: Story Generation")
        story_pipe = load_story_generator()
        story = generate_story(caption, story_pipe)
        st.write(story)

        # Step 3: read the story aloud.
        st.subheader("🔊 Step 3: Listen to the Story")
        tts_parts = load_tts_pipe()  # (processor, pipeline, vocoder, speaker embedding)
        audio_bytes = convert_to_audio(story, *tts_parts)
        st.audio(audio_bytes, format="audio/wav")
        st.balloons()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import wave
5
  import re
6
  import streamlit as st
7
+ from transformers import pipeline
 
8
  from PIL import Image
9
  import numpy as np
 
10
 
11
+ # 1) CACHE & LOAD MODELS
 
 
@st.cache_resource(show_spinner=False)
def load_captioner():
    """Return the BLIP image-to-text pipeline, pinned to the CPU.

    Wrapped in ``st.cache_resource`` (spinner disabled) so Streamlit can
    reuse the returned pipeline object instead of rebuilding it.
    """
    task = "image-to-text"
    checkpoint = "Salesforce/blip-image-captioning-base"
    captioner = pipeline(task, model=checkpoint, device="cpu")
    return captioner
19
 
@st.cache_resource(show_spinner=False)
def load_story_pipe():
    """Return the FLAN-T5 text2text pipeline used for story writing (CPU).

    Wrapped in ``st.cache_resource`` (spinner disabled) so Streamlit can
    reuse the returned pipeline object instead of rebuilding it.
    """
    task = "text2text-generation"
    checkpoint = "google/flan-t5-base"
    story_pipe = pipeline(task, model=checkpoint, device="cpu")
    return story_pipe
27
 
@st.cache_resource(show_spinner=False)
def load_tts_pipe():
    """Return the MMS English text-to-speech pipeline, pinned to the CPU.

    Wrapped in ``st.cache_resource`` (spinner disabled) so Streamlit can
    reuse the returned pipeline object instead of rebuilding it.
    """
    task = "text-to-speech"
    checkpoint = "facebook/mms-tts-eng"
    tts_pipe = pipeline(task, model=checkpoint, device="cpu")
    return tts_pipe
 
 
35
 
36
+ # 2) HELPER FUNCTIONS
 
 
 
 
37
 
38
def sentence_case(text: str) -> str:
    """Upper-case the first letter of every sentence in ``text``.

    Sentences end at a run of ``.``, ``!`` or ``?`` that is followed by
    whitespace or the end of the text. Only the first character of each
    sentence is changed, so proper nouns and acronyms keep their casing.
    Punctuation runs such as ``!!`` or ``...`` stay together, and a decimal
    such as ``3.5`` is not treated as a sentence break.

    Args:
        text: The text to normalise; may be empty.

    Returns:
        The sentences joined by single spaces, or ``""`` for blank input.
    """
    # The capture group keeps each terminator in the split output, giving
    # [body, mark, body, mark, ..., body].
    pieces = re.split(r'([.!?]+)(?=\s|$)', text)
    bodies = pieces[0::2]
    marks = pieces[1::2] + [""]  # the trailing body has no terminator

    out = []
    for body, mark in zip(bodies, marks):
        body = body.strip()
        if not body and not mark:
            continue
        out.append(f"{body[:1].upper()}{body[1:]}{mark}")
    return " ".join(out)
50
 
51
+
52
def caption_image(img: Image.Image, captioner) -> str:
    """Run ``captioner`` on ``img`` and return its first caption.

    Returns an empty string when the captioner yields nothing, or when the
    first result carries no ``"generated_text"`` entry.
    """
    predictions = captioner(img)
    # An empty dict stands in for "no prediction" so one lookup covers both cases.
    top = predictions[0] if predictions else {}
    return top.get("generated_text", "")
57
+
58
+
59
def story_from_caption(caption: str, pipe) -> str:
    """Generate a short story about ``caption`` with the text pipeline ``pipe``.

    Args:
        caption: Scene description interpolated into the prompt.
        pipe: Callable invoked as ``pipe(prompt, **generation_kwargs)`` and
            expected to return a sequence of dicts holding the text under
            ``"generated_text"``.

    Returns:
        The story in sentence case, cut after its last full stop. An empty
        string is returned when the pipeline produces no usable text, rather
        than raising ``IndexError``.
    """
    prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}"
    results = pipe(
        prompt,
        max_length=100,
        min_length=80,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.7,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    if not results:
        return ""

    raw = (results[0].get("generated_text") or "").strip()
    # Drop the prompt if the model echoed it back (compared case-insensitively).
    if raw.lower().startswith(prompt.lower()):
        raw = raw[len(prompt):].strip()
    # Discard a trailing unfinished sentence.
    if "." in raw:
        raw = raw[: raw.rfind(".") + 1]
    if not raw:
        return ""
    return sentence_case(raw)
79
+
80
+
81
def tts_bytes(text: str, tts_pipe) -> bytes:
    """Synthesise ``text`` with ``tts_pipe`` and return a 16-bit PCM WAV file.

    Args:
        text: The text to speak.
        tts_pipe: Callable returning a dict (or a list whose first item is
            such a dict) with ``"audio"`` (array of float samples) and
            ``"sampling_rate"`` entries.

    Returns:
        The complete WAV file as ``bytes``.
    """
    output = tts_pipe(text)
    result = output[0] if isinstance(output, list) else output
    samples = np.asarray(result["audio"])
    rate = result["sampling_rate"]

    # 2-D audio is transposed so that each row is one frame, which is the
    # interleaved layout writeframes() expects.
    if samples.ndim == 2:
        samples = samples.T
    channels = 1 if samples.ndim == 1 else samples.shape[1]

    # Clip before scaling: a sample outside [-1, 1] would otherwise overflow
    # int16 and wrap around, which is audible as a loud click.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    buffer = io.BytesIO()
    # The context manager finalises the WAV header even if a setter raises.
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(2)  # two bytes per sample == 16-bit PCM
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    return buffer.getvalue()
98
 
99
+ # 3) STREAMLIT UI ENHANCEMENTS
100
+
101
st.set_page_config(
    page_title="Media Magic Storyteller",
    page_icon="🎨",
    layout="wide"
)

# Sidebar: short usage instructions.
with st.sidebar:
    st.header("🎨 Media Magic")
    st.markdown(
        "Upload an image and click 'Generate' to craft a magical story."
    )
    st.markdown("---")
    st.markdown("1️⃣ Upload your image\n2️⃣ Click 'Generate'\n3️⃣ Read & Listen!")
    st.markdown("---")
    st.markdown("Built with 💖 using Hugging Face & Streamlit.")

# Main
st.title("✨ Media Magic Storyteller")

# Left column: upload + preview. Right column: generated results.
col1, col2 = st.columns([1, 2])
with col1:
    uploaded = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
    if uploaded:
        st.image(uploaded, caption="Your Image", use_container_width=True)
with col2:
    st.write("### Your Story")
    # Reserved slot under the heading; the results are rendered into it below.
    placeholder = st.empty()

if uploaded and st.button("🚀 Generate Story & Audio"):
    progress = st.progress(0)

    # Render into the reserved slot so the output appears under "Your Story"
    # instead of leaving that column empty.
    with placeholder.container():
        # Captioning
        progress.progress(10)
        captioner = load_captioner()
        caption = sentence_case(caption_image(Image.open(uploaded), captioner))

        if not caption:
            # Without a caption the story prompt would be empty, so stop here
            # rather than generating and voicing meaningless text.
            st.warning("Could not describe this image. Please try a different one.")
        else:
            st.subheader("🖼️ Caption")
            st.info(caption)

            # Story
            progress.progress(40)
            story_pipe = load_story_pipe()
            story = story_from_caption(caption, story_pipe)
            st.subheader("📖 Story")
            st.write(story)

            # Audio
            progress.progress(70)
            tts_pipe = load_tts_pipe()
            audio = tts_bytes(story, tts_pipe)
            st.subheader("🔊 Audio")
            st.audio(audio, format="audio/wav")

            st.balloons()

    progress.progress(100)

# Footer
st.markdown("---")
st.markdown("© 2025 Media Magic | https://huggingface.co")