fahadqazi committed
Commit 4dbf45b · verified · 1 Parent(s): 0b3b1b2

Update app.py

Files changed (1):
  1. app.py (+35 -24)
app.py CHANGED
@@ -164,38 +164,49 @@ def text_to_speech(text, audio_file=None):
     if temp_segment:
         combined_segments.append(temp_segment.strip())
 
-    # Prepare silences
-    short_silence = np.zeros(int(22050 * 0.05), dtype=np.int16)  # 50ms for normal pause
-    long_silence = np.zeros(int(22050 * 0.15), dtype=np.int16)  # 150ms for "..."
-
-    # Synthesize and concatenate audio
-    combined_audio = np.array([], dtype=np.int16)
-
-    for segment in combined_segments:
-        # Create a BytesIO buffer to store synthesized speech
-        buffer = io.BytesIO()
-
-        # Use wave.open() to provide a compatible object
-        with wave.open(buffer, "wb") as wav_file:
-            voice.synthesize(segment, wav_file, **synthesize_args)
-
-        # Read the synthesized speech from the buffer
-        buffer.seek(0)
-        audio_segment, _ = sf.read(buffer, dtype='int16')
-
-        # Append synthesized audio
-        combined_audio = np.concatenate((combined_audio, audio_segment))
-
-        # Add appropriate silence after each segment
-        if segment.endswith("...") or segment.endswith("…"):
-            combined_audio = np.concatenate((combined_audio, long_silence))
-        elif segment.endswith(".") or segment.endswith("\n"):
-            combined_audio = np.concatenate((combined_audio, short_silence))
-
-    # Save the final output to a WAV file
-    output_file = f"{uuid.uuid4()}.wav"
-    sf.write(output_file, combined_audio, 22050)  # Assuming 22050 Hz sample rate
-
+    print(combined_segments)
+
+    # Silence lengths (50ms for ".", "\n", 150ms for "...")
+    short_silence = np.zeros(int(22050 * 0.05), dtype=np.int16)  # 50ms pause
+    long_silence = np.zeros(int(22050 * 0.15), dtype=np.int16)  # 150ms pause for "..."
+
+    # Create a temporary directory for storing individual segment WAV files
+    temp_dir = tempfile.mkdtemp()
+    temp_files = []
+
+    try:
+        # Synthesize and save each segment to a WAV file
+        for i, segment in enumerate(combined_segments):
+            segment_path = os.path.join(temp_dir, f"segment_{i}.wav")
+
+            with wave.open(segment_path, "wb") as wav_file:
+                voice.synthesize(segment, wav_file, **synthesize_args)
+
+            temp_files.append(segment_path)
+
+        # Concatenate all WAV files with appropriate silence
+        combined_audio = np.array([], dtype=np.int16)
+
+        for i, file_path in enumerate(temp_files):
+            audio_segment, _ = sf.read(file_path, dtype='int16')
+            combined_audio = np.concatenate((combined_audio, audio_segment))
+
+            # Add silence after each segment
+            segment = combined_segments[i]
+            if segment.endswith("...") or segment.endswith("…"):
+                combined_audio = np.concatenate((combined_audio, long_silence))
+            elif segment.endswith(".") or segment.endswith("\n"):
+                combined_audio = np.concatenate((combined_audio, short_silence))
+
+        # Save the final output to a WAV file
+        output_file = f"{uuid.uuid4()}.wav"
+        sf.write(output_file, combined_audio, 22050)  # Assuming 22050 Hz sample rate
+
+    finally:
+        # Clean up the temporary directory
+        shutil.rmtree(temp_dir)
+
+
     return output_file
 
 # def text_to_speech(text, audio_file=None):
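
For reference, below is a minimal, self-contained sketch of the strategy the new code adopts: synthesize each segment into its own temporary WAV file, concatenate the segments with punctuation-dependent silence, and remove the temp directory in a finally block. Here fake_synthesize is a hypothetical stand-in for voice.synthesize() so the sketch runs without a Piper model, and the 22050 Hz sample rate mirrors the assumption in the diff.

    import os
    import shutil
    import tempfile
    import wave

    import numpy as np
    import soundfile as sf

    SAMPLE_RATE = 22050  # assumed rate, matching the diff


    def fake_synthesize(text: str, wav_file: wave.Wave_write) -> None:
        """Hypothetical stand-in for voice.synthesize(): writes a 0.3s tone."""
        t = np.linspace(0, 0.3, int(SAMPLE_RATE * 0.3), endpoint=False)
        tone = (0.2 * np.sin(2 * np.pi * 220 * t) * 32767).astype(np.int16)
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(SAMPLE_RATE)
        wav_file.writeframes(tone.tobytes())


    def concatenate_segments(segments: list[str]) -> np.ndarray:
        # Silence lengths follow the diff: 50ms for "."/"\n", 150ms for "..."
        short_silence = np.zeros(int(SAMPLE_RATE * 0.05), dtype=np.int16)
        long_silence = np.zeros(int(SAMPLE_RATE * 0.15), dtype=np.int16)

        temp_dir = tempfile.mkdtemp()
        try:
            combined = np.array([], dtype=np.int16)
            for i, segment in enumerate(segments):
                # Synthesize each segment to its own WAV file on disk
                path = os.path.join(temp_dir, f"segment_{i}.wav")
                with wave.open(path, "wb") as wav_file:
                    fake_synthesize(segment, wav_file)

                # Read it back and append, then pad by ending punctuation
                audio, _ = sf.read(path, dtype="int16")
                combined = np.concatenate((combined, audio))
                if segment.endswith(("...", "…")):
                    combined = np.concatenate((combined, long_silence))
                elif segment.endswith((".", "\n")):
                    combined = np.concatenate((combined, short_silence))
            return combined
        finally:
            shutil.rmtree(temp_dir)


    if __name__ == "__main__":
        audio = concatenate_segments(["Hello there.", "Wait...", "Done."])
        sf.write("combined.wav", audio, SAMPLE_RATE)

One plausible motivation for the commit, visible in the diff itself: the old code round-tripped each segment through an in-memory BytesIO opened with wave.open() and re-read with sf.read(), while the new code writes each segment to a real file in a temp directory, which is simpler to inspect and is always cleaned up via try/finally.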