AshDavid12 committed
Commit 1c789c0 · 1 Parent(s): 1ab0cdf
added validation for wav and pcm
Changed files:
- .gitignore (+1 -0)
- client.py (+78 -23)
- infer.py (+51 -6)
- poetry.lock (+22 -1)
- pyproject.toml (+2 -0)
.gitignore ADDED
@@ -0,0 +1 @@
+*.wav
client.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import json
+import logging
 import wave

 import websockets
@@ -9,8 +10,62 @@ import ssl
 # Parameters for reading and sending the audio
 AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav"  # Use WAV file

+from pydub import AudioSegment
+
+
+# Convert and resample audio before writing it to WAV
+# Convert and resample audio before writing it to WAV
+def convert_to_mono_16k(audio_file_path):
+    logging.info(f"Starting audio conversion to mono and resampling to 16kHz for file: {audio_file_path}")
+
+    try:
+        # Load the audio file into an AudioSegment object
+        audio_segment = AudioSegment.from_file(audio_file_path, format="wav")
+
+        # Convert the audio to mono and resample it to 16kHz
+        audio_segment = audio_segment.set_channels(1).set_frame_rate(16000)
+
+        logging.info("Audio conversion to mono and 16kHz completed successfully.")
+    except Exception as e:
+        logging.error(f"Error during audio conversion: {e}")
+        raise e
+
+    # Return the modified AudioSegment object
+    return audio_segment
+
+
 async def send_audio(websocket):
     buffer_size = 1024 * 16  # Send smaller chunks (16KB) for real-time processing
+    logging.info("Converting the audio to mono and 16kHz.")
+
+    try:
+        converted_audio = convert_to_mono_16k('test_copy.wav')
+    except Exception as e:
+        logging.error(f"Failed to convert audio: {e}")
+        return
+
+    # Send metadata to the server
+    metadata = {
+        'sample_rate': 16000,  # Resampled rate
+        'channels': 1,         # Converted to mono
+        'sampwidth': 2         # Assuming 16-bit audio
+    }
+    await websocket.send(json.dumps(metadata))
+    logging.info(f"Sent metadata: {metadata}")
+
+    try:
+        raw_data = converted_audio.raw_data
+        logging.info(f"Starting to send raw PCM audio data. Total data size: {len(raw_data)} bytes.")
+
+        for i in range(0, len(raw_data), buffer_size):
+            pcm_chunk = raw_data[i:i + buffer_size]
+            await websocket.send(pcm_chunk)  # Send raw PCM data chunk
+            logging.info(f"Sent PCM chunk of size {len(pcm_chunk)} bytes.")
+            await asyncio.sleep(0.01)  # Simulate real-time sending
+
+        logging.info("Completed sending all audio data.")
+    except Exception as e:
+        logging.error(f"Error while sending audio data: {e}")

     # Download the WAV file locally
     # with requests.get(AUDIO_FILE_URL, stream=True) as response:
@@ -21,29 +76,29 @@
     # print("Audio file downloaded successfully.")

     # Open the downloaded WAV file and extract PCM data
- [23 lines removed — the previous, un-commented wave.open()-based metadata/PCM sending block; its content is not shown in this diff view]
+    # with wave.open('test_copy.wav', 'rb') as wav_file:
+    #     metadata = {
+    #         'sample_rate': wav_file.getframerate(),
+    #         'channels': wav_file.getnchannels(),
+    #         'sampwidth': wav_file.getsampwidth(),
+    #     }
+    #
+    #     # Send metadata to the server before sending the audio
+    #     await websocket.send(json.dumps(metadata))
+    #     print(f"Sent metadata: {metadata}")
+
+    #     # Send the PCM audio data in chunks
+    #     while True:
+    #         pcm_chunk = wav_file.readframes(buffer_size)
+    #         if not pcm_chunk:
+    #             break  # End of file
+    #
+    #         await websocket.send(pcm_chunk)  # Send raw PCM data chunk
+    #         # print(f"Sent PCM chunk of size {len(pcm_chunk)} bytes.")
+    #         await asyncio.sleep(0.01)  # Simulate real-time sending
+
+    # else:
+    #     print(f"Failed to download audio file. Status code: {response.status_code}")


 async def receive_transcription(websocket):
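For context, a minimal sketch of how these two coroutines are typically driven from a client entrypoint. The commit does not show this part; the server host and the use of asyncio.gather below are assumptions, not taken from the repository (only the /wtranscribe path appears in infer.py):

    import asyncio
    import websockets

    SERVER_URI = "wss://example-server/wtranscribe"  # hypothetical server URL

    async def main():
        # Open one WebSocket connection and run sender and receiver concurrently
        async with websockets.connect(SERVER_URI) as websocket:
            await asyncio.gather(
                send_audio(websocket),             # streams metadata + raw PCM chunks
                receive_transcription(websocket),  # handles transcription results as they arrive
            )

    if __name__ == "__main__":
        asyncio.run(main())
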
infer.py CHANGED
@@ -131,9 +131,6 @@ def transcribe_core_ws(audio_file, last_transcribed_time):
     """
     Transcribe the audio file and return only the segments that have not been processed yet.

-    :param audio_file: Path to the growing audio file.
-    :param last_transcribed_time: The last time (in seconds) that was transcribed.
-    :return: Newly transcribed segments and the updated last transcribed time.
     """
     logging.info(f"Starting transcription for file: {audio_file} from {last_transcribed_time} seconds.")

@@ -177,6 +174,43 @@ def transcribe_core_ws(audio_file, last_transcribed_time):
 import tempfile


+# Function to verify if the PCM data is valid
+def validate_pcm_data(pcm_audio_buffer, sample_rate, channels, sample_width):
+    """Validates the PCM data buffer to ensure it conforms to the expected format."""
+    logging.info(f"Validating PCM data: total size = {len(pcm_audio_buffer)} bytes.")
+
+    # Calculate the expected sample size
+    expected_sample_size = sample_rate * channels * sample_width
+    actual_sample_size = len(pcm_audio_buffer)
+
+    if actual_sample_size == 0:
+        logging.error("Received PCM data is empty.")
+        return False
+
+    logging.info(f"Expected sample size per second: {expected_sample_size} bytes.")
+
+    if actual_sample_size % expected_sample_size != 0:
+        logging.warning(
+            f"PCM data size {actual_sample_size} is not a multiple of the expected sample size per second ({expected_sample_size} bytes). Data may be corrupted or incomplete.")
+
+    return True
+
+
+# Function to validate if the created WAV file is valid
+def validate_wav_file(wav_file_path):
+    """Validates if the WAV file was created correctly and can be opened."""
+    try:
+        with wave.open(wav_file_path, 'rb') as wav_file:
+            sample_rate = wav_file.getframerate()
+            channels = wav_file.getnchannels()
+            sample_width = wav_file.getsampwidth()
+            logging.info(
+                f"WAV file details - Sample Rate: {sample_rate}, Channels: {channels}, Sample Width: {sample_width}")
+        return True
+    except wave.Error as e:
+        logging.error(f"Error reading WAV file: {e}")
+        return False
+
 @app.websocket("/wtranscribe")
 async def websocket_transcribe(websocket: WebSocket):
     logging.info("New WebSocket connection request received.")
@@ -214,6 +248,12 @@ async def websocket_transcribe(websocket: WebSocket):
             # Accumulate the raw PCM data into the buffer
             pcm_audio_buffer.extend(audio_chunk)

+            # Validate the PCM data after each chunk
+            if not validate_pcm_data(pcm_audio_buffer, sample_rate, channels, sample_width):
+                logging.error("Invalid PCM data received. Aborting transcription.")
+                await websocket.send_json({"error": "Invalid PCM data received."})
+                return
+
             # Estimate the duration of the chunk based on its size
             chunk_duration = len(audio_chunk) / (sample_rate * channels * sample_width)
             accumulated_audio_time += chunk_duration
@@ -233,6 +273,11 @@ async def websocket_transcribe(websocket: WebSocket):
                 wav_file.setframerate(sample_rate)
                 wav_file.writeframes(pcm_audio_buffer)

+            if not validate_wav_file(temp_wav_file.name):
+                logging.error(f"Invalid WAV file created: {temp_wav_file.name}")
+                await websocket.send_json({"error": "Invalid WAV file created."})
+                return
+
             logging.info(f"Temporary WAV file created at {temp_wav_file.name} for transcription.")

             # Log to confirm that the file exists and has the expected size
@@ -260,9 +305,9 @@ async def websocket_transcribe(websocket: WebSocket):
             await websocket.send_json(response)

             # Optionally delete the temporary WAV file after processing
-            if os.path.exists(temp_wav_file):
-                os.remove(temp_wav_file)
-                logging.info(f"Temporary WAV file {temp_wav_file} removed.")
+            if os.path.exists(temp_wav_file.name):
+                os.remove(temp_wav_file.name)
+                logging.info(f"Temporary WAV file {temp_wav_file.name} removed.")

     except WebSocketDisconnect:
         logging.info("WebSocket connection closed by the client.")
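A rough standalone check of the two validators added above (a sketch for illustration only, not part of the commit; the silence buffer and the check.wav file name are made up):

    import logging
    import wave

    logging.basicConfig(level=logging.INFO)

    sample_rate, channels, sample_width = 16000, 1, 2
    pcm_audio_buffer = bytearray(b"\x00\x00" * sample_rate)  # one second of 16-bit mono silence

    assert validate_pcm_data(pcm_audio_buffer, sample_rate, channels, sample_width)

    # Write the buffer to a WAV file the same way the endpoint does, then validate it
    with wave.open("check.wav", "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_audio_buffer)

    assert validate_wav_file("check.wav")

Note that validate_pcm_data only logs a warning (and still returns True) when the buffer is not a whole-second multiple, so partially filled chunks are not rejected; only an empty buffer fails validation.
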
poetry.lock CHANGED
@@ -1064,6 +1064,16 @@ tokenizers = ">=0.13,<1"
 conversion = ["transformers[torch] (>=4.23)"]
 dev = ["black (==23.*)", "flake8 (==6.*)", "isort (==5.*)", "pytest (==7.*)"]

+[[package]]
+name = "ffmpeg"
+version = "1.4"
+description = "ffmpeg python package url [https://github.com/jiashaokun/ffmpeg]"
+optional = false
+python-versions = "*"
+files = [
+    {file = "ffmpeg-1.4.tar.gz", hash = "sha256:6931692c890ff21d39938433c2189747815dca0c60ddc7f9bb97f199dba0b5b9"},
+]
+
 [[package]]
 name = "filelock"
 version = "3.16.0"
@@ -2539,6 +2549,17 @@ azure-key-vault = ["azure-identity (>=1.16.0)", "azure-keyvault-secrets (>=4.8.0
 toml = ["tomli (>=2.0.1)"]
 yaml = ["pyyaml (>=6.0.1)"]

+[[package]]
+name = "pydub"
+version = "0.25.1"
+description = "Manipulate audio with an simple and easy high level interface"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6"},
+    {file = "pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f"},
+]
+
 [[package]]
 name = "pygments"
 version = "2.18.0"
@@ -3862,4 +3883,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "3.9.1"
-content-hash = "
+content-hash = "62e30245d9470305f2b33ff86655c5a38e9f58c708b7ffb3cdfbf932ccfda6c7"
pyproject.toml CHANGED
@@ -24,6 +24,8 @@ openai = "^1.42.0"
 numpy = "^1.22.0"
 torch = "2.1.0"
 sounddevice = "^0.5.0"
+pydub = "^0.25.1"
+ffmpeg = "^1.4"


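One packaging note, hedged because it is not stated anywhere in the commit: the ffmpeg distribution on PyPI (the "ffmpeg = ^1.4" entry above) is a Python package and does not install the ffmpeg binary itself, while pydub shells out to a system ffmpeg/avconv executable for anything it cannot read natively (plain PCM WAV files usually decode without it). A quick way to see what pydub will actually use at runtime:

    from pydub.utils import which

    # Prints the resolved path of the ffmpeg executable, or None if pydub
    # can only fall back to its native WAV reader.
    print(which("ffmpeg"))
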