Spaces:

Audio-AGI
/

WavJourney

Sleeping

App Files Files Community

zzk1st committed on Aug 24, 2023

Commit

03adfb9

1 Parent(s): 78d395e

Fixed multi-user

Browse files

Files changed (11) hide show

APIs.py +4 -4
README.md +16 -6
code_generator.py +1 -3
config.yaml +0 -4
pipeline.py +7 -6
scripts/kill_services.py +1 -6
services.py +2 -2
ui_client.py +16 -8
utils.py +10 -1
voice_presets.py +1 -1
wavjourney_cli.py +1 -1

APIs.py CHANGED Viewed

@@ -6,6 +6,7 @@ import pyloudnorm as pyln
 from scipy.io.wavfile import write
 import torchaudio
 from retrying import retry
 os.environ['OPENBLAS_NUM_THREADS'] = '1'
@@ -14,10 +15,9 @@ SAMPLE_RATE = 32000
 with open('config.yaml', 'r') as file:
     config = yaml.safe_load(file)
-    service_port = config['Service-Port']
     enable_sr = config['Speech-Restoration']['Enable']
-    localhost_addr = '0.0.0.0'
 def LOUDNESS_NORM(audio, sr=32000, volumn=-25):
     # peak normalize audio to -1 dB
@@ -148,7 +148,7 @@ def TTA(text, length=5, volume=-35, out_wav='out.wav'):
 @retry(stop_max_attempt_number=5, wait_fixed=2000)
-def TTS(text, speaker='news_anchor', volume=-20, out_wav='out.wav', enhanced=enable_sr, speaker_id='', speaker_npz=''):
     url = f'http://{localhost_addr}:{service_port}/generate_speech'
     data = {
     'text': f'{text}',

 from scipy.io.wavfile import write
 import torchaudio
 from retrying import retry
+from utils import get_service_port, get_service_url
 os.environ['OPENBLAS_NUM_THREADS'] = '1'
 with open('config.yaml', 'r') as file:
     config = yaml.safe_load(file)
+    service_port = get_service_port()
+    localhost_addr = get_service_url()
     enable_sr = config['Speech-Restoration']['Enable']
 def LOUDNESS_NORM(audio, sr=32000, volumn=-25):
     # peak normalize audio to -1 dB
 @retry(stop_max_attempt_number=5, wait_fixed=2000)
+def TTS(text, volume=-20, out_wav='out.wav', enhanced=enable_sr, speaker_id='', speaker_npz=''):
     url = f'http://{localhost_addr}:{service_port}/generate_speech'
     data = {
     'text': f'{text}',

README.md CHANGED Viewed

@@ -8,7 +8,7 @@ pinned: false
 license: cc-by-nc-nd-4.0
 ---
 # <span style="color: blue;">🎵</span> WavJourney: Compositional Audio Creation with LLMs
-[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2307.14335) [![GitHub Stars](https://img.shields.io/github/stars/Audio-AGI/WavJourney?style=social)](https://github.com/Audio-AGI/WavJourney/) [![githubio](https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=Github&style=flat-square)](https://audio-agi.github.io/WavJourney_demopage/)
 This repository contains the official implementation of ["WavJourney: Compositional Audio Creation with Large Language Models"](https://audio-agi.github.io/WavJourney_demopage/WavJourney_arXiv.pdf).
@@ -32,14 +32,24 @@ bash ./scripts/EnvsSetup.sh
 conda activate WavJourney
 ```
-3. Set your `OpenAI-Key` in `config.yaml` for accessing [GPT-4 API](https://platform.openai.com/account/api-keys) [[Guidance](https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4)]. Please make sure the 'Service-Port' is not occupied. You can also modify the configuration, check the details described in the configuration file.
-3. Pre-download the models (might take some time):
 ```bash
 python scripts/download_models.py
 ```
-5. Start Python API services (e.g., Text-to-Speech, Text-to-Audio)
 ```bash
 bash scripts/start_services.sh
 ```
@@ -51,7 +61,7 @@ bash scripts/start_ui.sh
 ## Commandline Usage
  ```bash
- python wavjourney_cli.py -f --input-text "Generate a one-minute introduction to quantum mechanics"
  ```

 license: cc-by-nc-nd-4.0
 ---
 # <span style="color: blue;">🎵</span> WavJourney: Compositional Audio Creation with LLMs
+[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2307.14335) [![GitHub Stars](https://img.shields.io/github/stars/Audio-AGI/WavJourney?style=social)](https://github.com/Audio-AGI/WavJourney/) [![githubio](https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=Github&style=flat-square)](https://audio-agi.github.io/WavJourney_demopage/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Audio-AGI/WavJourney)
 This repository contains the official implementation of ["WavJourney: Compositional Audio Creation with Large Language Models"](https://audio-agi.github.io/WavJourney_demopage/WavJourney_arXiv.pdf).
 conda activate WavJourney
 ```
+3. (Optional) Modify the default configuration in `config.yaml`; see the configuration file for details on each option.
+4. Pre-download the models (might take some time):
 ```bash
 python scripts/download_models.py
 ```
+5. Set the `WAVJOURNEY_OPENAI_KEY` environment variable to access the [GPT-4 API](https://platform.openai.com/account/api-keys) [[Guidance](https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4)]:
+```bash
+export WAVJOURNEY_OPENAI_KEY=your_openai_key_here
+```
+6. Set environment variables for using API services
+```bash
+export WAVJOURNEY_SERVICE_PORT=8021 WAVJOURNEY_SERVICE_URL=127.0.0.1
+```
+7. Start Python API services (e.g., Text-to-Speech, Text-to-Audio)
 ```bash
 bash scripts/start_services.sh
 ```
 ## Commandline Usage
  ```bash
+ python wavjourney_cli.py -f --input-text "Generate a one-minute introduction to quantum mechanics"
  ```

code_generator.py CHANGED Viewed

@@ -113,10 +113,8 @@ class AudioCodeGenerator:
             return wav_filename
         header = f'''
-import sys
-sys.path.append('../AudioJourney')
 import os
 import datetime
 from APIs import TTM, TTS, TTA, MIX, CAT, COMPUTE_LEN

             return wav_filename
         header = f'''
 import os
+import sys
 import datetime
 from APIs import TTM, TTS, TTA, MIX, CAT, COMPUTE_LEN

config.yaml CHANGED Viewed

@@ -15,7 +15,3 @@ Speech-Restoration:
 Voice-Parser:
   # HuBERT
   device: 'cpu'
-Service-Port: 8021
-OpenAI-Key: ''

 Voice-Parser:
   # HuBERT
   device: 'cpu'

pipeline.py CHANGED Viewed

@@ -120,6 +120,7 @@ def init_session(session_id=''):
     # create the paths
     os.makedirs(utils.get_session_voice_preset_path(session_id))
     os.makedirs(utils.get_session_audio_path(session_id))
     return session_id
 @retry(stop_max_attempt_number=3)
@@ -142,7 +143,6 @@ def input_text_to_json_script_with_retry(complete_prompt_path, api_key):
 # Step 1: input_text to json
 def input_text_to_json_script(input_text, output_path, api_key):
-    print('Step 1: Writing audio script with LLM ...')
     input_text = maybe_get_content_from_file(input_text)
     text_to_audio_script_prompt = get_file_content('prompts/text_to_json.prompt')
     prompt = f'{text_to_audio_script_prompt}\n\nInput text: {input_text}\n\nScript:\n'
@@ -155,7 +155,6 @@ def input_text_to_json_script(input_text, output_path, api_key):
 # Step 2: json to char-voice map
 def json_script_to_char_voice_map(json_script, voices, output_path, api_key):
-    print('Step 2: Parsing character voice with LLM...')
     json_script_content = maybe_get_content_from_file(json_script)
     prompt = get_file_content('prompts/audio_script_to_character_voice_map.prompt')
     presets_str = '\n'.join(f"{preset['id']}: {preset['desc']}" for preset in voices.values())
@@ -172,7 +171,6 @@ def json_script_to_char_voice_map(json_script, voices, output_path, api_key):
 # Step 3: json to py code
 def json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_filename):
-    print('Step 3: Compiling audio script to Python program ...')
     audio_code_generator = AudioCodeGenerator()
     code = audio_code_generator.parse_and_generate(
         json_script_filename,
@@ -184,14 +182,14 @@ def json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_
 # Step 4: py code to final wav
 def audio_code_gen_to_result(audio_gen_code_path):
-    print('Step 4: Start running Python program ...')
     audio_gen_code_filename = audio_gen_code_path / 'audio_generation.py'
-    os.system(f'python {audio_gen_code_filename}')
 # Function call used by Gradio: input_text to json
 def generate_json_file(session_id, input_text, api_key):
     output_path = utils.get_session_path(session_id)
     # Step 1
     return input_text_to_json_script(input_text, output_path, api_key)
 # Function call used by Gradio: json to result wav
@@ -201,13 +199,16 @@ def generate_audio(session_id, json_script, api_key):
     voices = voice_presets.get_merged_voice_presets(session_id)
     # Step 2
     char_voice_map = json_script_to_char_voice_map(json_script, voices, output_path, api_key)
     # Step 3
     json_script_filename = output_path / 'audio_script.json'
     char_voice_map_filename = output_path / 'character_voice_map.json'
     result_wav_basename = f'res_{session_id}'
     json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_wav_basename)
     # Step 4
     audio_code_gen_to_result(output_path)
     result_wav_filename = output_audio_path / f'{result_wav_basename}.wav'
@@ -217,4 +218,4 @@ def generate_audio(session_id, json_script, api_key):
 # Convenient function call used by wavjourney_cli
 def full_steps(session_id, input_text, api_key):
     json_script = generate_json_file(session_id, input_text, api_key)
-    return generate_audio(session_id, json_script, api_key)

     # create the paths
     os.makedirs(utils.get_session_voice_preset_path(session_id))
     os.makedirs(utils.get_session_audio_path(session_id))
+    print(f'New session created, session_id={session_id}')
     return session_id
 @retry(stop_max_attempt_number=3)
 # Step 1: input_text to json
 def input_text_to_json_script(input_text, output_path, api_key):
     input_text = maybe_get_content_from_file(input_text)
     text_to_audio_script_prompt = get_file_content('prompts/text_to_json.prompt')
     prompt = f'{text_to_audio_script_prompt}\n\nInput text: {input_text}\n\nScript:\n'
 # Step 2: json to char-voice map
 def json_script_to_char_voice_map(json_script, voices, output_path, api_key):
     json_script_content = maybe_get_content_from_file(json_script)
     prompt = get_file_content('prompts/audio_script_to_character_voice_map.prompt')
     presets_str = '\n'.join(f"{preset['id']}: {preset['desc']}" for preset in voices.values())
 # Step 3: json to py code
 def json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_filename):
     audio_code_generator = AudioCodeGenerator()
     code = audio_code_generator.parse_and_generate(
         json_script_filename,
 # Step 4: py code to final wav
 def audio_code_gen_to_result(audio_gen_code_path):
     """Run the generated ``audio_generation.py`` program in a child process.

     Args:
         audio_gen_code_path: Directory that contains ``audio_generation.py``.
             It must support the ``/`` operator (e.g. a ``pathlib.Path``).

     Returns:
         None. The child's exit status is ignored, as it was with
         ``os.system``.
     """
     import subprocess
     import sys

     audio_gen_code_filename = audio_gen_code_path / 'audio_generation.py'
     # PYTHONPATH is forced to '.' for the child, exactly as the former
     # ``PYTHONPATH=. python ...`` shell command did (presumably so the
     # generated script can import project modules such as APIs -- confirm).
     child_env = dict(os.environ, PYTHONPATH='.')
     # An argument list (no shell) keeps paths with spaces or shell
     # metacharacters intact, and the current interpreter is used so the
     # child does not depend on a ``python`` executable being on PATH.
     subprocess.run(
         [sys.executable or 'python', str(audio_gen_code_filename)],
         env=child_env,
         check=False,
     )
 # Function call used by Gradio: input_text to json
 def generate_json_file(session_id, input_text, api_key):
     """Step 1 entry point: turn ``input_text`` into the JSON audio script.

     Resolves the session's output directory, logs the step together with
     the session id, and returns whatever ``input_text_to_json_script``
     returns.
     """
     session_dir = utils.get_session_path(session_id)
     progress_msg = f'session_id={session_id}, Step 1: Writing audio script with LLM ...'
     print(progress_msg)
     return input_text_to_json_script(input_text, session_dir, api_key)
 # Function call used by Gradio: json to result wav
     voices = voice_presets.get_merged_voice_presets(session_id)
     # Step 2
+    print(f'session_id={session_id}, Step 2: Parsing character voice with LLM...')
     char_voice_map = json_script_to_char_voice_map(json_script, voices, output_path, api_key)
     # Step 3
     json_script_filename = output_path / 'audio_script.json'
     char_voice_map_filename = output_path / 'character_voice_map.json'
     result_wav_basename = f'res_{session_id}'
+    print(f'session_id={session_id}, Step 3: Compiling audio script to Python program ...')
     json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_wav_basename)
     # Step 4
+    print(f'session_id={session_id}, Step 4: Start running Python program ...')
     audio_code_gen_to_result(output_path)
     result_wav_filename = output_audio_path / f'{result_wav_basename}.wav'
 # Convenient function call used by wavjourney_cli
 def full_steps(session_id, input_text, api_key):
     """Run script generation and then audio generation for one session.

     Returns the result of ``generate_audio`` called with the script
     produced by ``generate_json_file``.
     """
     return generate_audio(
         session_id,
         generate_json_file(session_id, input_text, api_key),
         api_key,
     )

scripts/kill_services.py CHANGED Viewed

@@ -1,12 +1,7 @@
-import yaml
 import os
-# Read the YAML file
-with open('config.yaml', 'r') as file:
-    config = yaml.safe_load(file)
 # Extract values for each application
-service_port = config['Service-Port']
 # Execute the commands
 os.system(f'kill $(lsof -t -i :{service_port})')

 import os
 # Port the API services listen on, taken from the environment.
 service_port = os.environ.get('WAVJOURNEY_SERVICE_PORT', '').strip()
 # Execute the commands
 # Only a purely numeric port is interpolated into the shell command: an
 # unset variable would otherwise be rendered as ":None", and arbitrary
 # text would be handed to the shell unchecked.
 if service_port.isdigit():
     os.system(f'kill $(lsof -t -i :{service_port})')
 else:
     print('WAVJOURNEY_SERVICE_PORT is not set to a numeric port; no service was killed.')

services.py CHANGED Viewed

@@ -6,7 +6,7 @@ import torch
 import torchaudio
 from torchaudio.transforms import SpeedPerturbation
 from APIs import WRITE_AUDIO, LOUDNESS_NORM
-from utils import fade
 from flask import Flask, request, jsonify
 with open('config.yaml', 'r') as file:
@@ -226,5 +226,5 @@ def parse_voice():
 if __name__ == '__main__':
-    service_port = config['Service-Port']
     app.run(debug=False, port=service_port)

 import torchaudio
 from torchaudio.transforms import SpeedPerturbation
 from APIs import WRITE_AUDIO, LOUDNESS_NORM
+from utils import fade, get_service_port
 from flask import Flask, request, jsonify
 with open('config.yaml', 'r') as file:
 if __name__ == '__main__':
     # The port is obtained from get_service_port(), imported from utils.
+    service_port = get_service_port()
     # Start `app` on that port with debug mode off
     # (presumably the Flask application defined earlier in this file -- confirm).
     app.run(debug=False, port=service_port)

ui_client.py CHANGED Viewed

@@ -41,9 +41,15 @@ def convert_char_voice_map_to_md(char_voice_map):
     return table_txt
 def generate_script_fn(instruction, _state: gr.State):
     try:
-        session_id = _state['session_id']
         api_key = utils.get_api_key()
         json_script = generate_json_file(session_id, instruction, api_key)
         table_text = convert_json_to_md(json_script)
@@ -130,12 +136,14 @@ def textbox_listener(textbox_input):
 def get_voice_preset_to_list(state: gr.State):
-    if state.__class__ == dict:
-        session_id = state['session_id']
     else:
-        session_id = state.value['session_id']
     voice_presets = load_voice_presets_metadata(
-        utils.get_session_voice_preset_path(session_id),
         safe_if_metadata_not_exist=True
     )
     dataframe = []
@@ -192,7 +200,7 @@ def add_voice_preset(vp_id, vp_desc, file, ui_state, added_voice_preset):
     else:
         count: int = added_voice_preset['count']
         # check if greater than 3
-        session_id = ui_state['session_id']
         file_path = file.name
         print(f'session {session_id}, id {id}, desc {vp_desc}, file {file_path}')
         # Do adding ...
@@ -398,7 +406,7 @@ with gr.Blocks(css=css) as interface:
     system_voice_presets = get_system_voice_presets()
     # State
-    ui_state = gr.State(value={'session_id': pipeline.init_session()})
     selected_voice_presets = gr.State(value={'selected_voice_preset': None})
     added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0})
     # UI Component
@@ -557,4 +565,4 @@ with gr.Blocks(css=css) as interface:
     # print_state_btn = gr.Button(value='Print State')
     # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
 interface.queue(concurrency_count=5, max_size=20)
-interface.launch()

     return table_txt
def get_or_create_session_from_state(ui_state):
    """Return the session id stored in ``ui_state``, creating one if absent.

    When ``ui_state`` has no ``'session_id'`` entry, a new session is created
    with ``pipeline.init_session()`` and its id is stored in ``ui_state``
    before being returned.
    """
    if 'session_id' in ui_state:
        return ui_state['session_id']
    new_session_id = pipeline.init_session()
    ui_state['session_id'] = new_session_id
    return new_session_id
 def generate_script_fn(instruction, _state: gr.State):
     try:
+        session_id = get_or_create_session_from_state(_state)
         api_key = utils.get_api_key()
         json_script = generate_json_file(session_id, instruction, api_key)
         table_text = convert_json_to_md(json_script)
 def get_voice_preset_to_list(state: gr.State):
+    if state.__class__ == gr.State:
+        state = state.value
+    if 'session_id' in state:
+        path = utils.get_session_voice_preset_path(state['session_id'])
     else:
+        path = ''
     voice_presets = load_voice_presets_metadata(
+        path,
         safe_if_metadata_not_exist=True
     )
     dataframe = []
     else:
         count: int = added_voice_preset['count']
         # check if greater than 3
+        session_id = get_or_create_session_from_state(ui_state)
         file_path = file.name
         print(f'session {session_id}, id {id}, desc {vp_desc}, file {file_path}')
         # Do adding ...
     system_voice_presets = get_system_voice_presets()
     # State
+    ui_state = gr.State({})
     selected_voice_presets = gr.State(value={'selected_voice_preset': None})
     added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0})
     # UI Component
     # print_state_btn = gr.Button(value='Print State')
     # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
 interface.queue(concurrency_count=5, max_size=20)
+interface.launch()

utils.py CHANGED Viewed

@@ -65,6 +65,15 @@ def fade(audio_data, fade_duration=2, sr=32000):
 #         config = yaml.safe_load(file)
 #         return config['OpenAI-Key'] if 'OpenAI-Key' in config else None
 def get_api_key():
-    api_key = os.environ.get('OPENAI_KEY')
     return api_key

 #         config = yaml.safe_load(file)
 #         return config['OpenAI-Key'] if 'OpenAI-Key' in config else None
def get_service_port():
    """Return the port of the WavJourney API services as a string.

    The value is read from the ``WAVJOURNEY_SERVICE_PORT`` environment
    variable, with surrounding whitespace removed.

    Returns:
        str: The configured port, or ``'8021'`` when the variable is unset
        or empty. 8021 is the port used in the README example and in the
        former ``Service-Port`` entry of config.yaml.
    """
    # Fall back to a default instead of returning None, so callers that
    # format the value into a URL never produce "...:None/...".
    service_port = (os.environ.get('WAVJOURNEY_SERVICE_PORT') or '').strip()
    return service_port or '8021'
def get_service_url():
    """Return the host address of the WavJourney API services.

    The value is read from the ``WAVJOURNEY_SERVICE_URL`` environment
    variable, with surrounding whitespace removed.

    Returns:
        str: The configured host, or ``'127.0.0.1'`` when the variable is
        unset or empty. 127.0.0.1 is the host used in the README example.
    """
    # Fall back to a default instead of returning None, so callers that
    # format the value into a URL never produce "http://None:...".
    service_url = (os.environ.get('WAVJOURNEY_SERVICE_URL') or '').strip()
    return service_url or '127.0.0.1'
 def get_api_key():
     """Return the value of ``WAVJOURNEY_OPENAI_KEY``, or None when it is unset."""
     return os.environ.get('WAVJOURNEY_OPENAI_KEY')

voice_presets.py CHANGED Viewed

@@ -11,7 +11,7 @@ def save_voice_presets_metadata(voice_presets_path, metadata):
         json.dump(metadata, f, indent=4)
 def load_voice_presets_metadata(voice_presets_path, safe_if_metadata_not_exist=False):
-    metadata_full_path = voice_presets_path / 'metadata.json'
     if safe_if_metadata_not_exist:
         if not os.path.exists(metadata_full_path):

         json.dump(metadata, f, indent=4)
 def load_voice_presets_metadata(voice_presets_path, safe_if_metadata_not_exist=False):
+    metadata_full_path = Path(voice_presets_path) / 'metadata.json'
     if safe_if_metadata_not_exist:
         if not os.path.exists(metadata_full_path):

wavjourney_cli.py CHANGED Viewed

@@ -24,4 +24,4 @@ if args.full:
     pipeline.full_steps(session_id, input_text, api_key)
     end_time = time.time()
-    print(f"WavJourney took {end_time - start_time:.2f} seconds to complete.")

     pipeline.full_steps(session_id, input_text, api_key)
     end_time = time.time()
+    print(f"WavJourney took {end_time - start_time:.2f} seconds to complete.")