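# app.py — "Video to SoundFX" Gradio demo.
# Pipeline: extract the first frame of an uploaded video, caption it with a
# vision-language model, send the caption to the chosen text-to-audio Space,
# then mux the generated audio back onto the original clip.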
import gradio as gr
from gradio_client import Client, handle_file
import os
import json
import re
from moviepy import VideoFileClip, AudioFileClip  # MoviePy >= 2.0 API
import cv2

# Token for the private Spaces called below; set HF_TKN in the Space secrets
# (or your shell) before launching.
hf_token = os.environ.get("HF_TKN")
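
# --- Local media helpers (OpenCV + MoviePy) ---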

def extract_firstframe(video_in):
    # Grab only the first frame of the input video and save it as a jpg file
    vidcap = cv2.VideoCapture(video_in)
    success, image = vidcap.read()
    if success:
        cv2.imwrite("first_frame.jpg", image)
    vidcap.release()
    print("Done extracting first frame!")
    return "first_frame.jpg"

def extract_audio(video_in):
    output_audio = 'audio.wav'
    # Open the video file and extract its audio track
    video_clip = VideoFileClip(video_in)
    audio_clip = video_clip.audio
    # Save the audio as a .wav file, using 44100 Hz as the sample rate
    audio_clip.write_audiofile(output_audio, fps=44100)
    print("Audio extraction complete.")
    return output_audio
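
# --- Remote captioning (gradio_client) ---
# get_caption_from_kosmos is a legacy captioner kept for reference; the UI
# below only calls get_caption (moondream1).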

def get_caption_from_kosmos(image_in):
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    with open(kosmos2_result[1], 'r') as f:
        data = json.load(f)

    reconstructed_sentence = [sublist[0] for sublist in data]
    full_sentence = ' '.join(reconstructed_sentence)

    # Extract the text following the "Describe this image in detail:" prefix
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        # Fall back to the raw sentence so the truncation below still has input
        print("Unable to locate valid description.")
        description = full_sentence

    # Truncate at the last period to drop any unfinished trailing sentence
    last_period_index = description.rfind('.')
    truncated_caption = description[:last_period_index + 1]

    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
    return truncated_caption

def get_caption(image_in):
    client = Client("fffiloni/moondream1", hf_token=hf_token)
    result = client.predict(
        image=handle_file(image_in),
        question="Describe precisely the image in one sentence.",
        api_name="/predict"
    )
    print(result)
    return result
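
# --- Text-to-audio wrappers ---
# Each wrapper calls a hosted Space through gradio_client; positional
# arguments mirror that Space's UI components, and failures surface as
# gr.Error so the UI shows a readable message instead of a stack trace.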

def get_magnet(prompt):
    print(prompt)
    try:
        client = Client("https://fffiloni-magnet.hf.space/")
        result = client.predict(
            "facebook/audio-magnet-medium",  # Literal['facebook/magnet-small-10secs', 'facebook/magnet-medium-10secs', 'facebook/magnet-small-30secs', 'facebook/magnet-medium-30secs', 'facebook/audio-magnet-small', 'facebook/audio-magnet-medium'] in 'Model' Radio component
            "",      # str in 'Model Path (custom models)' Textbox component
            prompt,  # str in 'Input Text' Textbox component
            3,       # float in 'Temperature' Number component
            0.9,     # float in 'Top-p' Number component
            10,      # float in 'Max CFG coefficient' Number component
            1,       # float in 'Min CFG coefficient' Number component
            20,      # float in 'Decoding Steps (stage 1)' Number component
            10,      # float in 'Decoding Steps (stage 2)' Number component
            10,      # float in 'Decoding Steps (stage 3)' Number component
            10,      # float in 'Decoding Steps (stage 4)' Number component
            "prod-stride1 (new!)",  # Literal['max-nonoverlap', 'prod-stride1 (new!)'] in 'Span Scoring' Radio component
            api_name="/predict_full"
        )
        print(result)
        return result[1]
    except Exception:
        raise gr.Error("MAGNet space API is not ready, please try again in a few minutes")

def get_audioldm(prompt):
    try:
        client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
        result = client.predict(
            prompt,                 # str in 'Input text' Textbox component
            "Low quality. Music.",  # str in 'Negative prompt' Textbox component
            10,   # int | float (between 5 and 15) in 'Duration (seconds)' Slider component
            3.5,  # int | float (between 0 and 7) in 'Guidance scale' Slider component
            45,   # int | float in 'Seed' Number component
            3,    # int | float (between 1 and 5) in 'Number waveforms to generate' Slider component
            fn_index=1
        )
        print(result)
        # The Space returns a video file, so strip out its audio track
        audio_result = extract_audio(result)
        return audio_result
    except Exception:
        raise gr.Error("AudioLDM-2 space API is not ready, please try again in a few minutes")

def get_audiogen(prompt):
    try:
        client = Client("https://fffiloni-audiogen.hf.space/")
        result = client.predict(
            prompt,
            10,
            api_name="/infer"
        )
        return result
    except Exception:
        raise gr.Error("AudioGen space API is not ready, please try again in a few minutes")

def get_tango(prompt):
    try:
        client = Client("fffiloni/tango", hf_token=hf_token)
        result = client.predict(
            prompt,  # str value in 'Prompt' Textbox component
            100,     # int | float (between 100 and 200) in 'Steps' Slider component
            4,       # int | float (between 1 and 10) in 'Guidance Scale' Slider component
            api_name="/predict"
        )
        print(result)
        return result
    except Exception:
        raise gr.Error("Tango space API is not ready, please try again in a few minutes")

def get_tango2(prompt):
    try:
        client = Client("declare-lab/tango2")
        result = client.predict(
            prompt,
            100,
            4,
            api_name="/predict"
        )
        print(result)
        return result
    except Exception:
        raise gr.Error("Tango2 space API is not ready, please try again in a few minutes")

def get_stable_audio_open(prompt):
    try:
        client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
        result = client.predict(
            prompt=prompt,
            seconds_total=30,
            steps=100,
            cfg_scale=7,
            api_name="/predict"
        )
        print(result)
        return result
    except Exception:
        raise gr.Error("Stable Audio Open space API is not ready, please try again in a few minutes")

def blend_vsfx(video_in, audio_result):
    audioClip = AudioFileClip(audio_result)
    print(f"AUD: {audioClip.duration}")
    clip = VideoFileClip(video_in)
    print(f"VID: {clip.duration}")
    # Match the two durations: trim whichever stream is longer
    if clip.duration < audioClip.duration:
        audioClip = audioClip.with_start(0).with_duration(clip.duration)
    elif clip.duration > audioClip.duration:
        # MoviePy >= 2.0: subclipped() replaces the old subclip()
        clip = clip.subclipped(0.0, audioClip.duration)
    # MoviePy >= 2.0: with_audio() replaces the old set_audio()
    final_clip = clip.with_audio(audioClip)
    # Set the output codecs
    final_clip.write_videofile('final_video_with_sound.mp4', codec='libx264', audio_codec='aac')
    return "final_video_with_sound.mp4"

def generate_audio(caption, chosen_model):
    # Route the caption to the selected text-to-audio model
    if chosen_model == "MAGNet":
        return get_magnet(caption)
    elif chosen_model == "AudioLDM-2":
        return get_audioldm(caption)
    elif chosen_model == "AudioGen":
        return get_audiogen(caption)
    elif chosen_model == "Tango":
        return get_tango(caption)
    elif chosen_model == "Tango 2":
        return get_tango2(caption)
    elif chosen_model == "Stable Audio Open":
        return get_stable_audio_open(caption)
    raise gr.Error(f"Unknown model: {chosen_model}")

def infer(video_in, chosen_model):
    image_in = extract_firstframe(video_in)
    caption = get_caption(image_in)
    audio_result = generate_audio(caption, chosen_model)
    final_res = blend_vsfx(video_in, audio_result)
    # Unlock the caption box and retry button once a first pass has completed
    return gr.update(value=caption, interactive=True), gr.update(interactive=True), audio_result, final_res

def retry(edited_prompt, video_in, chosen_model):
    # Reuse the user-edited caption directly; no need to re-caption the frame
    audio_result = generate_audio(edited_prompt, chosen_model)
    final_res = blend_vsfx(video_in, audio_result)
    return audio_result, final_res

def refresh():
    return gr.update(value=None, interactive=False), gr.update(interactive=False), gr.update(value=None), gr.update(value=None)
| css=""" | |
| #col-container{ | |
| margin: 0 auto; | |
| max-width: 800px; | |
| } | |
| """ | |

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">
            Video to SoundFX
        </h2>
        <p style="text-align: center;">
            Generate sound effects for a video shot, and compare text-to-audio models driven by a caption of the first frame.
        </p>
        """)
        with gr.Row():
            with gr.Column():
                video_in = gr.Video(sources=["upload"], label="Video input")
                with gr.Row():
                    chosen_model = gr.Dropdown(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen", "Tango", "Tango 2", "Stable Audio Open"], value="Tango")
                    submit_btn = gr.Button("Submit", scale=0)
            with gr.Column():
                caption_o = gr.Textbox(label="Scene caption", interactive=False)
                retry_btn = gr.Button("Retry with edited scene caption", interactive=False)
                audio_o = gr.Audio(label="Audio output")
            with gr.Column():
                video_o = gr.Video(label="Video with soundFX")
        gr.Examples(
            examples=[
                ["examples/photoreal-train.mp4", "Tango"],
                ["examples/train-window.mp4", "Tango"],
                ["examples/chinese-new-year-dragon.mp4", "Tango"],
                ["examples/big-sur.mp4", "AudioLDM-2"]
            ],
            fn=infer,
            inputs=[video_in, chosen_model],
            outputs=[caption_o, retry_btn, audio_o, video_o],
            cache_examples=False
        )
    # Disabled: reset all outputs whenever the input video changes or is cleared
    '''
    video_in.change(
        fn=refresh,
        inputs=None,
        outputs=[caption_o, retry_btn, audio_o, video_o],
        queue=False,
        show_progress=False
    )
    video_in.clear(
        fn=refresh,
        inputs=None,
        outputs=[caption_o, retry_btn, audio_o, video_o],
        queue=False,
        show_progress=False
    )
    '''
    submit_btn.click(
        fn=infer,
        inputs=[video_in, chosen_model],
        outputs=[caption_o, retry_btn, audio_o, video_o],
    )
    retry_btn.click(
        fn=retry,
        inputs=[caption_o, video_in, chosen_model],
        outputs=[audio_o, video_o],
    )

demo.queue(max_size=10).launch(show_api=False, debug=True, show_error=True)
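
# To run locally (a sketch; assumes the HF_TKN environment variable holds a
# Hugging Face token with access to the private Spaces used above):
#   HF_TKN=hf_xxx python app.py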