# Captured from a Hugging Face Space page (status at capture time: Paused).
import gradio as gr | |
from datasets import load_dataset | |
import subprocess | |
import os | |
import tempfile | |
import urllib.request | |
from pathlib import Path | |
def _find_file_reference(dataset, file_name):
    """Return the first string value in *dataset* whose basename is *file_name*.

    Every split is scanned example by example. Within an example the
    ``"file"`` column is checked first, then every other string-valued
    column. Non-string values are skipped so that structured columns do
    not break the basename comparison. Returns ``None`` when nothing matches.
    """
    for split in dataset.keys():
        for example in dataset[split]:
            if not isinstance(example, dict):
                continue
            # Check the conventional "file" column before the other columns.
            candidates = [example.get("file"), *example.values()]
            for value in candidates:
                if isinstance(value, str) and os.path.basename(value) == file_name:
                    # Returning here stops the whole scan at the first match,
                    # which matters for large streamed datasets.
                    return value
    return None


def _download_to_path(url, destination, hf_token):
    """Download *url* into the file at *destination*.

    When *hf_token* is set it is sent as a bearer token, but only over
    HTTPS and only to Hugging Face hosts: the URL comes from dataset
    content, so the token must not be handed to arbitrary servers.
    """
    import shutil
    from urllib.parse import urlparse

    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    is_hf_host = host in ("huggingface.co", "hf.co") or host.endswith(
        (".huggingface.co", ".hf.co")
    )

    request = urllib.request.Request(url)
    if hf_token and parsed.scheme == "https" and is_hf_host:
        request.add_header("Authorization", f"Bearer {hf_token}")

    # Stream the response to disk instead of holding the video in memory.
    with urllib.request.urlopen(request) as response, open(destination, "wb") as out:
        shutil.copyfileobj(response, out)


def convert_ts_to_mp4(dataset_name, file_name, hf_token):
    """Fetch a .ts video from a Hugging Face dataset and convert it to .mp4.

    The dataset is opened in streaming mode and searched for a string
    value whose basename equals ``file_name``. That value is downloaded
    as a URL and re-encoded with ffmpeg (H.264 video, AAC audio).

    Args:
        dataset_name (str): The name of the Hugging Face dataset.
        file_name (str): The name of the .ts file within the dataset.
            It should be just the filename, not the full path.
        hf_token (str): The Hugging Face token. If None or empty, the
            ``HF_TOKEN`` environment variable is used when set;
            otherwise the dataset is assumed to be public.

    Returns:
        str: The path to the converted .mp4 file on success. On failure,
        a human-readable error message (starting with "Error", "FFmpeg
        conversion failed" or "An error occurred") is returned instead;
        this function does not raise for ordinary failures.

    NOTE(review): the UI wires this return value to a ``gr.File`` output,
    which presumably treats any string as a file path - consider surfacing
    failures to the user differently; confirm against the Gradio version in use.
    """
    dataset_name = (dataset_name or "").strip()
    file_name = (file_name or "").strip()
    if not dataset_name or not file_name:
        return "Error: Dataset name and file name are required."

    # The UI text promises that an HF_TOKEN Space secret works as a fallback.
    hf_token = (hf_token or "").strip() or os.environ.get("HF_TOKEN") or None

    try:
        # 1. Load the dataset lazily. `token` replaces the deprecated
        #    `use_auth_token` keyword; None means anonymous access.
        dataset = load_dataset(dataset_name, streaming=True, token=hf_token)

        # 2. Locate the file reference (assumes the filename is unique).
        file_url = _find_file_reference(dataset, file_name)
        if not file_url:
            return "Error: File not found in the dataset."
        print(file_url)

        # 3. Download the .ts file into a scratch directory that is removed
        #    automatically, whatever happens below.
        with tempfile.TemporaryDirectory() as work_dir:
            ts_path = os.path.join(work_dir, "input.ts")
            try:
                _download_to_path(file_url, ts_path, hf_token)
            except Exception as e:
                return f"Error downloading file: {e}"

            # 4. Convert to .mp4. The output must outlive this function so
            #    the caller can serve it, hence mkstemp rather than a
            #    self-deleting temporary file.
            fd, mp4_path = tempfile.mkstemp(suffix=".mp4")
            os.close(fd)
            try:
                subprocess.run(
                    [
                        "ffmpeg",
                        "-i",
                        ts_path,
                        "-c:v",
                        "libx264",  # H.264 video
                        "-c:a",
                        "aac",  # AAC audio
                        "-y",  # mkstemp already created the output file
                        mp4_path,
                    ],
                    check=True,  # Raise on non-zero exit code
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
            except subprocess.CalledProcessError as e:
                _discard_output = mp4_path
                try:
                    os.remove(_discard_output)
                except OSError:
                    pass  # Best-effort cleanup; the conversion error matters more.
                # ffmpeg output is not guaranteed to be valid UTF-8.
                stderr_text = (e.stderr or b"").decode("utf-8", errors="replace")
                error_message = f"FFmpeg conversion failed: {stderr_text}"
                print(error_message)  # Print to console for debugging in Spaces
                return error_message
            except Exception:
                # e.g. ffmpeg is not installed: do not leak the empty output.
                try:
                    os.remove(mp4_path)
                except OSError:
                    pass
                raise

            # 5. Return the path to the .mp4 file
            return mp4_path
    except Exception as e:
        # Top-level boundary: report any unexpected failure as a message.
        return f"An error occurred: {e}"
def gradio_interface():
    """
    Build and return the Gradio interface that fronts ``convert_ts_to_mp4``.

    The form has three text inputs (dataset name, .ts file name, and a
    password-style token field) and one file output for the converted video.
    """
    # Long-form usage notes displayed with the interface.
    usage_notes = """
Example Usage:
1. For a public dataset like 'PolyAI/minds-14' and the file 'audio/en/common_voice_en_7722.ts',
enter 'PolyAI/minds-14' in the "Hugging Face Dataset Name" field and
'common_voice_en_7722.ts' in the "TS File Name" field. Leave the "Hugging Face Token" field empty.
2. For a private dataset, enter the dataset name (e.g., 'my-org/my-private-dataset')
and the .ts file name. Enter your Hugging Face token in the "Hugging Face Token" field
*or*, preferably, add your token as a secret named `HF_TOKEN` in your Space's settings.
3. Click the 'Submit' button.
4. The converted .mp4 file will be processed, and a download link will be provided.
"""

    # Short summary, assembled sentence by sentence.
    summary = " ".join(
        [
            "Convert .ts video files from Hugging Face datasets to .mp4 format.",
            "Provide the dataset name and the name of the .ts file.",
            "The converted .mp4 file will be available for download.",
            "For private datasets, you *must* provide a Hugging Face token, either directly in the input box, or, preferably, by setting the `HF_TOKEN` secret in your Space's settings.",
        ]
    )

    # Components are created in the same order as the positional
    # arguments of convert_ts_to_mp4.
    dataset_box = gr.Textbox(
        label="Hugging Face Dataset Name",
        placeholder="e.g., 'PolyAI/minds-14' or 'my-org/my-private-dataset'",
    )
    file_box = gr.Textbox(
        label="TS File Name (within the dataset)",
        placeholder="e.g., 'file_name.ts'",
    )
    token_box = gr.Textbox(
        label="Hugging Face Token (for private datasets)",
        placeholder="(Optional) Enter your Hugging Face token here, or set it as HF_TOKEN in Space settings",
        type="password",
    )
    # gr.File gives the user a downloadable result.
    download = gr.File(label="Converted MP4 File")

    return gr.Interface(
        fn=convert_ts_to_mp4,
        inputs=[dataset_box, file_box, token_box],
        outputs=download,
        title="TS to MP4 Converter",
        description=summary,
        article=usage_notes,
    )
if __name__ == "__main__":
    # Build the interface and start serving it when run as a script.
    app = gradio_interface()
    app.launch()