File size: 6,367 Bytes
c2e8a2e
 
 
 
 
5379aa1
 
c2e8a2e
be74542
c2e8a2e
 
 
be74542
c2e8a2e
 
 
 
 
be74542
 
c2e8a2e
 
 
 
 
 
be74542
 
 
 
c2e8a2e
 
 
 
 
 
 
 
 
 
 
 
5379aa1
c2e8a2e
be74542
c2e8a2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be74542
c2e8a2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be74542
c2e8a2e
 
 
 
 
 
 
be74542
c2e8a2e
 
 
 
 
be74542
 
 
 
 
c2e8a2e
 
 
 
 
 
 
be74542
 
c2e8a2e
 
 
 
 
 
be74542
 
 
 
 
 
 
 
c2e8a2e
 
 
 
 
 
 
 
 
 
 
 
be74542
c2e8a2e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import gradio as gr
from datasets import load_dataset
import subprocess
import os
import tempfile
import urllib.request
from pathlib import Path

def _find_file_url(dataset, file_name):
    """
    Return the first string value in the dataset whose basename is file_name.

    Args:
        dataset: Mapping of split name to an iterable of example dicts.
        file_name (str): Bare filename to look for (no directory part).

    Returns:
        str: The matching value (a path or URL), or None if nothing matches.
    """
    for _split_name, examples in dataset.items():
        for example in examples:
            if not isinstance(example, dict):
                continue
            # Check the "file" column first, then every other column.
            candidates = [example.get("file"), *example.values()]
            for value in candidates:
                # Only strings can be paths/URLs; skipping other types avoids
                # a TypeError from os.path.basename on e.g. nested dicts.
                if isinstance(value, str) and os.path.basename(value) == file_name:
                    # Returning stops the scan at the first hit, so a
                    # streamed dataset is not consumed past the match.
                    return value
    return None


def convert_ts_to_mp4(dataset_name, file_name, hf_token):
    """
    Downloads a .ts video file from a Hugging Face dataset,
    converts it to .mp4 using ffmpeg, and returns the path
    to the .mp4 file.

    Args:
        dataset_name (str): The name of the Hugging Face dataset.
        file_name (str): The name of the .ts file within the dataset.
                        It should be just the filename, not the full path.
        hf_token (str):  The Hugging Face token.  If None or empty, the
                        HF_TOKEN environment variable is used when set;
                        otherwise the dataset is assumed to be public.

    Returns:
        str: On success, the path to the converted .mp4 file (a temporary
             file that the caller owns).  On failure, a human-readable
             error message; no exception is raised for ordinary errors.
    """
    try:
        # 1. Load the dataset.  Fall back to the HF_TOKEN environment
        #    variable so a token configured as a secret is honoured.
        token = hf_token or os.environ.get("HF_TOKEN")
        load_kwargs = {"streaming": True}
        if token:
            # `token` is the current name of the deprecated `use_auth_token`.
            load_kwargs["token"] = token
        dataset = load_dataset(dataset_name, **load_kwargs)

        # 2. Find the file.  This assumes the filename is unique within the
        #    dataset; the first match wins.
        file_url = _find_file_url(dataset, file_name)
        if not file_url:
            return "Error: File not found in the dataset."

        # 3. Download the .ts file into a scratch directory that is removed
        #    when the block exits.  Downloading to a path inside a directory
        #    avoids writing to a file that is still held open.
        # NOTE(review): this plain download sends no token, so files that
        # need authentication will presumably fail here - confirm.
        with tempfile.TemporaryDirectory() as work_dir:
            ts_path = os.path.join(work_dir, "input.ts")
            try:
                urllib.request.urlretrieve(file_url, ts_path)
            except Exception as e:
                return f"Error downloading file: {e}"

            # 4. Convert the .ts file to .mp4 using ffmpeg.  The output file
            #    must outlive this function, so it is created with mkstemp
            #    and its handle is closed before ffmpeg writes to the path.
            fd, mp4_path = tempfile.mkstemp(suffix=".mp4")
            os.close(fd)
            converted = False
            try:
                subprocess.run(
                    [
                        "ffmpeg",
                        "-i",
                        ts_path,
                        "-c:v",
                        "libx264",  # Use libx264 for H.264 encoding (common)
                        "-c:a",
                        "aac",      # Use AAC for audio encoding (common)
                        "-y",  # Overwrite output file if it exists
                        mp4_path,
                    ],
                    check=True,  # Raise an exception on non-zero exit code
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                converted = True
            except subprocess.CalledProcessError as e:
                # ffmpeg failed.  Return the error message; undecodable
                # bytes are replaced so the report itself cannot fail.
                stderr_text = e.stderr.decode("utf-8", errors="replace")
                error_message = f"FFmpeg conversion failed: {stderr_text}"
                print(error_message)  # Print to console for debugging in Spaces
                return error_message
            finally:
                # Do not leak the output file when conversion did not finish
                # (ffmpeg error, ffmpeg missing, interruption, ...).
                if not converted:
                    try:
                        os.remove(mp4_path)
                    except OSError:
                        pass

            # 5. Return the path to the .mp4 file
            return mp4_path

    except Exception as e:
        return f"An error occurred: {e}"



def gradio_interface():
    """
    Build the Gradio UI for the converter.

    Three text inputs (dataset name, .ts file name, optional token) are
    wired to ``convert_ts_to_mp4`` and the result is shown in a file
    component.

    Returns:
        The configured ``gr.Interface``; the caller decides when to launch it.
    """
    # Input widgets, in the same order as convert_ts_to_mp4's parameters.
    dataset_box = gr.Textbox(
        label="Hugging Face Dataset Name",
        placeholder="e.g., 'PolyAI/minds-14' or 'my-org/my-private-dataset'",
    )
    file_box = gr.Textbox(
        label="TS File Name (within the dataset)",
        placeholder="e.g., 'file_name.ts'",
    )
    token_box = gr.Textbox(
        label="Hugging Face Token (for private datasets)",
        placeholder="(Optional) Enter your Hugging Face token here, or set it as HF_TOKEN in Space settings",
        type="password",
    )

    # gr.File makes the function's return value downloadable.
    result_file = gr.File(label="Converted MP4 File")

    # Text shown above the form; joined from parts to keep lines manageable.
    blurb = "".join(
        [
            "Convert .ts video files from Hugging Face datasets to .mp4 format. ",
            "Provide the dataset name and the name of the .ts file.  The converted ",
            ".mp4 file will be available for download.  ",
            "For private datasets, you *must* provide a Hugging Face token, either directly in the input box, or, preferably, by setting the `HF_TOKEN` secret in your Space's settings.",
        ]
    )

    # Usage walkthrough shown below the form.
    usage_notes = """
    Example Usage:

    1.  For a public dataset like 'PolyAI/minds-14' and the file 'audio/en/common_voice_en_7722.ts',
        enter 'PolyAI/minds-14' in the "Hugging Face Dataset Name" field and
        'common_voice_en_7722.ts' in the "TS File Name" field.  Leave the "Hugging Face Token" field empty.
    2.  For a private dataset, enter the dataset name (e.g., 'my-org/my-private-dataset')
        and the .ts file name.  Enter your Hugging Face token in the "Hugging Face Token" field
        *or*, preferably, add your token as a secret named `HF_TOKEN` in your Space's settings.
    3.  Click the 'Submit' button.
    4.  The converted .mp4 file will be processed, and a download link will be provided.
    """

    interface = gr.Interface(
        fn=convert_ts_to_mp4,
        inputs=[dataset_box, file_box, token_box],
        outputs=result_file,
        title="TS to MP4 Converter",
        description=blurb,
        article=usage_notes,
    )
    return interface



# Script entry point: build the interface and start serving it.
if __name__ == "__main__":
    app = gradio_interface()
    app.launch()