Spaces:
Configuration error
Configuration error
| import json | |
| from tqdm import tqdm | |
| from pytubefix import YouTube | |
| import xml.etree.ElementTree as ET | |
| import os | |
# Load the dataset description; the main loop iterates over its entries.
with open('VideoInstruct100K.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Create the download directory up front: listing a directory that does not
# exist yet (first run) would raise FileNotFoundError.
os.makedirs('videos', exist_ok=True)

# Ids of videos already present on disk, so they can be skipped later.
# Kept as a dict of id -> True because it is queried with .get(id, False).
existed_video_id = {}
for video_name in os.listdir('videos'):
    # The id is everything before the first '.' in the file name.
    video_id = video_name.split('.')[0]
    existed_video_id[video_id] = True
def download_video_with_subtitles(video_id):
    """Download one YouTube video and its English caption tracks.

    The video is saved as ``videos/<video_id>.mp4``. Every caption track
    whose language code contains ``'en'`` is saved in XML form under
    ``subtitles_xml`` with ``video_id`` as the title.

    Args:
        video_id: YouTube video identifier (the ``v=`` URL parameter).

    Returns:
        A ``(video_downloaded, caption_downloaded)`` tuple of booleans.
        Captions are only attempted after the video was downloaded, so a
        failed video download always yields ``(False, False)``.
    """
    video_filename = f"{video_id}.mp4"
    try:
        # The YouTube object is built inside the try block so that any error
        # it raises is reported like a download error instead of aborting
        # the caller's loop.
        yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
        # Get the video stream with the highest resolution and download it.
        stream = yt.streams.get_highest_resolution()
        stream.download(output_path='videos', filename=video_filename)
    except Exception as e:
        print(f"Error downloading video {video_id}: {str(e)}")
        return False, False

    caption_downloaded = False
    try:
        for caption in yt.captions.all():
            # Substring match: keeps every track whose code contains 'en'.
            if 'en' in caption.code:
                caption.download(title=f"{video_id}", output_path='subtitles_xml', srt=False)
                caption_downloaded = True
    except Exception as e:
        # A caption failure must not discard the video that was already
        # saved; report it and return whatever was downloaded so far.
        print(f"Error downloading captions for video {video_id}: {str(e)}")
    return True, caption_downloaded
def convert_xml_vtt(xml_path, vtt_path):
    """Convert an XML caption file into a WebVTT file.

    ``<p>`` elements are consumed in consecutive pairs. The first element of
    a pair supplies the cue start time (its ``t`` attribute, in
    milliseconds) and the cue text (the stripped text of its ``<s>``
    descendants joined by single spaces). The second element's ``t``
    attribute supplies the cue end time. A trailing unpaired ``<p>`` is
    ignored, and a pair with a missing ``t`` attribute is skipped.

    Args:
        xml_path: Path of the XML caption file to read.
        vtt_path: Path of the WebVTT file to write (UTF-8).
    """
    def ms_to_vtt_time(milliseconds):
        """Format a millisecond count as a WebVTT timestamp."""
        seconds, milliseconds = divmod(milliseconds, 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        # The short mm:ss.ttt form is only valid below one hour; beyond
        # that the hours field is required to keep minutes within 00-59.
        if hours:
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
        return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

    # Parse the XML subtitle file and collect all paragraph elements.
    paragraphs = ET.parse(xml_path).getroot().findall(".//p")

    vtt_subtitle = []
    # Walk the paragraphs two at a time: (text + start, end).
    for text_p, end_p in zip(paragraphs[0::2], paragraphs[1::2]):
        start_attr = text_p.get("t")
        end_attr = end_p.get("t")
        if start_attr is None or end_attr is None:
            # Without both timestamps no cue can be produced.
            continue
        start_time = int(start_attr)
        end_time = int(end_attr)
        # An empty <s/> element has text None; treat it as an empty string.
        subtitle_text = " ".join((s.text or "").strip() for s in text_p.findall(".//s"))
        vtt_subtitle.append(
            f"{ms_to_vtt_time(start_time)} --> {ms_to_vtt_time(end_time)}\n{subtitle_text}\n"
        )

    # Join the VTT entries into a single string, separated by blank lines.
    vtt_content = "WEBVTT\n\n" + "\n".join(vtt_subtitle)

    # Save the VTT content to a file.
    with open(vtt_path, "w", encoding="utf-8") as vtt_file:
        vtt_file.write(vtt_content)
import os

# Make sure every output directory exists before anything is written.
os.makedirs('videos', exist_ok=True)
os.makedirs('subtitles_vtt', exist_ok=True)
os.makedirs('subtitles_xml', exist_ok=True)

for video_path in tqdm(data, desc='Downloading videos'):
    # The id is the last path component with everything from the first '.'
    # onward removed.
    video_id = video_path.split('/')[-1].split('.')[0]
    if existed_video_id.get(video_id, False):
        continue
    video_downloaded, caption_downloaded = download_video_with_subtitles(video_id)
    if video_downloaded:
        # Remember the id so a repeated entry in data is not fetched again.
        existed_video_id[video_id] = True
    if not caption_downloaded:
        continue
    # Only the "(a.en)" caption file is converted.
    # NOTE(review): presumably this is the auto-generated English track as
    # named by the caption download - confirm against pytubefix.
    xml_file_path = f'subtitles_xml/{video_id} (a.en).xml'
    if not os.path.exists(xml_file_path):
        # English captions were saved under a different track code; skip the
        # conversion instead of crashing the whole run on a missing file.
        print(f"No '(a.en)' caption file for video {video_id}; skipping VTT conversion")
        continue
    # convert xml to vtt
    convert_xml_vtt(xml_file_path, f'subtitles_vtt/{video_id}.vtt')