John-Jiang's picture
init commit
5301c48
raw
history blame
2.48 kB
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Download a YouTube video's transcript and return it as text
import os
from typing import Dict, Any
from starfish.data_ingest.parsers.base_parser import BaseParser
class YouTubeParser(BaseParser):
    """Parser for YouTube transcripts.

    ``parse`` resolves the video with ``pytube``, downloads its transcript
    with ``youtube-transcript-api`` and stores the video's metadata on the
    instance, where ``get_metadata`` can read it afterwards.
    """

    # Host fragments that mark a string as a YouTube link. "youtu.be" is the
    # short-link form of the same videos served from "youtube.com".
    _YOUTUBE_HOSTS = ("youtube.com", "youtu.be")

    def __init__(self):
        super().__init__()
        # Markers that is_supported() searches for anywhere in the input.
        self.supported_extensions = [".youtube", ".yt"]
        # Metadata of the most recently parsed video; empty until parse() runs.
        self.metadata = {}

    def parse(self, url: str) -> str:
        """Parse a YouTube video transcript.

        Args:
            url: YouTube video URL

        Returns:
            A short header (title, author, length, URL) followed by the
            transcript, one transcript segment per line.

        Raises:
            ImportError: If pytube or youtube-transcript-api is not installed.

        Errors raised by pytube or youtube-transcript-api while looking up the
        video or its transcript are not caught here and propagate to the
        caller.
        """
        # Imported lazily so the module can be loaded without the optional
        # YouTube dependencies installed.
        try:
            from pytube import YouTube
            from youtube_transcript_api import YouTubeTranscriptApi
        except ImportError as err:
            # Chain the original error so the traceback names the package
            # that is actually missing.
            raise ImportError(
                "pytube and youtube-transcript-api are required for YouTube parsing. "
                "Install them with: pip install pytube youtube-transcript-api"
            ) from err

        # Extract video ID and metadata
        yt = YouTube(url)
        video_id = yt.video_id

        # Read each property once and reuse the values for both the stored
        # metadata and the text header below.
        title = yt.title
        author = yt.author
        length = yt.length

        # Store metadata
        self.metadata = {
            "title": title,
            "author": author,
            "length": length,
            "views": yt.views,
            "publish_date": yt.publish_date,
            "description": yt.description,
            "url": url,
        }

        # Get transcript. Older youtube-transcript-api releases expose a
        # static get_transcript(); newer releases replaced it with an
        # instance-level fetch(). Prefer the legacy call when it exists so
        # behaviour on older installs is unchanged.
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            segments = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            segments = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

        # Combine transcript segments
        transcript_text = "\n".join(segment["text"] for segment in segments)

        # Add video metadata
        header = (
            f"Title: {title}\n"
            f"Author: {author}\n"
            f"Length: {length} seconds\n"
            f"URL: {url}\n\n"
            f"Transcript:\n"
        )
        return header + transcript_text

    def get_metadata(self) -> Dict[str, Any]:
        """Get video metadata.

        Returns:
            Dictionary containing the metadata stored by the most recent
            ``parse`` call, or an empty dictionary if ``parse`` has not been
            called yet.
        """
        return self.metadata

    def is_supported(self, url: str) -> bool:
        """Check if the URL is supported by this parser.

        The check is a case-insensitive substring match: the input is
        accepted when it contains one of ``supported_extensions`` or a known
        YouTube host ("youtube.com" or the "youtu.be" short-link host).

        Args:
            url: YouTube URL or ID

        Returns:
            True if the URL is supported, False otherwise
        """
        lowered = url.lower()
        if any(ext in lowered for ext in self.supported_extensions):
            return True
        return any(host in lowered for host in self._YOUTUBE_HOSTS)