Spaces:
Running
Running
File size: 2,479 Bytes
5301c48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Download a YouTube video's transcript and return it as text
import os
from typing import Dict, Any
from starfish.data_ingest.parsers.base_parser import BaseParser
class YouTubeParser(BaseParser):
    """Parser for YouTube transcripts.

    ``parse`` returns the transcript text of a video prefixed by a short
    header, and records the video's details in ``self.metadata``.
    """

    def __init__(self):
        """Set the recognised source markers and start with empty metadata."""
        super().__init__()
        # Marker substrings that ``is_supported`` looks for in a source string.
        self.supported_extensions = [".youtube", ".yt"]
        # Replaced by ``parse`` with the details of the video being parsed.
        self.metadata = {}

    def parse(self, url: str) -> str:
        """Parse a YouTube video transcript.

        Side effect: ``self.metadata`` is replaced with the video's title,
        author, length, views, publish date, description and URL. This
        happens before the transcript is requested.

        Args:
            url: YouTube video URL

        Returns:
            Transcript text: a header with title, author, length and URL,
            followed by the transcript segments joined with newlines.

        Raises:
            ImportError: If pytube or youtube-transcript-api is not installed.
        """
        # Imported inside the method so a missing package surfaces here with
        # an install hint instead of failing when this module is imported.
        try:
            from pytube import YouTube
            from youtube_transcript_api import YouTubeTranscriptApi
        except ImportError as err:
            # Chain the original error so the missing module name is kept.
            raise ImportError(
                "pytube and youtube-transcript-api are required for YouTube parsing. "
                "Install them with: pip install pytube youtube-transcript-api"
            ) from err

        # Extract video ID and metadata
        yt = YouTube(url)
        video_id = yt.video_id

        # Read the header fields once; they are reused for the text header.
        title = yt.title
        author = yt.author
        length = yt.length

        # Store metadata
        self.metadata = {
            "title": title,
            "author": author,
            "length": length,
            "views": yt.views,
            "publish_date": yt.publish_date,
            "description": yt.description,
            "url": url,
        }

        # Get transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Combine transcript segments (each segment carries its text under "text")
        segment_texts = [segment["text"] for segment in transcript]

        # Add video metadata
        header = (
            f"Title: {title}\n"
            f"Author: {author}\n"
            f"Length: {length} seconds\n"
            f"URL: {url}\n\n"
            f"Transcript:\n"
        )
        return header + "\n".join(segment_texts)

    def get_metadata(self) -> Dict[str, Any]:
        """Get video metadata.

        Returns:
            Dictionary containing video metadata as stored by the last call
            to ``parse``; empty if ``parse`` has not been called. The
            parser's own dictionary is returned, not a copy.
        """
        return self.metadata

    def is_supported(self, url: str) -> bool:
        """Check if the URL is supported by this parser.

        The check is case-insensitive and substring-based: a source is
        accepted when it contains one of ``self.supported_extensions`` or a
        YouTube host name ("youtube.com" or the short-link "youtu.be").

        Args:
            url: YouTube URL or ID

        Returns:
            True if the URL is supported, False otherwise
        """
        lowered = url.lower()
        if any(ext in lowered for ext in self.supported_extensions):
            return True
        # "youtu.be" short links point at the same videos as "youtube.com".
        return any(host in lowered for host in ("youtube.com", "youtu.be"))
|