Spaces:

John-Jiang
/

starfish_data_ai

Running

App Files Files Community

starfish_data_ai / src /starfish /data_ingest /parsers /youtube_parser.py

John-Jiang

init commit

5301c48 2 months ago

raw

history blame

2.48 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the terms described in the LICENSE file in
	# the root directory of this source tree.
	# Fetch a YouTube video's metadata and transcript and return them as text.

	import os
	from typing import Dict, Any
	from starfish.data_ingest.parsers.base_parser import BaseParser


	class YouTubeParser(BaseParser):
	    """Parser that converts a YouTube video into plain transcript text.

	    ``parse`` looks the video up with ``pytube``, fetches its transcript with
	    ``youtube-transcript-api`` and returns a short metadata header followed
	    by the transcript.  Metadata of the most recently parsed video is kept on
	    the instance and exposed through ``get_metadata``.
	    """

	    def __init__(self):
	        """Initialise the parser with its URL markers and empty metadata."""
	        super().__init__()
	        # Substrings that mark an input as YouTube content (see is_supported).
	        self.supported_extensions = [".youtube", ".yt"]
	        # Filled in by parse(); stays empty until a video has been parsed.
	        self.metadata = {}

	    def parse(self, url: str) -> str:
	        """Parse a YouTube video transcript.

	        Args:
	            url: YouTube video URL.

	        Returns:
	            A header with title, author, length and URL, followed by the
	            transcript segments joined with newlines.

	        Raises:
	            ImportError: If ``pytube`` or ``youtube-transcript-api`` is not
	                installed.  The original import failure is chained as the
	                cause so the missing package is visible in the traceback.
	        """
	        # Imported lazily so the module can be loaded without the optional
	        # YouTube dependencies installed.
	        try:
	            from pytube import YouTube
	            from youtube_transcript_api import YouTubeTranscriptApi
	        except ImportError as err:
	            raise ImportError(
	                "pytube and youtube-transcript-api are required for YouTube parsing. "
	                "Install them with: pip install pytube youtube-transcript-api"
	            ) from err

	        # Extract video ID and metadata.
	        yt = YouTube(url)
	        video_id = yt.video_id

	        # Read each attribute once and reuse it for both the stored metadata
	        # and the text header below.
	        title = yt.title
	        author = yt.author
	        length = yt.length

	        # Stored before the transcript is fetched, so the metadata is
	        # available even if the transcript request below raises.
	        self.metadata = {
	            "title": title,
	            "author": author,
	            "length": length,
	            "views": yt.views,
	            "publish_date": yt.publish_date,
	            "description": yt.description,
	            "url": url,
	        }

	        # NOTE(review): newer youtube-transcript-api releases appear to
	        # replace the static get_transcript() with an instance fetch()
	        # method -- confirm against the pinned dependency version.
	        transcript = YouTubeTranscriptApi.get_transcript(video_id)

	        # Each segment is indexed by "text"; one segment per output line.
	        transcript_text = "\n".join(segment["text"] for segment in transcript)

	        header = (
	            f"Title: {title}\n"
	            f"Author: {author}\n"
	            f"Length: {length} seconds\n"
	            f"URL: {url}\n\n"
	            f"Transcript:\n"
	        )
	        return header + transcript_text

	    def get_metadata(self) -> Dict[str, Any]:
	        """Get the metadata of the most recently parsed video.

	        Returns:
	            The dictionary stored by the last ``parse`` call (empty if
	            ``parse`` has not stored anything yet).  The instance's own
	            dictionary is returned, not a copy.
	        """
	        return self.metadata

	    def is_supported(self, url: str) -> bool:
	        """Check if the URL is supported by this parser.

	        The check is a case-insensitive substring match against the
	        configured ``supported_extensions`` and the YouTube host names
	        ``youtube.com`` and ``youtu.be`` (short share links).

	        Args:
	            url: Candidate URL or path.

	        Returns:
	            True if the URL is supported, False otherwise.
	        """
	        lowered = url.lower()
	        if any(ext in lowered for ext in self.supported_extensions):
	            return True
	        # "youtu.be" is the short-link host for the same videos.
	        return any(host in lowered for host in ("youtube.com", "youtu.be"))