Spaces:

PirateXX
/

Youtube-Video-Summarizer

Build error

App Files Files Community

Youtube-Video-Summarizer / app.py

PirateXX

Update app.py

7356e48 verified 4 months ago

raw

history blame contribute delete

5.32 kB

	# imports

	import os

	import requests
	from IPython.display import Markdown
	import gradio as gr
	from youtube_transcript_api import YouTubeTranscriptApi
	import re

	class YoutubeVideoID:
	    """Hold a YouTube URL together with the video ID parsed from it.

	    Attributes:
	        url: The URL exactly as it was passed to the constructor.
	        video_id: The 11-character video ID extracted from ``url``.

	    Raises:
	        ValueError: On construction, if no video ID can be found in ``url``.
	    """

	    # Compiled once for the class.  The alternatives use a real ``|``; an
	    # escaped ``\|`` would demand a literal pipe character in the URL and
	    # therefore reject every genuine YouTube link.
	    _VIDEO_ID_RE = re.compile(
	        r"(?:https?://)?(?:www\.|m\.|music\.)?"
	        r"(?:youtube\.com/(?:(?:embed|shorts|live|v)/|\S*?[?&]v=|[^/\s]+/\S+/)"
	        r"|youtu\.be/)"
	        r"([a-zA-Z0-9_-]{11})"
	    )

	    def __init__(self, url):
	        self.url = url
	        self.video_id = self.extract_video_id(url)

	    def extract_video_id(self, url):
	        """
	        Extract the YouTube video ID from a given URL.

	        Supports ``youtube.com/watch?v=...`` (with ``v`` anywhere in the
	        query string), ``youtube.com/embed|shorts|live|v/...`` and shortened
	        ``youtu.be/...`` links, with or without scheme and ``www.``.

	        :param url: str, the URL to parse; surrounding whitespace is ignored
	        :return: str, the 11-character video ID
	        :raises ValueError: if ``url`` is not a string or holds no video ID
	        """
	        if not isinstance(url, str):
	            raise ValueError("Invalid YouTube URL")

	        # Pasted URLs often carry stray spaces or a trailing newline.
	        match = self._VIDEO_ID_RE.match(url.strip())
	        if match is None:
	            raise ValueError("Invalid YouTube URL")
	        return match.group(1)

	    def __str__(self):
	        return f"Video ID: {self.video_id}"

	def get_transcript(video_id, language='en'):
	    """Fetch a video's transcript and return it as a single text string.

	    The transcript in ``language`` is preferred; when the video has none in
	    that language, the first transcript the API lists is used instead.

	    :param video_id: str, the YouTube video ID to look up
	    :param language: str, preferred transcript language code (default 'en')
	    :return: str, the transcript entries joined with spaces, or "" when the
	        ID is empty or the transcript could not be retrieved (the reason is
	        printed)
	    """
	    if not video_id:
	        # Nothing to look up, so skip the network round trip entirely.
	        return ""

	    # Imported locally so the names used by the handlers below are always
	    # bound; without them a failed fetch surfaces as NameError instead.
	    from youtube_transcript_api import (
	        NoTranscriptFound,
	        TranscriptsDisabled,
	        VideoUnavailable,
	    )

	    try:
	        # Step 1: list the available transcripts.
	        # NOTE(review): `list_transcripts` is the static entry point of the
	        # older youtube-transcript-api releases -- confirm the pinned version.
	        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

	        print("✅ Available transcript languages:")
	        for t in transcript_list:
	            print(f"- {t.language} ({t.language_code})")

	        # Step 2: honour the requested language, else fall back to the
	        # first transcript on offer.
	        try:
	            chosen = transcript_list.find_transcript([language])
	        except NoTranscriptFound:
	            chosen = next(iter(transcript_list), None)
	        if chosen is None:
	            print("❌ No transcript was found for this video.")
	            return ""

	        print(f"Language: {chosen.language}")
	        print(f"Language Code: {chosen.language_code}")

	        # Step 3: flatten the entries into text, which is what the caller
	        # splits and summarizes.  Entries are presumably dicts with a
	        # 'text' key in the API version this file targets; objects with a
	        # `.text` attribute are accepted too.
	        entries = chosen.fetch()
	        return " ".join(
	            entry["text"] if isinstance(entry, dict) else entry.text
	            for entry in entries
	        )
	    except TranscriptsDisabled:
	        print("❌ Transcripts are disabled for this video.")
	    except NoTranscriptFound:
	        print("❌ No transcript was found for this video.")
	    except VideoUnavailable:
	        print("❌ The video is unavailable.")
	    except Exception as e:
	        print(f"⚠️ An unexpected error occurred: {e}")
	    return ""




	_SUMMARIZER = None


	def _get_summarizer():
	    """Build the summarization pipeline on first use and reuse it afterwards."""
	    global _SUMMARIZER
	    if _SUMMARIZER is None:
	        # Imported lazily: nothing at the top of the file brings these
	        # names into scope, and loading them is only worthwhile once a
	        # summary is actually requested.
	        import torch
	        from transformers import pipeline

	        _SUMMARIZER = pipeline(
	            "summarization",
	            model="sshleifer/distilbart-cnn-12-6",
	            torch_dtype=torch.bfloat16,
	        )
	    return _SUMMARIZER


	def summarize_text(text):
	    """Summarize ``text`` with the distilbart-cnn summarization pipeline.

	    :param text: str, the text to summarize
	    :return: str summary; "" when ``text`` is empty or only whitespace;
	        None when summarization fails (the error is printed)
	    """
	    if not text or not text.strip():
	        # Nothing to summarize; do not load or invoke the model.
	        return ""
	    try:
	        text_summary = _get_summarizer()
	        # truncation=True keeps an over-long chunk from exceeding the
	        # model's input limit instead of erroring out.
	        output = text_summary(text, truncation=True)
	        return output[0]['summary_text']
	    except Exception as e:
	        print(f"Error summarizing text: {e}")
	        return None

	def split_text(text, chunk_size=3000):
	    """
	    Split large text into chunks of at most ``chunk_size`` characters.

	    Each chunk ends just after the last full stop that fits inside the
	    limit; when no full stop fits, the chunk is cut at exactly
	    ``chunk_size`` characters.

	    :param text: str, the text to be split (empty or None yields [])
	    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
	    :return: list of str, where each str is a non-empty chunk of text
	    :raises ValueError: if ``chunk_size`` is smaller than 1
	    """
	    if chunk_size < 1:
	        # A non-positive size can never consume any text, so the loop below
	        # would spin forever.
	        raise ValueError("chunk_size must be a positive integer")
	    if not text:
	        return []

	    chunks = []
	    while len(text) > chunk_size:
	        # Look only at the first chunk_size characters so a chunk that ends
	        # on the period never exceeds the limit.
	        period_at = text.rfind('.', 0, chunk_size)
	        cut = chunk_size if period_at == -1 else period_at + 1

	        # Intermediate chunks are kept verbatim (no stripping) so spacing
	        # inside the running text is preserved.
	        chunks.append(text[:cut])
	        text = text[cut:]

	    # Only the tail is stripped, and it is dropped if nothing but
	    # whitespace remains.
	    remainder = text.strip()
	    if remainder:
	        chunks.append(remainder)

	    return chunks


	def get_result(video_url, summarize=True):
	    """Return the summary or the full transcript of a YouTube video as text.

	    :param video_url: str, URL of the YouTube video
	    :param summarize: either a bool (False means "return the transcript") or
	        the label selected in the UI radio ("Summary" / "Full Transcript")
	    :return: str, the summary, the full transcript, or a short message when
	        no transcript or summary could be produced
	    :raises ValueError: propagated from YoutubeVideoID when the URL holds no
	        video ID
	    """
	    # Fetch transcript using the video ID
	    yt_video = YoutubeVideoID(video_url)
	    transcript = get_transcript(yt_video.video_id)
	    print(yt_video.video_id)
	    print(summarize)

	    # The transcript may arrive as ready-made text or as a list of entry
	    # dicts carrying a 'text' key; normalise to one string.
	    if isinstance(transcript, str):
	        transcript_text = transcript
	    else:
	        transcript_text = " ".join(entry["text"] for entry in (transcript or []))

	    if not transcript_text.strip():
	        return "No transcript could be retrieved for this video."

	    # The radio component passes its label, not a bool, so comparing the
	    # value with False alone would never select the transcript branch.
	    if isinstance(summarize, str):
	        wants_summary = summarize.strip().lower() != "full transcript"
	    else:
	        wants_summary = summarize is None or bool(summarize)

	    if not wants_summary:
	        return transcript_text

	    # Chunks whose summarization failed come back empty/None; skip them
	    # rather than letting str.join raise.
	    summaries = []
	    for chunk in split_text(transcript_text):
	        summary = summarize_text(chunk)
	        if summary:
	            summaries.append(summary)

	    if not summaries:
	        return "The transcript could not be summarized."

	    # Plain text is returned because the result is rendered in a Textbox.
	    return " ".join(summaries)


	def create_gradio_interface():
	    """Assemble the Gradio Blocks UI and return it without launching it.

	    Components are created in display order: title, description with
	    credits, URL textbox, output-type radio, result textbox and a
	    "Generate" button whose click calls ``get_result`` with the URL and
	    the selected output type.

	    :return: the constructed ``gr.Blocks`` instance
	    """
	    with gr.Blocks() as demo:
	        # Heading and description shown at the top of the page.
	        gr.Markdown("# YouTube Video Summarizer")
	        gr.Markdown("""
	This space provides summary of youtube video urls, you can also get full transcripts if you choose so.
	### Credits:
	Created by Arsh – Providing a simple solution for video summarization!
	""")

	        # Where the user pastes the video link.
	        url_box = gr.Textbox(lines=1, label="Enter YouTube URL")

	        # Summary vs. full transcript; "Summary" is preselected.
	        mode_radio = gr.Radio(
	            label="Choose Output Type",
	            choices=["Summary", "Full Transcript"],
	            value="Summary",
	        )

	        # Where the generated text is displayed.
	        result_box = gr.Textbox(lines=6, label="Result")

	        # Trigger for the request.
	        generate_btn = gr.Button("Generate", variant="primary")

	        # Wire the button: (url, output type) -> result textbox.
	        generate_btn.click(
	            fn=get_result,
	            inputs=[url_box, mode_radio],
	            outputs=[result_box],
	        )

	    return demo

	# Build the UI (including its credits section) and start serving it.
	demo = create_gradio_interface()
	demo.launch(show_api=True, share=True)