Spaces:
Build error
Build error
# imports | |
import os | |
import requests | |
from IPython.display import Markdown | |
import gradio as gr | |
from youtube_transcript_api import YouTubeTranscriptApi | |
import re | |
class YoutubeVideoID:
    """Parse a YouTube URL and expose the 11-character video ID it contains.

    Attributes:
        url: The URL exactly as it was passed to the constructor.
        video_id: The video ID extracted from ``url``.
    """

    # The pattern this class has always used. It is tried first so that every
    # URL that parsed before keeps producing exactly the same ID.
    _PRIMARY_PATTERN = re.compile(
        r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|\S*\?v=)|(?:youtu\.be\/))([a-zA-Z0-9_-]{11})"
    )

    # Fallback for URL shapes the primary pattern rejects: arbitrary
    # subdomains (m., music., ...), /embed/, /shorts/, /live/ and /v/ paths,
    # and a ``v=`` parameter that is not the first one in the query string.
    _FALLBACK_PATTERN = re.compile(
        r"(?:https?://)?(?:[\w-]+\.)*"
        r"(?:youtube(?:-nocookie)?\.com/(?:(?:embed|shorts|live|v)/|\S*?[?&]v=)"
        r"|youtu\.be/)"
        r"([a-zA-Z0-9_-]{11})"
    )

    def __init__(self, url):
        """Store ``url`` and extract its video ID.

        Raises:
            ValueError: If no video ID can be extracted from ``url``.
        """
        self.url = url
        self.video_id = self.extract_video_id(url)

    def extract_video_id(self, url):
        """Extract the YouTube video ID from ``url``.

        Supports regular (``watch?v=``), shortened (``youtu.be``), embed,
        shorts and live URLs, with or without a scheme or subdomain.
        Surrounding whitespace (e.g. from a pasted URL) is ignored.

        Args:
            url: The URL to parse.

        Returns:
            The 11-character video ID.

        Raises:
            ValueError: If ``url`` is not a string or is not a recognised
                YouTube URL.
        """
        if not isinstance(url, str):
            raise ValueError("Invalid YouTube URL")

        candidate = url.strip()
        for pattern in (self._PRIMARY_PATTERN, self._FALLBACK_PATTERN):
            match = pattern.match(candidate)
            if match:
                return match.group(1)
        raise ValueError("Invalid YouTube URL")

    def __str__(self):
        return f"Video ID: {self.video_id}"
def get_transcript(video_id, language='en'):
    """Fetch the transcript of a YouTube video, preferring ``language``.

    The available transcript languages are listed (and printed) first. The
    transcript is then requested with ``language`` as the first choice and
    every other listed language as a fallback, so a video without a
    transcript in ``language`` still yields one when any exists.

    Args:
        video_id: The YouTube video ID.
        language: Preferred transcript language code (default ``'en'``).

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns on success,
        or an empty string if the transcript could not be fetched. Failures
        are reported on stdout rather than raised.
    """
    # Imported here because the file-level imports only bring in
    # YouTubeTranscriptApi; without these names the ``except`` clauses below
    # would themselves fail with NameError as soon as any error occurred.
    from youtube_transcript_api import (
        NoTranscriptFound,
        TranscriptsDisabled,
        VideoUnavailable,
    )

    try:
        # Step 1: list the available transcripts.
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        print("β Available transcript languages:")
        available_codes = []
        for t in transcript_list:
            print(f"- {t.language} ({t.language_code})")
            available_codes.append(t.language_code)

        # Step 2: fetch, trying the requested language first and then the
        # remaining languages in the order they were listed.
        preferred_codes = [language] + [
            code for code in available_codes if code != language
        ]
        return YouTubeTranscriptApi.get_transcript(video_id, languages=preferred_codes)
    except TranscriptsDisabled:
        print("β Transcripts are disabled for this video.")
    except NoTranscriptFound:
        print("β No transcript was found for this video.")
    except VideoUnavailable:
        print("β The video is unavailable.")
    except Exception as e:
        # Best-effort boundary: any other failure is reported, not raised.
        print(f"β οΈ An unexpected error occurred: {e}")
    return ""
# Lazily created summarization pipeline, shared by every summarize_text call
# so the model is loaded once instead of once per chunk.
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        # Imported here because the file-level imports do not provide
        # ``pipeline`` or ``torch``, which this function has always relied on.
        import torch
        from transformers import pipeline

        _SUMMARIZER = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            torch_dtype=torch.bfloat16,
        )
    return _SUMMARIZER


def summarize_text(text):
    """Summarize ``text`` with the distilbart summarization model.

    Args:
        text: The text to summarize.

    Returns:
        The summary string, or ``None`` if loading the model or running it
        failed (the error is printed, not raised).
    """
    try:
        # The original passed an undefined name here instead of ``text``.
        output = _get_summarizer()(text)
        return output[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return None
def split_text(text, chunk_size=3000):
    """
    Splits large text into chunks of at most ``chunk_size`` characters.

    Each chunk ends just after the last full stop that fits inside the limit,
    so sentences stay intact where possible. When no full stop fits, the text
    is cut at exactly ``chunk_size`` characters. Leading whitespace of
    intermediate chunks is kept; the final chunk is stripped and is omitted
    entirely when only whitespace remains.

    :param text: str, the text to be split
    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
    :return: list of str, where each str is a chunk of text
    :raises ValueError: if chunk_size is not positive (no progress would be possible)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    while len(text) > chunk_size:
        # Only look at indices 0..chunk_size-1 so the chunk, full stop
        # included, never exceeds chunk_size and a found index can never be
        # mistaken for the "not found" fallback.
        last_period = text.rfind('.', 0, chunk_size)
        cut = last_period + 1 if last_period != -1 else chunk_size
        chunks.append(text[:cut])
        text = text[cut:]

    # Add whatever is left as the final chunk, unless it is only whitespace.
    tail = text.strip()
    if tail:
        chunks.append(tail)
    return chunks
def _transcript_to_text(transcript):
    """Flatten a transcript into one string; a string passes through unchanged."""
    if isinstance(transcript, str):
        return transcript
    # Otherwise treat it as an iterable of entries carrying a 'text' key.
    return " ".join(entry["text"] for entry in transcript)


def get_result(video_url, summarize=True):
    """Return the summary or the full transcript text for a YouTube video.

    Args:
        video_url: URL of the YouTube video.
        summarize: ``True`` / ``"Summary"`` to summarize the transcript;
            ``False`` / ``"Full Transcript"`` to return the transcript text
            unchanged. The string forms are the radio-button values the
            Gradio interface passes in.

    Returns:
        A plain string: the joined chunk summaries, or the transcript text.
        Empty when no transcript could be fetched.

    Raises:
        ValueError: If ``video_url`` is not a valid YouTube URL.
    """
    # Fetch transcript using the video ID
    yt_video = YoutubeVideoID(video_url)
    transcript_text = _transcript_to_text(get_transcript(yt_video.video_id))
    print(yt_video.video_id)
    print(summarize)

    # The UI sends the selected label, not a boolean, so accept both.
    if summarize in (False, "Full Transcript"):
        return transcript_text

    # Summarize chunk by chunk; a failed chunk yields None and is skipped
    # rather than crashing the join below.
    chunk_summaries = (summarize_text(chunk) for chunk in split_text(transcript_text))
    summaries = [summary for summary in chunk_summaries if summary]

    # Returned as str (not an IPython Markdown object) because the result is
    # rendered in a text box.
    return " ".join(summaries)
def create_gradio_interface():
    """Build the Gradio Blocks UI for the summarizer and return it.

    The page has a title and description, a URL text box, a radio button to
    pick between a summary and the full transcript, a result text box, and a
    button that calls ``get_result`` with the URL and the selected output
    type and writes the outcome into the result box.
    """
    description = """
This space provides summary of youtube video urls, you can also get full transcripts if you choose so.
### Credits:
Created by **Arsh** β Providing a simple solution for video summarization!
"""

    with gr.Blocks() as app:
        # Heading and description
        gr.Markdown("# YouTube Video Summarizer")
        gr.Markdown(description)

        # Inputs: the video URL and the kind of output wanted
        url_box = gr.Textbox(label="Enter YouTube URL", lines=1)
        mode_selector = gr.Radio(
            choices=["Summary", "Full Transcript"],
            label="Choose Output Type",
            value="Summary",
        )

        # Output area and the button that triggers generation
        result_box = gr.Textbox(label="Result", lines=6)
        generate_button = gr.Button("Generate", variant="primary")
        generate_button.click(
            fn=get_result,
            inputs=[url_box, mode_selector],
            outputs=[result_box],
        )

    return app
# Build the interface once at module level (kept under the name ``demo``)
# and start serving it with sharing and the API view switched on.
demo = create_gradio_interface()
demo.launch(
    show_api=True,
    share=True,
)