Spaces:
Build error
Build error
File size: 5,319 Bytes
3dfb32c bb29c0c 3dfb32c 7356e48 3dfb32c 7356e48 3dfb32c 53fda21 3dfb32c 698fffa 3dfb32c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
# imports
import os
import requests
from IPython.display import Markdown
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
import re
class YoutubeVideoID:
    """
    Holds a YouTube URL together with the 11-character video ID parsed from it.

    Attributes set by the constructor:
      - url: the URL exactly as it was passed in
      - video_id: the extracted video ID
    """

    # Scheme and host prefix shared by the additional patterns below; accepts
    # any subdomain (www., m., music., ...) and the youtube-nocookie.com host.
    _HOST = r"(?:https?://)?(?:[\w-]+\.)*youtube(?:-nocookie)?\.com/"

    # Patterns are tried in order. The first one is the original expression,
    # kept unchanged so every URL that was accepted before still yields the
    # same ID; the others only add URL shapes that used to be rejected.
    _URL_PATTERNS = (
        re.compile(
            r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|\S*\?v=)|(?:youtu\.be\/))([a-zA-Z0-9_-]{11})"
        ),
        # "v" anywhere in the query string, e.g. watch?feature=share&v=<id>
        re.compile(_HOST + r"\S*[?&]v=([a-zA-Z0-9_-]{11})"),
        # Path-style URLs: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
        re.compile(_HOST + r"(?:embed|shorts|live|v)/([a-zA-Z0-9_-]{11})"),
    )

    def __init__(self, url):
        self.url = url
        self.video_id = self.extract_video_id(url)

    def extract_video_id(self, url):
        """
        Extracts the YouTube video ID from a given URL.

        Supports regular watch URLs, shortened youtu.be URLs, embed/shorts/live
        URLs and subdomains such as m.youtube.com. Leading and trailing
        whitespace is ignored.

        :param url: str, the YouTube URL
        :return: str, the 11-character video ID
        :raises ValueError: if url is not a string or no video ID can be found
        """
        if not isinstance(url, str):
            raise ValueError("Invalid YouTube URL")

        # Pasted URLs often carry stray spaces or a trailing newline.
        candidate = url.strip()
        for pattern in self._URL_PATTERNS:
            match = pattern.match(candidate)
            if match:
                return match.group(1)
        raise ValueError("Invalid YouTube URL")

    def __str__(self):
        return f"Video ID: {self.video_id}"
def get_transcript(video_id, language='en'):
    """
    Fetches a transcript for a YouTube video.

    Lists the transcripts available for the video, picks the one whose
    language code equals ``language`` and, when there is none, falls back to
    the first transcript listed. Progress and errors are reported with print.

    :param video_id: str, the YouTube video ID
    :param language: str, preferred transcript language code (default 'en')
    :return: whatever ``YouTubeTranscriptApi.get_transcript`` returns on
        success (presumably a list of entries carrying 'text' and 'start',
        confirm against the installed library version), or an empty string
        when no transcript could be fetched
    """
    # The module-level import only brings in YouTubeTranscriptApi; without
    # these names the except clauses below would fail with NameError as soon
    # as any exception reached them.
    from youtube_transcript_api import (
        NoTranscriptFound,
        TranscriptsDisabled,
        VideoUnavailable,
    )

    try:
        # Step 1: list available transcripts (materialised once so the result
        # can be both printed and searched).
        available = list(YouTubeTranscriptApi.list_transcripts(video_id))
        print("✅ Available transcript languages:")
        for item in available:
            print(f"- {item.language} ({item.language_code})")

        if not available:
            print("❌ No transcript was found for this video.")
            return ""

        # Step 2: honour the requested language, otherwise use the first
        # transcript listed (the only behaviour the function had before).
        chosen = next(
            (item for item in available if item.language_code == language),
            available[0],
        )
        print(f"Language: {chosen.language}")
        print(f"Language Code: {chosen.language_code}")

        # Step 3: fetch the chosen transcript.
        return YouTubeTranscriptApi.get_transcript(
            video_id, languages=[chosen.language_code]
        )
    except TranscriptsDisabled:
        print("❌ Transcripts are disabled for this video.")
    except NoTranscriptFound:
        print("❌ No transcript was found for this video.")
    except VideoUnavailable:
        print("❌ The video is unavailable.")
    except Exception as e:
        # Best-effort boundary: any other failure is reported, not raised.
        print(f"⚠️ An unexpected error occurred: {e}")
    return ""
# Summarization pipeline shared by all calls; created on first use so the
# model is loaded once instead of once per chunk.
_SUMMARIZER = None


def _get_summarizer():
    """Returns the shared summarization pipeline, building it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        # Imported here because neither name is imported at module level.
        import torch
        from transformers import pipeline

        _SUMMARIZER = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            torch_dtype=torch.bfloat16,
        )
    return _SUMMARIZER


def summarize_text(text):
    """
    Summarizes a piece of text with the distilbart-cnn-12-6 model.

    :param text: str, the text to summarize
    :return: str, the generated summary, or None if summarization failed
        (the error is printed)
    """
    try:
        # Loading the model happens inside the try block so that a missing
        # dependency or a failed download is reported like any other error.
        output = _get_summarizer()(text)
        return output[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return None
def split_text(text, chunk_size=3000):
    """
    Splits large text into smaller chunks based on the given chunk size.

    Each chunk ends with a full stop where possible to maintain sentence
    integrity; when no full stop fits inside the limit the text is cut at
    exactly ``chunk_size`` characters. No chunk is longer than ``chunk_size``.
    The final chunk is stripped of surrounding whitespace and dropped if
    nothing remains.

    :param text: str, the text to be split
    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
    :return: list of str, where each str is a chunk of text
    :raises ValueError: if chunk_size is not positive
    """
    if chunk_size <= 0:
        # A non-positive size would never consume any text (endless loop).
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    while len(text) > chunk_size:
        # Last full stop whose sentence still fits: the period must sit at an
        # index below chunk_size so the chunk, period included, stays within
        # the limit.
        period_at = text.rfind('.', 0, chunk_size)
        cut = period_at + 1 if period_at != -1 else chunk_size
        chunks.append(text[:cut])
        text = text[cut:]

    # Add the remaining text as the final chunk, skipping a whitespace-only tail
    tail = text.strip()
    if tail:
        chunks.append(tail)
    return chunks
def _transcript_to_text(transcript):
    """Flattens a fetched transcript into one plain-text string."""
    if isinstance(transcript, str):
        return transcript
    pieces = []
    for entry in transcript or ():
        # Entries are presumably dicts with a 'text' key; objects exposing a
        # .text attribute are accepted as well. Confirm against the installed
        # transcript library version.
        if isinstance(entry, dict):
            piece = entry.get("text", "")
        else:
            piece = getattr(entry, "text", "")
        if piece:
            pieces.append(str(piece).strip())
    return " ".join(pieces)


def _wants_summary(summarize):
    """Interprets the summarize argument, given as a bool or a UI label."""
    if isinstance(summarize, str):
        # The radio button sends its label; only "Full Transcript" opts out.
        return summarize.strip().lower() != "full transcript"
    return summarize is None or bool(summarize)


def get_result(video_url, summarize=True):
    """
    Returns the summary, or the full transcript, of a YouTube video.

    :param video_url: str, the YouTube video URL
    :param summarize: bool, or the radio label "Summary" / "Full Transcript";
        a false value or "Full Transcript" returns the transcript unsummarized
    :return: str, the summary or the transcript text (empty when no transcript
        could be fetched)
    :raises ValueError: if video_url is not a recognised YouTube URL
    """
    # Fetch transcript using the video ID
    yt_video = YoutubeVideoID(video_url)
    transcript_text = _transcript_to_text(get_transcript(yt_video.video_id))

    if not _wants_summary(summarize):
        return transcript_text

    # Summarize chunk by chunk; chunks whose summarization failed (None) are
    # skipped so the remaining summaries can still be joined.
    summaries = [summarize_text(chunk) for chunk in split_text(transcript_text)]
    return " ".join(part for part in summaries if part)
def create_gradio_interface():
    """
    Builds the Gradio Blocks UI: a URL box, a Summary / Full Transcript
    selector, a result box and a Generate button wired to get_result.

    :return: the gr.Blocks instance, not yet launched
    """
    def _generate(video_url, output_choice):
        # get_result takes a boolean flag, while the radio sends its label.
        return get_result(video_url, summarize=(output_choice != "Full Transcript"))

    with gr.Blocks() as demo:
        gr.Markdown("# YouTube Video Summarizer")
        gr.Markdown(
            "This space provides summary of youtube video urls, you can also get "
            "full transcripts if you choose so.\n"
            "### Credits:\n"
            "Created by **Arsh** — Providing a simple solution for video summarization!"
        )

        # Input for YouTube URL
        video_url_input = gr.Textbox(label="Enter YouTube URL", lines=1)

        # Radio button for choosing output type (summary or full transcript)
        output_type = gr.Radio(
            choices=["Summary", "Full Transcript"],
            label="Choose Output Type",
            value="Summary",
        )

        # Output for summarized or full transcript text
        output_text = gr.Textbox(label="Result", lines=6)

        # Submit button
        submit_button = gr.Button("Generate", variant="primary")

        # Define the action for the button press
        submit_button.click(
            fn=_generate,
            inputs=[video_url_input, output_type],
            outputs=[output_text],
        )

    return demo
# Build the interface; `demo` stays available as a module-level name.
demo = create_gradio_interface()

# Start the server only when this file is run as a script, so that importing
# the module does not launch anything.
if __name__ == "__main__":
    demo.launch(share=True, show_api=True)