File size: 5,319 Bytes
3dfb32c
 
 
 
 
 
bb29c0c
3dfb32c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7356e48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3dfb32c
7356e48
 
 
3dfb32c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53fda21
 
3dfb32c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698fffa
3dfb32c
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# imports

import functools
import os
import re

import gradio as gr
import requests
import torch
from IPython.display import Markdown
from transformers import pipeline
from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable,
    YouTubeTranscriptApi,
)

class YoutubeVideoID:
    """Pair a YouTube URL with the 11-character video ID parsed from it.

    ``url`` keeps the value exactly as it was passed in; ``video_id`` holds
    the ID returned by :meth:`extract_video_id`. Construction raises
    ``ValueError`` when no ID can be found.
    """

    # Original pattern. It is tried first so that every URL that was accepted
    # before still resolves to exactly the same ID.
    _PRIMARY_PATTERN = re.compile(
        r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|\S*\?v=)|(?:youtu\.be\/))([a-zA-Z0-9_-]{11})"
    )

    # Fallback for common link shapes the primary pattern rejects: mobile and
    # music hosts, `v=` that is not the first query parameter, and the
    # /embed/, /shorts/, /live/ and /v/ path forms.
    _FALLBACK_PATTERN = re.compile(
        r"(?:https?://)?(?:(?:www|m|music)\.)?"
        r"(?:youtube(?:-nocookie)?\.com/(?:watch\?(?:\S*?&)?v=|embed/|shorts/|live/|v/)"
        r"|youtu\.be/)"
        r"([a-zA-Z0-9_-]{11})"
    )

    def __init__(self, url):
        self.url = url
        self.video_id = self.extract_video_id(url)

    def extract_video_id(self, url):
        """
        Extract the YouTube video ID from a URL.

        Supports regular watch URLs, shortened youtu.be links, and the
        embed/shorts/live path forms. Surrounding whitespace (for example
        from a copy-paste into a text box) is ignored.

        :param url: str, the URL to parse
        :return: str, the 11-character video ID
        :raises ValueError: if ``url`` is not a string or contains no
            recognizable video ID
        """
        if not isinstance(url, str):
            raise ValueError("Invalid YouTube URL")

        candidate = url.strip()
        for pattern in (self._PRIMARY_PATTERN, self._FALLBACK_PATTERN):
            match = pattern.match(candidate)
            if match:
                return match.group(1)

        raise ValueError("Invalid YouTube URL")

    def __str__(self):
        return f"Video ID: {self.video_id}"

def get_transcript(video_id, language='en'):
    """
    Fetch a transcript for a YouTube video.

    Lists the transcripts available for ``video_id``, prints their languages,
    and fetches the one whose language code equals ``language``. When that
    language is not offered, the first listed transcript is used instead so
    that the caller still gets some text to work with.

    :param video_id: str, the YouTube video ID
    :param language: str, preferred transcript language code (default 'en')
    :return: whatever ``YouTubeTranscriptApi.get_transcript`` returns for the
        chosen language (a list of entries carrying 'text' and 'start'), or
        an empty string when no transcript could be fetched
    """
    try:
        # Materialize once so the listing and the selection share one pass.
        available = list(YouTubeTranscriptApi.list_transcripts(video_id))

        print("✅ Available transcript languages:")
        for candidate in available:
            print(f"- {candidate.language} ({candidate.language_code})")

        if not available:
            return ""

        # Honor the requested language; fall back to the first listing.
        chosen = next(
            (t for t in available if t.language_code == language),
            available[0],
        )
        print(f"Language: {chosen.language}")
        print(f"Language Code: {chosen.language_code}")
        return YouTubeTranscriptApi.get_transcript(video_id, languages=[chosen.language_code])
    except TranscriptsDisabled:
        print("❌ Transcripts are disabled for this video.")
    except NoTranscriptFound:
        print("❌ No transcript was found for this video.")
    except VideoUnavailable:
        print("❌ The video is unavailable.")
    except Exception as e:
        # Best-effort boundary: report and fall through to the empty result.
        print(f"⚠️ An unexpected error occurred: {e}")
    return ""




@functools.lru_cache(maxsize=1)
def _load_summarizer():
    """Build the summarization pipeline once and reuse it for every chunk."""
    return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", torch_dtype=torch.bfloat16)


def summarize_text(text):
    """
    Summarize a piece of text with the distilbart-cnn summarization model.

    :param text: str, the text to summarize
    :return: str summary, or None when the model could not be loaded or the
        summarization failed (the error is printed)
    """
    try:
        # The pipeline is cached: loading the model per chunk is very slow.
        # A failed load is not cached, so the next call retries it.
        summarizer = _load_summarizer()
        output = summarizer(text)
        return output[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return None

def split_text(text, chunk_size=3000):
    """
    Split large text into chunks of at most ``chunk_size`` characters.

    Each chunk ends right after the last full stop that fits inside the
    limit; when no full stop fits, the text is cut at exactly ``chunk_size``
    characters. Intermediate chunks are returned as-is; the final chunk is
    stripped and dropped if nothing but whitespace remains.

    :param text: str, the text to be split (empty or None yields [])
    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
    :return: list of str, where each str is a chunk of text
    :raises ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        # A non-positive size can never consume input and would loop forever.
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    remaining = text or ""
    while len(remaining) > chunk_size:
        # Search only the first chunk_size characters so a chunk that ends on
        # a period still respects the size limit.
        last_period = remaining.rfind('.', 0, chunk_size)
        cut = chunk_size if last_period == -1 else last_period + 1
        chunks.append(remaining[:cut])
        remaining = remaining[cut:]

    # Strip before testing so a whitespace-only tail adds no empty chunk.
    tail = remaining.strip()
    if tail:
        chunks.append(tail)

    return chunks


def _transcript_to_text(transcript):
    """Flatten a fetched transcript (string or list of entries) into one string."""
    if not transcript:
        return ""
    if isinstance(transcript, str):
        return transcript
    pieces = (
        entry.get("text", "") if isinstance(entry, dict) else getattr(entry, "text", "")
        for entry in transcript
    )
    return " ".join(piece.strip() for piece in pieces if piece and piece.strip())


def get_result(video_url, summarize=True):
    """
    Produce the text shown for a video: a summary or the full transcript.

    :param video_url: str, a YouTube URL accepted by YoutubeVideoID
    :param summarize: True (or the UI label "Summary") to summarize; False
        (or the UI label "Full Transcript") to return the transcript itself
    :return: str, plain text suitable for a text box; empty when no
        transcript could be fetched
    :raises ValueError: if the URL contains no recognizable video ID
    """
    # Fetch transcript using the video ID
    yt_video = YoutubeVideoID(video_url)
    # The fetched transcript is a list of entries; everything below needs text.
    transcript_text = _transcript_to_text(get_transcript(yt_video.video_id))
    print(yt_video.video_id)
    print(summarize)

    # The UI radio button sends its label rather than a boolean.
    if summarize in (False, "Full Transcript"):
        return transcript_text

    summaries = [summarize_text(chunk) for chunk in split_text(transcript_text)]
    # summarize_text yields None for chunks it could not process; skip those
    # instead of letting join() fail on the whole result.
    return " ".join(summary for summary in summaries if summary)


def create_gradio_interface():
    """
    Build the Gradio Blocks UI for the summarizer.

    The layout is a URL text box, a radio button choosing between a summary
    and the full transcript, a result text box, and a "Generate" button that
    runs get_result with the two inputs.

    :return: the gr.Blocks app, not yet launched
    """
    def _on_generate(video_url, choice):
        # The Radio emits its label, while get_result expects a boolean flag.
        return get_result(video_url, summarize=(choice != "Full Transcript"))

    with gr.Blocks() as demo:
        gr.Markdown("# YouTube Video Summarizer")
        gr.Markdown("""
        This space provides summary of youtube video urls, you can also get full transcripts if you choose so.
        ### Credits:
        Created by **Arsh** – Providing a simple solution for video summarization!
        """)

        # Input for YouTube URL
        video_url_input = gr.Textbox(label="Enter YouTube URL", lines=1)

        # Radio button for choosing output type (summary or full transcript)
        output_type = gr.Radio(choices=["Summary", "Full Transcript"], label="Choose Output Type", value="Summary")

        # Output for summarized or full transcript text
        output_text = gr.Textbox(label="Result", lines=6)

        # Submit button
        submit_button = gr.Button("Generate", variant="primary")

        # Define the action for the button press. api_name keeps the endpoint
        # name that was previously derived from passing get_result directly.
        submit_button.click(fn=_on_generate,
                            inputs=[video_url_input, output_type],
                            outputs=[output_text],
                            api_name="get_result")

    return demo

# Build the Blocks app when the module is executed, then start serving it;
# the share and show_api flags are forwarded to launch() as given.
demo = create_gradio_interface()
demo.launch(show_api=True, share=True)