In [41]:
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.tokenize import TextTilingTokenizer  
import pandas as pd
import numpy as np
import requests
import json

url = "https://www.youtube.com/watch?v=77zvIYDFSok"
video_id = url.split("=")[1]

try:
    raw = YouTubeTranscriptApi.get_transcript(video_id)
except:
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    for transcript in transcript_list:
        raw = transcript.translate('en').fetch()
        break

response = requests.get(f"https://noembed.com/embed?dataType=json&url={url}")
data = json.loads(response.content)

title, author = data["title"], data["author_name"]

In [42]:
raw

[{'text': '[Music]', 'start': 2.19, 'duration': 3.5},
 {'text': '[Music]', 'start': 18.73, 'duration': 3.07},
 {'text': '[Applause]', 'start': 27.71, 'duration': 3.289},
 {'text': '[Music]', 'start': 33.68, 'duration': 7.01},
 {'text': '[Laughter] [Music] [Music] [', 'start': 36.05, 'duration': 4.64},
 {'text': 'Applause]', 'start': 59.97, 'duration': 3.2},
 {'text': '[Music]', 'start': 68.78, 'duration': 3.12},
 {'text': 'Recently, the', 'start': 72.18, 'duration': 3.0},
 {'text': "issue of sexual assault by celebrities has not stopped.  It's",
  'start': 79.26,
  'duration': 3.24},
 {'text': "true that it happened to me. It's",
  'start': 87.74,
  'duration': 3.96},
 {'text': 'reallyembarrassing', 'start': 96.299, 'duration': 3.721},
 {'text': "[Music] It's disastrous", 'start': 98.16, 'duration': 4.099},
 {'text': '[Music]', 'start': 102.36, 'duration': 3.18},
 {'text': "There's also a part where feminists are obsessed with men's genitals. I think their germinating",
  'start': 111.

In [43]:
# Convert the list of dictionaries to a pandas dataframe
df = pd.DataFrame(raw)

# Add end column
df['end'] = df['start'] + df['duration']

# Add a new column to the dataframe called 'total_words' that contains the total number of words so far in the transcript
df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()

# Add "\n\n" at the end of df["text"]
df["text"] = df["text"] + "\n\n"

In [44]:
# Merge the text column into a single string and save to a transcript variable

transcript = df['text'].str.cat(sep=' ')

In [45]:
transcript

"[Music]\n\n [Music]\n\n [Applause]\n\n [Music]\n\n [Laughter] [Music] [Music] [\n\n Applause]\n\n [Music]\n\n Recently, the\n\n issue of sexual assault by celebrities has not stopped.  It's\n\n true that it happened to me. It's\n\n reallyembarrassing\n\n [Music] It's disastrous\n\n [Music]\n\n There's also a part where feminists are obsessed with men's genitals. I think their germinating\n\n power is a really scary part.  I\n\n think this castration will happen more often as they get castrated.\n\n In fact, feminism was popular at the time,\n\n but thanks to its popularity,\n\n a lot of\n\n things happened, such as scolding, ridicule, insults, and\n\n insults against\n\n men.  I\n\n just couldn't stay there. Well, the\n\n pepper is 3 cm.\n\n Besides, all men are potential rape\n\n criminals. Men are useless. Men stopped\n\n trusting women.  If you\n\n reach there, you may be hit by the #MeToo movement, so I\n\n think there are a lot of them right now. I think\n\n there may be a little

In [11]:
tt = TextTilingTokenizer()

# Tokenize the transcript into segments using the TextTilingTokenizer
segments = tt.tokenize(transcript)

In [12]:
# # Remove \n\n from each segment
segments = [segment.replace('\n\n','').strip() for segment in segments]

In [13]:
# Calculate a list of word count for each segment
segments_wc = [len(segment.split()) for segment in segments]

# Make it cumulative
segments_wc = np.cumsum(segments_wc)

In [14]:
def to_timestamp(seconds):
    seconds = int(seconds)

    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds_remaining = seconds % 60
    
    if seconds >= 3600:
        return f"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}"
    else:
        return f"{minutes:02d}:{seconds_remaining:02d}"

In [15]:
to_timestamp(100)

'01:40'

In [16]:
# For each value in segments_wc, get the index of the closest value in df['total_words']
# This will be the index of the row in df that is closest to the end of each segment
idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]

# Get segment end times from idx
segment_end_times = df['end'].iloc[idx].values

# Add 0.0 to the beginning of segment_end_times
segment_end_times = np.insert(segment_end_times, 0, 0.0)

# segment_times is a list of tuples containing the start and end times of each segment
segment_times = [(to_timestamp(segment_end_times[i-1]), to_timestamp(segment_end_times[i])) for i in range(1,len(segment_end_times))]

In [22]:
# At the beginning of each segment, add the title, author, and segment times
segments_times = [f"({to_timestamp(segment_end_times[i-1])}, {to_timestamp(segment_end_times[i])})" for i in range(1,len(segment_end_times))]

In [23]:
segments_times

['(00:00, 00:48)',
 '(00:48, 01:10)',
 '(01:10, 01:46)',
 '(01:46, 02:26)',
 '(02:26, 02:57)',
 '(02:57, 03:25)',
 '(03:25, 04:11)',
 '(04:11, 04:41)',
 '(04:41, 05:26)',
 '(05:26, 05:45)',
 '(05:45, 06:13)',
 '(06:13, 06:40)',
 '(06:40, 07:02)',
 '(07:02, 07:54)',
 '(07:54, 08:17)',
 '(08:17, 09:24)',
 '(09:24, 10:10)',
 '(10:10, 11:02)',
 '(11:02, 11:47)',
 '(11:47, 12:09)',
 '(12:09, 12:52)',
 '(12:52, 13:50)',
 '(13:50, 14:15)',
 '(14:15, 14:38)',
 '(14:38, 16:14)',
 '(16:14, 17:16)',
 '(17:16, 17:47)',
 '(17:47, 18:17)',
 '(18:17, 18:56)',
 '(18:56, 19:31)',
 '(19:31, 19:52)',
 '(19:52, 21:03)',
 '(21:03, 21:39)',
 '(21:39, 22:08)',
 '(22:08, 22:42)',
 '(22:42, 23:35)',
 '(23:35, 24:51)',
 '(24:51, 26:01)',
 '(26:01, 26:28)',
 '(26:28, 26:57)',
 '(26:57, 28:37)',
 '(28:37, 29:00)',
 '(29:00, 29:50)',
 '(29:50, 30:12)',
 '(30:12, 30:55)',
 '(30:55, 31:47)',
 '(31:47, 32:54)',
 '(32:54, 33:33)',
 '(33:33, 33:50)',
 '(33:50, 34:20)',
 '(34:20, 34:48)',
 '(34:48, 35:22)',
 '(35:22, 36

In [72]:
text = '''
Segment from 'Feminism Is 'Dividing This'' Country' by VICE News
Timestamp: (10:51, 12:24)
---
personally take while leading this group, let's create a world where feminists don't have to choose feminism.  I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this.  I think there are so many messages in this very short video. First of all, I think there's
---
'''

# Get the title and timestamp from the text
import re

# define regular expression patterns
title_pattern = r"Segment from '(.+)'"
timestamp_pattern = r"Timestamp: \((.+)\)"

# search for title, source, and timestamp using regular expressions
title = re.search(title_pattern, text).group(1)
start_timestamp = re.search(timestamp_pattern, text).group(1).split(",")[0]

url = f"URL: https://www.youtube.com/watch?v={video_id}&t={start_timestamp}"

# Add url in text before first "---"
text = re.sub(r"---", f"{url}\n---", text, count=1)


In [73]:
text

"\nSegment from 'Feminism Is 'Dividing This'' Country' by VICE News\nTimestamp: (10:51, 12:24)\nURL: https://www.youtube.com/watch?v=77zvIYDFSok&t=10:51 \n---\npersonally take while leading this group, let's create a world where feminists don't have to choose feminism.  I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this.  I think there are so many messages in this very short video. First of all, I think there's\n---\n"