wendru18 commited on
Commit
87ee9d7
·
1 Parent(s): 46b6387

updated notebook and fixed handling of the ' character in app

Browse files
Files changed (2) hide show
  1. app.py +4 -0
  2. notebook.ipynb +327 -463
app.py CHANGED
@@ -49,6 +49,10 @@ def get_youtube_data(url):
49
 
50
  title, author = data["title"], data["author_name"]
51
 
 
 
 
 
52
  df = pd.DataFrame(raw)
53
 
54
  df['end'] = df['start'] + df['duration']
 
49
 
50
  title, author = data["title"], data["author_name"]
51
 
52
+ # ' is a reserved character
53
+ title = title.replace("'", "")
54
+ author = author.replace("'", "")
55
+
56
  df = pd.DataFrame(raw)
57
 
58
  df['end'] = df['start'] + df['duration']
notebook.ipynb CHANGED
@@ -2,352 +2,70 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 16,
6
  "metadata": {},
7
- "outputs": [
8
- {
9
- "name": "stdout",
10
- "output_type": "stream",
11
- "text": [
12
- "No transcript found\n"
13
- ]
14
- }
15
- ],
16
  "source": [
17
  "from youtube_transcript_api import YouTubeTranscriptApi\n",
18
- "from nltk.tokenize import TextTilingTokenizer \n",
 
 
19
  "import pandas as pd\n",
 
20
  "import numpy as np\n",
21
  "import requests\n",
 
 
22
  "import json\n",
23
- "\n",
24
- "url = \"https://www.youtube.com/watch?v=z7-K1zmBu-8\"\n",
25
- "video_id = url.split(\"=\")[1]\n",
26
- "\n",
27
- "try:\n",
28
- " raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
29
- "except:\n",
30
- " try:\n",
31
- " transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
32
- " for transcript in transcript_list:\n",
33
- " raw = transcript.translate('en').fetch()\n",
34
- " break\n",
35
- " except:\n",
36
- " print(\"No transcript found\")\n",
37
- " raw = []\n",
38
- "\n",
39
- "response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
40
- "data = json.loads(response.content)\n",
41
- "\n",
42
- "title, author = data[\"title\"], data[\"author_name\"]"
43
- ]
44
- },
45
- {
46
- "cell_type": "code",
47
- "execution_count": 42,
48
- "metadata": {},
49
- "outputs": [
50
- {
51
- "data": {
52
- "text/plain": [
53
- "[{'text': '[Music]', 'start': 2.19, 'duration': 3.5},\n",
54
- " {'text': '[Music]', 'start': 18.73, 'duration': 3.07},\n",
55
- " {'text': '[Applause]', 'start': 27.71, 'duration': 3.289},\n",
56
- " {'text': '[Music]', 'start': 33.68, 'duration': 7.01},\n",
57
- " {'text': '[Laughter] [Music] [Music] [', 'start': 36.05, 'duration': 4.64},\n",
58
- " {'text': 'Applause]', 'start': 59.97, 'duration': 3.2},\n",
59
- " {'text': '[Music]', 'start': 68.78, 'duration': 3.12},\n",
60
- " {'text': 'Recently, the', 'start': 72.18, 'duration': 3.0},\n",
61
- " {'text': \"issue of sexual assault by celebrities has not stopped. It's\",\n",
62
- " 'start': 79.26,\n",
63
- " 'duration': 3.24},\n",
64
- " {'text': \"true that it happened to me. It's\",\n",
65
- " 'start': 87.74,\n",
66
- " 'duration': 3.96},\n",
67
- " {'text': 'reallyembarrassing', 'start': 96.299, 'duration': 3.721},\n",
68
- " {'text': \"[Music] It's disastrous\", 'start': 98.16, 'duration': 4.099},\n",
69
- " {'text': '[Music]', 'start': 102.36, 'duration': 3.18},\n",
70
- " {'text': \"There's also a part where feminists are obsessed with men's genitals. I think their germinating\",\n",
71
- " 'start': 111.6,\n",
72
- " 'duration': 4.5},\n",
73
- " {'text': 'power is a really scary part. I',\n",
74
- " 'start': 113.82,\n",
75
- " 'duration': 4.259},\n",
76
- " {'text': 'think this castration will happen more often as they get castrated.',\n",
77
- " 'start': 121.259,\n",
78
- " 'duration': 3.441},\n",
79
- " {'text': 'In fact, feminism was popular at the time,',\n",
80
- " 'start': 133.58,\n",
81
- " 'duration': 7.659},\n",
82
- " {'text': 'but thanks to its popularity,', 'start': 137.58, 'duration': 8.04},\n",
83
- " {'text': 'a lot of', 'start': 141.239, 'duration': 8.041},\n",
84
- " {'text': 'things happened, such as scolding, ridicule, insults, and',\n",
85
- " 'start': 145.62,\n",
86
- " 'duration': 5.04},\n",
87
- " {'text': 'insults against', 'start': 149.28, 'duration': 2.28},\n",
88
- " {'text': 'men. I', 'start': 150.66, 'duration': 2.7},\n",
89
- " {'text': \"just couldn't stay there. Well, the\",\n",
90
- " 'start': 153.36,\n",
91
- " 'duration': 5.459},\n",
92
- " {'text': 'pepper is 3 cm.', 'start': 160.2, 'duration': 5.58},\n",
93
- " {'text': 'Besides, all men are potential rape',\n",
94
- " 'start': 162.36,\n",
95
- " 'duration': 7.019},\n",
96
- " {'text': 'criminals. Men are useless. Men stopped',\n",
97
- " 'start': 165.78,\n",
98
- " 'duration': 6.599},\n",
99
- " {'text': 'trusting women. If you', 'start': 176.34, 'duration': 4.92},\n",
100
- " {'text': 'reach there, you may be hit by the #MeToo movement, so I',\n",
101
- " 'start': 184.92,\n",
102
- " 'duration': 3.86},\n",
103
- " {'text': 'think there are a lot of them right now. I think',\n",
104
- " 'start': 200.28,\n",
105
- " 'duration': 2.539},\n",
106
- " {'text': 'there may be a little more than in other countries.',\n",
107
- " 'start': 221.76,\n",
108
- " 'duration': 4.8},\n",
109
- " {'text': '[Applause]', 'start': 238.29, 'duration': 3.23},\n",
110
- " {'text': '[Music]', 'start': 243.27, 'duration': 7.169},\n",
111
- " {'text': 'Personally, I', 'start': 245.78, 'duration': 8.019},\n",
112
- " {'text': \"would say it's content that only has these emotions. As an\",\n",
113
- " 'start': 250.439,\n",
114
- " 'duration': 6.061},\n",
115
- " {'text': 'example, I said that we need to strongly pass the anti-discrimination law, but',\n",
116
- " 'start': 253.799,\n",
117
- " 'duration': 4.801},\n",
118
- " {'text': 'this is actually an expression', 'start': 258.6, 'duration': 3.89},\n",
119
- " {'text': 'dictatorship class. Guys', 'start': 259.919, 'duration': 5.111},\n",
120
- " {'text': '[Applause]', 'start': 262.49, 'duration': 4.149},\n",
121
- " {'text': '[Music] I think you', 'start': 265.03, 'duration': 3.949},\n",
122
- " {'text': \"'re talking a lot, but I want to hear it. Oh, you're not going to listen. You're so mean.\",\n",
123
- " 'start': 268.979,\n",
124
- " 'duration': 5.041},\n",
125
- " {'text': 'For example, I went to the bathroom, and I',\n",
126
- " 'start': 295.02,\n",
127
- " 'duration': 2.7},\n",
128
- " {'text': 'saw that it was unisex, but I want it to be safe with the door open. I',\n",
129
- " 'start': 297.72,\n",
130
- " 'duration': 4.8},\n",
131
- " {'text': 'think I was able to see well how men and women cut off harmony at the source. My',\n",
132
- " 'start': 354.18,\n",
133
- " 'duration': 2.6},\n",
134
- " {'text': 'girlfriend is', 'start': 357.96, 'duration': 4.5},\n",
135
- " {'text': 'not satisfied with something like that.',\n",
136
- " 'start': 359.759,\n",
137
- " 'duration': 4.081},\n",
138
- " {'text': 'Could anyone who really thought about gender equality talk about peeing and taking a shower? I think',\n",
139
- " 'start': 371.539,\n",
140
- " 'duration': 3.541},\n",
141
- " {'text': '[Music]', 'start': 378.61, 'duration': 3.159},\n",
142
- " {'text': 'In fact, many companies are', 'start': 384.479, 'duration': 3.321},\n",
143
- " {'text': 'paying for it,', 'start': 388.979, 'duration': 4.44},\n",
144
- " {'text': 'or in the labor market, there', 'start': 390.12, 'duration': 6.74},\n",
145
- " {'text': 'is employment discrimination without any reason for gender inequality.',\n",
146
- " 'start': 393.419,\n",
147
- " 'duration': 3.441},\n",
148
- " {'text': 'Evil men must disappear', 'start': 450.539, 'duration': 7.141},\n",
149
- " {'text': 'Because of these millitons, there are many such people',\n",
150
- " 'start': 454.56,\n",
151
- " 'duration': 7.139},\n",
152
- " {'text': 'in Korean society, and Ha Tae-kyung is a',\n",
153
- " 'start': 457.68,\n",
154
- " 'duration': 4.019},\n",
155
- " {'text': 'representative abolitionist. It',\n",
156
- " 'start': 472.979,\n",
157
- " 'duration': 7.521},\n",
158
- " {'text': 'means that I oppose giving privileges to Han Gender.',\n",
159
- " 'start': 503.099,\n",
160
- " 'duration': 5.641},\n",
161
- " {'text': 'Korean feminists are now', 'start': 504.3, 'duration': 8.58},\n",
162
- " {'text': 'understood as', 'start': 508.74, 'duration': 6.5},\n",
163
- " {'text': 'discriminatory against men. In such a society,',\n",
164
- " 'start': 528.42,\n",
165
- " 'duration': 4.56},\n",
166
- " {'text': 'I', 'start': 534.8, 'duration': 6.479},\n",
167
- " {'text': 'think we should pay more attention to the serious discrimination against women.',\n",
168
- " 'start': 537.18,\n",
169
- " 'duration': 4.099},\n",
170
- " {'text': 'Recently,', 'start': 544.08, 'duration': 3.0},\n",
171
- " {'text': 'when I look at the political situation in Korea, I feel that it is now retreating.',\n",
172
- " 'start': 551.459,\n",
173
- " 'duration': 4.741},\n",
174
- " {'text': '19% of female lawmakers in the National Assembly',\n",
175
- " 'start': 559.62,\n",
176
- " 'duration': 4.44},\n",
177
- " {'text': 'are now 19%.', 'start': 561.06, 'duration': 3.0},\n",
178
- " {'text': 'Why do people who send me messages like this send me messages like this',\n",
179
- " 'start': 592.14,\n",
180
- " 'duration': 3.84},\n",
181
- " {'text': 'when they come every day? It', 'start': 593.76, 'duration': 4.92},\n",
182
- " {'text': \"'s necessary, but it seems that there\",\n",
183
- " 'start': 615.3,\n",
184
- " 'duration': 3.479},\n",
185
- " {'text': 'are many cases where the target is directed at women,',\n",
186
- " 'start': 618.779,\n",
187
- " 'duration': 3.74},\n",
188
- " {'text': \"but if there's a motto that I\", 'start': 645.42, 'duration': 5.82},\n",
189
- " {'text': \"personally take while leading this group, let's\",\n",
190
- " 'start': 647.12,\n",
191
- " 'duration': 5.32},\n",
192
- " {'text': \"create a world where feminists don't have to choose feminism. I\",\n",
193
- " 'start': 657.899,\n",
194
- " 'duration': 3.721},\n",
195
- " {'text': 'choose', 'start': 665.339, 'duration': 5.161},\n",
196
- " {'text': \"feminism because I think you're watching.\",\n",
197
- " 'start': 676.019,\n",
198
- " 'duration': 4.081},\n",
199
- " {'text': 'As a person, I live to protect the woman I love.',\n",
200
- " 'start': 686.959,\n",
201
- " 'duration': 5.701},\n",
202
- " {'text': \"I think I'm about the level of a director who creates a hero.\",\n",
203
- " 'start': 697.82,\n",
204
- " 'duration': 6.94},\n",
205
- " {'text': 'Well,', 'start': 701.04, 'duration': 3.72},\n",
206
- " {'text': 'one day,', 'start': 707.16, 'duration': 10.41},\n",
207
- " {'text': '[Music] We were', 'start': 727.75, 'duration': 4.46},\n",
208
- " {'text': 'humiliated like', 'start': 730.2, 'duration': 5.139},\n",
209
- " {'text': 'this. I', 'start': 736.7, 'duration': 4.84},\n",
210
- " {'text': \"think there are so many messages in this very short video. First of all, I think there's\",\n",
211
- " 'start': 741.54,\n",
212
- " 'duration': 2.64},\n",
213
- " {'text': 'enough room for it to be interpreted as',\n",
214
- " 'start': 744.18,\n",
215
- " 'duration': 2.54},\n",
216
- " {'text': 'sexual', 'start': 756.36, 'duration': 4.979},\n",
217
- " {'text': 'harassment.', 'start': 762.66, 'duration': 5.1},\n",
218
- " {'text': 'But I really', 'start': 777.42, 'duration': 4.38},\n",
219
- " {'text': 'had no intention of interfering with the event.',\n",
220
- " 'start': 780.3,\n",
221
- " 'duration': 3.42},\n",
222
- " {'text': 'It was a chance to share, but I',\n",
223
- " 'start': 791.04,\n",
224
- " 'duration': 7.28},\n",
225
- " {'text': \"think I couldn't give you a good answer after hearing that conversation.\",\n",
226
- " 'start': 795.12,\n",
227
- " 'duration': 3.2},\n",
228
- " {'text': '[Music] I was', 'start': 802.19, 'duration': 2.649},\n",
229
- " {'text': 'very surprised. I', 'start': 803.76, 'duration': 2.04},\n",
230
- " {'text': 'went with my faith,', 'start': 805.8, 'duration': 4.26},\n",
231
- " {'text': 'but', 'start': 819.06, 'duration': 2.6},\n",
232
- " {'text': \"I felt completely betrayed. It's\",\n",
233
- " 'start': 822.019,\n",
234
- " 'duration': 3.991},\n",
235
- " {'text': 'just', 'start': 824.16, 'duration': 4.91},\n",
236
- " {'text': '[Music] [', 'start': 826.01, 'duration': 3.06},\n",
237
- " {'text': 'Music] If', 'start': 831.75, 'duration': 3.09},\n",
238
- " {'text': 'you', 'start': 839.88, 'duration': 5.04},\n",
239
- " {'text': \"feel sexually shameful, that's\", 'start': 842.04, 'duration': 5.7},\n",
240
- " {'text': \"sexual harassment. In Korea, that's the\",\n",
241
- " 'start': 847.74,\n",
242
- " 'duration': 5.06},\n",
243
- " {'text': \"case. It's\", 'start': 853.399, 'duration': 6.641},\n",
244
- " {'text': 'scary to lose everything and become something really different only from the genitals',\n",
245
- " 'start': 864.36,\n",
246
- " 'duration': 5.59},\n",
247
- " {'text': '[Music]', 'start': 864.95, 'duration': 5.0},\n",
248
- " {'text': '[Music]', 'start': 888.91, 'duration': 3.09}]"
249
- ]
250
- },
251
- "execution_count": 42,
252
- "metadata": {},
253
- "output_type": "execute_result"
254
- }
255
- ],
256
- "source": [
257
- "raw"
258
  ]
259
  },
260
  {
261
  "cell_type": "code",
262
- "execution_count": 43,
263
  "metadata": {},
264
  "outputs": [],
265
  "source": [
266
- "# Convert the list of dictionaries to a pandas dataframe\n",
267
- "df = pd.DataFrame(raw)\n",
 
 
268
  "\n",
269
- "# Add end column\n",
270
- "df['end'] = df['start'] + df['duration']\n",
271
  "\n",
272
- "# Add a new column to the dataframe called 'total_words' that contains the total number of words so far in the transcript\n",
273
- "df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()\n",
274
  "\n",
275
- "# Add \"\\n\\n\" at the end of df[\"text\"]\n",
276
- "df[\"text\"] = df[\"text\"] + \"\\n\\n\""
277
- ]
278
- },
279
- {
280
- "cell_type": "code",
281
- "execution_count": 44,
282
- "metadata": {},
283
- "outputs": [],
284
- "source": [
285
- "# Merge the text column into a single string and save to a transcript variable\n",
286
  "\n",
287
- "transcript = df['text'].str.cat(sep=' ')"
288
- ]
289
- },
290
- {
291
- "cell_type": "code",
292
- "execution_count": 45,
293
- "metadata": {},
294
- "outputs": [
295
- {
296
- "data": {
297
- "text/plain": [
298
- "\"[Music]\\n\\n [Music]\\n\\n [Applause]\\n\\n [Music]\\n\\n [Laughter] [Music] [Music] [\\n\\n Applause]\\n\\n [Music]\\n\\n Recently, the\\n\\n issue of sexual assault by celebrities has not stopped. It's\\n\\n true that it happened to me. It's\\n\\n reallyembarrassing\\n\\n [Music] It's disastrous\\n\\n [Music]\\n\\n There's also a part where feminists are obsessed with men's genitals. I think their germinating\\n\\n power is a really scary part. I\\n\\n think this castration will happen more often as they get castrated.\\n\\n In fact, feminism was popular at the time,\\n\\n but thanks to its popularity,\\n\\n a lot of\\n\\n things happened, such as scolding, ridicule, insults, and\\n\\n insults against\\n\\n men. I\\n\\n just couldn't stay there. Well, the\\n\\n pepper is 3 cm.\\n\\n Besides, all men are potential rape\\n\\n criminals. Men are useless. Men stopped\\n\\n trusting women. If you\\n\\n reach there, you may be hit by the #MeToo movement, so I\\n\\n think there are a lot of them right now. I think\\n\\n there may be a little more than in other countries.\\n\\n [Applause]\\n\\n [Music]\\n\\n Personally, I\\n\\n would say it's content that only has these emotions. As an\\n\\n example, I said that we need to strongly pass the anti-discrimination law, but\\n\\n this is actually an expression\\n\\n dictatorship class. Guys\\n\\n [Applause]\\n\\n [Music] I think you\\n\\n 're talking a lot, but I want to hear it. Oh, you're not going to listen. You're so mean.\\n\\n For example, I went to the bathroom, and I\\n\\n saw that it was unisex, but I want it to be safe with the door open. I\\n\\n think I was able to see well how men and women cut off harmony at the source. My\\n\\n girlfriend is\\n\\n not satisfied with something like that.\\n\\n Could anyone who really thought about gender equality talk about peeing and taking a shower? 
I think\\n\\n [Music]\\n\\n In fact, many companies are\\n\\n paying for it,\\n\\n or in the labor market, there\\n\\n is employment discrimination without any reason for gender inequality.\\n\\n Evil men must disappear\\n\\n Because of these millitons, there are many such people\\n\\n in Korean society, and Ha Tae-kyung is a\\n\\n representative abolitionist. It\\n\\n means that I oppose giving privileges to Han Gender.\\n\\n Korean feminists are now\\n\\n understood as\\n\\n discriminatory against men. In such a society,\\n\\n I\\n\\n think we should pay more attention to the serious discrimination against women.\\n\\n Recently,\\n\\n when I look at the political situation in Korea, I feel that it is now retreating.\\n\\n 19% of female lawmakers in the National Assembly\\n\\n are now 19%.\\n\\n Why do people who send me messages like this send me messages like this\\n\\n when they come every day? It\\n\\n 's necessary, but it seems that there\\n\\n are many cases where the target is directed at women,\\n\\n but if there's a motto that I\\n\\n personally take while leading this group, let's\\n\\n create a world where feminists don't have to choose feminism. I\\n\\n choose\\n\\n feminism because I think you're watching.\\n\\n As a person, I live to protect the woman I love.\\n\\n I think I'm about the level of a director who creates a hero.\\n\\n Well,\\n\\n one day,\\n\\n [Music] We were\\n\\n humiliated like\\n\\n this. I\\n\\n think there are so many messages in this very short video. First of all, I think there's\\n\\n enough room for it to be interpreted as\\n\\n sexual\\n\\n harassment.\\n\\n But I really\\n\\n had no intention of interfering with the event.\\n\\n It was a chance to share, but I\\n\\n think I couldn't give you a good answer after hearing that conversation.\\n\\n [Music] I was\\n\\n very surprised. I\\n\\n went with my faith,\\n\\n but\\n\\n I felt completely betrayed. 
It's\\n\\n just\\n\\n [Music] [\\n\\n Music] If\\n\\n you\\n\\n feel sexually shameful, that's\\n\\n sexual harassment. In Korea, that's the\\n\\n case. It's\\n\\n scary to lose everything and become something really different only from the genitals\\n\\n [Music]\\n\\n [Music]\\n\\n\""
299
- ]
300
- },
301
- "execution_count": 45,
302
- "metadata": {},
303
- "output_type": "execute_result"
304
- }
305
- ],
306
- "source": [
307
- "transcript"
308
- ]
309
- },
310
- {
311
- "cell_type": "code",
312
- "execution_count": 11,
313
- "metadata": {},
314
- "outputs": [],
315
- "source": [
316
- "tt = TextTilingTokenizer()\n",
317
  "\n",
318
- "# Tokenize the transcript into segments using the TextTilingTokenizer\n",
319
- "segments = tt.tokenize(transcript)"
320
- ]
321
- },
322
- {
323
- "cell_type": "code",
324
- "execution_count": 12,
325
- "metadata": {},
326
- "outputs": [],
327
- "source": [
328
- "# # Remove \\n\\n from each segment\n",
329
- "segments = [segment.replace('\\n\\n','').strip() for segment in segments]"
330
- ]
331
- },
332
- {
333
- "cell_type": "code",
334
- "execution_count": 13,
335
- "metadata": {},
336
- "outputs": [],
337
- "source": [
338
- "# Calculate a list of word count for each segment\n",
339
- "segments_wc = [len(segment.split()) for segment in segments]\n",
340
  "\n",
341
- "# Make it cumulative\n",
342
- "segments_wc = np.cumsum(segments_wc)"
343
- ]
344
- },
345
- {
346
- "cell_type": "code",
347
- "execution_count": 14,
348
- "metadata": {},
349
- "outputs": [],
350
- "source": [
351
  "def to_timestamp(seconds):\n",
352
  " seconds = int(seconds)\n",
353
  "\n",
@@ -358,175 +76,321 @@
358
  " if seconds >= 3600:\n",
359
  " return f\"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}\"\n",
360
  " else:\n",
361
- " return f\"{minutes:02d}:{seconds_remaining:02d}\""
362
- ]
363
- },
364
- {
365
- "cell_type": "code",
366
- "execution_count": 15,
367
- "metadata": {},
368
- "outputs": [
369
- {
370
- "data": {
371
- "text/plain": [
372
- "'01:40'"
373
- ]
374
- },
375
- "execution_count": 15,
376
- "metadata": {},
377
- "output_type": "execute_result"
378
- }
379
- ],
380
- "source": [
381
- "to_timestamp(100)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
  ]
383
  },
384
  {
385
  "cell_type": "code",
386
- "execution_count": 16,
387
  "metadata": {},
388
  "outputs": [],
389
  "source": [
390
- "# For each value in segments_wc, get the index of the closest value in df['total_words']\n",
391
- "# This will be the index of the row in df that is closest to the end of each segment\n",
392
- "idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]\n",
393
  "\n",
394
- "# Get segment end times from idx\n",
395
- "segment_end_times = df['end'].iloc[idx].values\n",
 
 
 
 
 
 
 
396
  "\n",
397
- "# Add 0.0 to the beginning of segment_end_times\n",
398
- "segment_end_times = np.insert(segment_end_times, 0, 0.0)\n",
399
  "\n",
400
- "# segment_times is a list of tuples containing the start and end times of each segment\n",
401
- "segment_times = [(to_timestamp(segment_end_times[i-1]), to_timestamp(segment_end_times[i])) for i in range(1,len(segment_end_times))]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  ]
403
  },
404
  {
405
  "cell_type": "code",
406
- "execution_count": 22,
407
  "metadata": {},
408
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
409
  "source": [
410
- "# At the beginning of each segment, add the title, author, and segment times\n",
411
- "segments_times = [f\"({to_timestamp(segment_end_times[i-1])}, {to_timestamp(segment_end_times[i])})\" for i in range(1,len(segment_end_times))]"
 
 
 
 
 
 
 
412
  ]
413
  },
414
  {
415
  "cell_type": "code",
416
- "execution_count": 23,
417
  "metadata": {},
418
  "outputs": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
  {
420
  "data": {
421
  "text/plain": [
422
- "['(00:00, 00:48)',\n",
423
- " '(00:48, 01:10)',\n",
424
- " '(01:10, 01:46)',\n",
425
- " '(01:46, 02:26)',\n",
426
- " '(02:26, 02:57)',\n",
427
- " '(02:57, 03:25)',\n",
428
- " '(03:25, 04:11)',\n",
429
- " '(04:11, 04:41)',\n",
430
- " '(04:41, 05:26)',\n",
431
- " '(05:26, 05:45)',\n",
432
- " '(05:45, 06:13)',\n",
433
- " '(06:13, 06:40)',\n",
434
- " '(06:40, 07:02)',\n",
435
- " '(07:02, 07:54)',\n",
436
- " '(07:54, 08:17)',\n",
437
- " '(08:17, 09:24)',\n",
438
- " '(09:24, 10:10)',\n",
439
- " '(10:10, 11:02)',\n",
440
- " '(11:02, 11:47)',\n",
441
- " '(11:47, 12:09)',\n",
442
- " '(12:09, 12:52)',\n",
443
- " '(12:52, 13:50)',\n",
444
- " '(13:50, 14:15)',\n",
445
- " '(14:15, 14:38)',\n",
446
- " '(14:38, 16:14)',\n",
447
- " '(16:14, 17:16)',\n",
448
- " '(17:16, 17:47)',\n",
449
- " '(17:47, 18:17)',\n",
450
- " '(18:17, 18:56)',\n",
451
- " '(18:56, 19:31)',\n",
452
- " '(19:31, 19:52)',\n",
453
- " '(19:52, 21:03)',\n",
454
- " '(21:03, 21:39)',\n",
455
- " '(21:39, 22:08)',\n",
456
- " '(22:08, 22:42)',\n",
457
- " '(22:42, 23:35)',\n",
458
- " '(23:35, 24:51)',\n",
459
- " '(24:51, 26:01)',\n",
460
- " '(26:01, 26:28)',\n",
461
- " '(26:28, 26:57)',\n",
462
- " '(26:57, 28:37)',\n",
463
- " '(28:37, 29:00)',\n",
464
- " '(29:00, 29:50)',\n",
465
- " '(29:50, 30:12)',\n",
466
- " '(30:12, 30:55)',\n",
467
- " '(30:55, 31:47)',\n",
468
- " '(31:47, 32:54)',\n",
469
- " '(32:54, 33:33)',\n",
470
- " '(33:33, 33:50)',\n",
471
- " '(33:50, 34:20)',\n",
472
- " '(34:20, 34:48)',\n",
473
- " '(34:48, 35:22)',\n",
474
- " '(35:22, 36:14)',\n",
475
- " '(36:14, 37:15)']"
476
  ]
477
  },
478
- "execution_count": 23,
479
  "metadata": {},
480
  "output_type": "execute_result"
481
  }
482
  ],
483
  "source": [
484
- "segments_times"
485
- ]
486
- },
487
- {
488
- "cell_type": "code",
489
- "execution_count": 72,
490
- "metadata": {},
491
- "outputs": [],
492
- "source": [
493
- "text = '''\n",
494
- "Segment from 'Feminism Is 'Dividing This'' Country' by VICE News\n",
495
- "Timestamp: (10:51, 12:24)\n",
496
- "---\n",
497
- "personally take while leading this group, let's create a world where feminists don't have to choose feminism. I choose feminism because I think you're watching. As a person, I live to protect the woman I love. I think I'm about the level of a director who creates a hero. Well, one day, [Music] We were humiliated like this. I think there are so many messages in this very short video. First of all, I think there's\n",
498
- "---\n",
499
- "'''\n",
500
- "\n",
501
- "# Get the title and timestamp from the text\n",
502
- "import re\n",
503
- "\n",
504
- "# define regular expression patterns\n",
505
- "title_pattern = r\"Segment from '(.+)'\"\n",
506
- "timestamp_pattern = r\"Timestamp: \\((.+)\\)\"\n",
507
- "\n",
508
- "# search for title, source, and timestamp using regular expressions\n",
509
- "title = re.search(title_pattern, text).group(1)\n",
510
- "start_timestamp = re.search(timestamp_pattern, text).group(1).split(\",\")[0]\n",
511
- "\n",
512
- "url = f\"URL: https://www.youtube.com/watch?v={video_id}&t={start_timestamp}\"\n",
513
- "\n",
514
- "# Add url in text before first \"---\"\n",
515
- "text = re.sub(r\"---\", f\"{url}\\n---\", text, count=1)\n"
516
- ]
517
- },
518
- {
519
- "cell_type": "code",
520
- "execution_count": 3,
521
- "metadata": {},
522
- "outputs": [],
523
- "source": [
524
- "from youtubesearchpython import VideosSearch\n",
525
- "\n",
526
- "videosSearch = VideosSearch('NoCopyrightSounds', limit = 2)\n",
527
- "\n",
528
- "# Get URLs\n",
529
- "urls = [video[\"id\"] for video in videosSearch.result()[\"result\"]]"
530
  ]
531
  }
532
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 17,
6
  "metadata": {},
7
+ "outputs": [],
 
 
 
 
 
 
 
 
8
  "source": [
9
  "from youtube_transcript_api import YouTubeTranscriptApi\n",
10
+ "from nltk.tokenize import TextTilingTokenizer \n",
11
+ "from youtubesearchpython import VideosSearch\n",
12
+ "from semantic_search import SemanticSearch \n",
13
  "import pandas as pd\n",
14
+ "import gradio as gr\n",
15
  "import numpy as np\n",
16
  "import requests\n",
17
+ "import tiktoken\n",
18
+ "import openai\n",
19
  "import json\n",
20
+ "import nltk\n",
21
+ "import re\n",
22
+ "import os"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  ]
24
  },
25
  {
26
  "cell_type": "code",
27
+ "execution_count": 18,
28
  "metadata": {},
29
  "outputs": [],
30
  "source": [
31
+ "def set_openai_key(key):\n",
32
+ " if key == \"env\":\n",
33
+ " key = os.environ.get(\"OPENAI_API_KEY\")\n",
34
+ " openai.api_key = key\n",
35
  "\n",
36
+ "def get_youtube_data(url):\n",
 
37
  "\n",
38
+ " video_id = url.split(\"=\")[1]\n",
 
39
  "\n",
40
+ " try:\n",
41
+ " raw = YouTubeTranscriptApi.get_transcript(video_id)\n",
42
+ " except:\n",
43
+ " try:\n",
44
+ " transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)\n",
45
+ " for transcript in transcript_list:\n",
46
+ " raw = transcript.translate('en').fetch()\n",
47
+ " break\n",
48
+ " except:\n",
49
+ " print(f\"No transcript found for {url}\") # Usually because the video itself disabled captions\n",
50
+ " return False\n",
51
  "\n",
52
+ " response = requests.get(f\"https://noembed.com/embed?dataType=json&url={url}\")\n",
53
+ " data = json.loads(response.content)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  "\n",
55
+ " title, author = data[\"title\"], data[\"author_name\"]\n",
56
+ "\n",
57
+ " # Remove any \"'\" from title\n",
58
+ " title = title.replace(\"'\", \"\")\n",
59
+ " author = author.replace(\"'\", \"\")\n",
60
+ "\n",
61
+ " df = pd.DataFrame(raw)\n",
62
+ "\n",
63
+ " df['end'] = df['start'] + df['duration']\n",
64
+ " df['total_words'] = df['text'].apply(lambda x: len(x.split())).cumsum()\n",
65
+ " df[\"text\"] = df[\"text\"] + \"\\n\\n\"\n",
66
+ "\n",
67
+ " return df, title, author\n",
 
 
 
 
 
 
 
 
 
68
  "\n",
 
 
 
 
 
 
 
 
 
 
69
  "def to_timestamp(seconds):\n",
70
  " seconds = int(seconds)\n",
71
  "\n",
 
76
  " if seconds >= 3600:\n",
77
  " return f\"{hours:02d}:{minutes:02d}:{seconds_remaining:02d}\"\n",
78
  " else:\n",
79
+ " return f\"{minutes:02d}:{seconds_remaining:02d}\"\n",
80
+ "\n",
81
+ "def to_seconds(timestamp):\n",
82
+ " time_list = timestamp.split(':')\n",
83
+ " total_seconds = 0\n",
84
+ " if len(time_list) == 2: # Minutes:Seconds format\n",
85
+ " total_seconds = int(time_list[0]) * 60 + int(time_list[1])\n",
86
+ " elif len(time_list) == 3: # Hours:Minutes:Seconds format\n",
87
+ " total_seconds = int(time_list[0]) * 3600 + int(time_list[1]) * 60 + int(time_list[2])\n",
88
+ " else:\n",
89
+ " raise ValueError(\"Invalid timestamp format\")\n",
90
+ " return total_seconds\n",
91
+ "\n",
92
+ "def get_segments(df, title, author, split_by_topic, segment_length = 200):\n",
93
+ "\n",
94
+ " transcript = df['text'].str.cat(sep=' ')\n",
95
+ "\n",
96
+ " if not split_by_topic:\n",
97
+ " words = transcript.split()\n",
98
+ " segments = [' '.join(words[i:i+segment_length]) for i in range(0, len(words), segment_length)]\n",
99
+ " else:\n",
100
+ " try:\n",
101
+ " segments = tt.tokenize(transcript)\n",
102
+ " except:\n",
103
+ " return \"\"\n",
104
+ "\n",
105
+ " segments = [segment.replace('\\n','').strip() for segment in segments]\n",
106
+ "\n",
107
+ " segments_wc = [len(segment.split()) for segment in segments]\n",
108
+ " segments_wc = np.cumsum(segments_wc)\n",
109
+ "\n",
110
+ " idx = [np.argmin(np.abs(df['total_words'] - total_words)) for total_words in segments_wc]\n",
111
+ "\n",
112
+ " segments_end_times = df['end'].iloc[idx].values\n",
113
+ " segments_end_times = np.insert(segments_end_times, 0, 0.0)\n",
114
+ "\n",
115
+ " segments_times = [f\"({to_timestamp(segments_end_times[i-1])}, {to_timestamp(segments_end_times[i])})\" for i in range(1,len(segments_end_times))]\n",
116
+ "\n",
117
+ " segments_text = [f\"Segment from '{title}' by {author}\\nTimestamp: {segment_time}\\n\\n{segment}\\n\" for segment, segment_time in zip(segments, segments_times)]\n",
118
+ "\n",
119
+ " return segments_text\n",
120
+ "\n",
121
+ "def fit_searcher(segments, n_neighbours):\n",
122
+ " global searcher\n",
123
+ " searcher.fit(segments, n_neighbors=n_neighbours)\n",
124
+ " return True\n",
125
+ "\n",
126
+ "def num_tokens(text, model):\n",
127
+ " encoding = tiktoken.encoding_for_model(model)\n",
128
+ " return len(encoding.encode(text))\n",
129
+ "\n",
130
+ "def refencify(text):\n",
131
+ " title_pattern = r\"Segment from '(.+)'\"\n",
132
+ " timestamp_pattern = r\"Timestamp: \\((.+)\\)\"\n",
133
+ "\n",
134
+ " print(text)\n",
135
+ "\n",
136
+ " title = re.search(title_pattern, text).group(1)\n",
137
+ " timestamp = re.search(timestamp_pattern, text).group(1).split(\",\")\n",
138
+ " start_timestamp, end_timestamp = timestamp\n",
139
+ "\n",
140
+ " url = titles_to_urls[title]\n",
141
+ " start_seconds = to_seconds(start_timestamp)\n",
142
+ " end_seconds = to_seconds(end_timestamp)\n",
143
+ "\n",
144
+ " video_iframe = f'''<iframe\n",
145
+ " width=\"400\"\n",
146
+ " height=\"240\"\n",
147
+ " src=\"{url.replace(\"watch?v=\", \"embed/\")}?start={start_seconds}&end={end_seconds}&controls=0\"\n",
148
+ " frameborder=\"0\"\n",
149
+ " allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\n",
150
+ " allowfullscreen\n",
151
+ " >\n",
152
+ " </iframe>'''\n",
153
+ "\n",
154
+ " return start_timestamp, end_timestamp, f\"{video_iframe}\\n\\n\"\n",
155
+ "\n",
156
+ "def form_query(question, model, token_budget):\n",
157
+ "\n",
158
+ " results = searcher(question)\n",
159
+ "\n",
160
+ " introduction = 'Use the below segments from multiple youtube videos to answer the subsequent question. If the answer cannot be found in the articles, write \"I could not find an answer.\" Cite each sentence using the [title, author, timestamp] notation. Every sentence MUST have a citation!'\n",
161
+ "\n",
162
+ " message = introduction\n",
163
+ "\n",
164
+ " question = f\"\\n\\nQuestion: {question}\"\n",
165
+ "\n",
166
+ " references = \"\"\n",
167
+ "\n",
168
+ " for i, result in enumerate(results):\n",
169
+ " result = result + \"\\n\\n\"\n",
170
+ " if (\n",
171
+ " num_tokens(message + result + question, model=model)\n",
172
+ " > token_budget\n",
173
+ " ):\n",
174
+ " break\n",
175
+ " else:\n",
176
+ " message += result\n",
177
+ " start_timestamp, end_timestamp, iframe = refencify(result)\n",
178
+ " references += f\"### Segment {i+1} ({start_timestamp} - {end_timestamp}):\\n\" + iframe\n",
179
+ "\n",
180
+ " # Remove the last extra two newlines\n",
181
+ " message = message[:-2]\n",
182
+ "\n",
183
+ " references = \"Segments that might have been used to answer your question: (If you specified more segments than shown here, consider increasing your token budget)\\n\\n\" + references\n",
184
+ "\n",
185
+ " return message + question, references\n",
186
+ "\n",
187
+ "def generate_answer(question, model, token_budget, temperature):\n",
188
+ " \n",
189
+ " message, references = form_query(question, model, token_budget)\n",
190
+ "\n",
191
+ " messages = [\n",
192
+ " {\"role\": \"system\", \"content\": \"You answer questions about YouTube videos.\"},\n",
193
+ " {\"role\": \"user\", \"content\": message},\n",
194
+ " ]\n",
195
+ "\n",
196
+ " response = openai.ChatCompletion.create(\n",
197
+ " model=model,\n",
198
+ " messages=messages,\n",
199
+ " temperature=temperature\n",
200
+ " )\n",
201
+ " \n",
202
+ " response_message = response[\"choices\"][0][\"message\"][\"content\"]\n",
203
+ "\n",
204
+ " return response_message, references\n",
205
+ "\n",
206
+ "def add_to_dict(title, url):\n",
207
+ " global title_counter\n",
208
+ "\n",
209
+ " if title not in titles_to_urls:\n",
210
+ " # This is the first occurrence of this title\n",
211
+ " titles_to_urls[title] = url\n",
212
+ " return title\n",
213
+ " else:\n",
214
+ " # This title has already been seen, so we need to add a number suffix to it\n",
215
+ " # First, check if we've already seen this title before\n",
216
+ " if title in title_counter:\n",
217
+ " # If we have, increment the counter\n",
218
+ " title_counter[title] += 1\n",
219
+ " else:\n",
220
+ " # If we haven't, start the counter at 1\n",
221
+ " title_counter[title] = 1\n",
222
+ " \n",
223
+ " # Add the suffix to the title\n",
224
+ " new_title = f\"{title} ({title_counter[title]})\"\n",
225
+ " \n",
226
+ " # Add the new title to the dictionary\n",
227
+ " titles_to_urls[new_title] = url\n",
228
+ " return new_title\n",
229
+ "\n",
230
+ "def search_youtube(question, n_videos):\n",
231
+ " videosSearch = VideosSearch(question, limit = n_videos)\n",
232
+ " urls = [\"https://www.youtube.com/watch?v=\" + video[\"id\"] for video in videosSearch.result()[\"result\"]]\n",
233
+ " print(urls)\n",
234
+ " return urls"
235
  ]
236
  },
237
  {
238
  "cell_type": "code",
239
+ "execution_count": 19,
240
  "metadata": {},
241
  "outputs": [],
242
  "source": [
243
+ "def main(openAI_key, question, n_videos, urls_text, split_by_topic, segment_length, n_neighbours, model, token_budget, temperature):\n",
 
 
244
  "\n",
245
+ " print(question)\n",
246
+ " print(urls_text)\n",
247
+ "\n",
248
+ " set_openai_key(openAI_key)\n",
249
+ "\n",
250
+ " if urls_text == \"\":\n",
251
+ " urls = search_youtube(question, n_videos)\n",
252
+ " else:\n",
253
+ " urls = list(set(urls_text.split(\"\\n\")))\n",
254
  "\n",
255
+ " global titles_to_urls\n",
256
+ " titles_to_urls = {}\n",
257
  "\n",
258
+ " segments = []\n",
259
+ "\n",
260
+ " for url in urls:\n",
261
+ "\n",
262
+ " if \"youtu.be\" in url:\n",
263
+ " url = url.replace(\"youtu.be/\", \"youtube.com/watch?v=\")\n",
264
+ "\n",
265
+ " res = get_youtube_data(url)\n",
266
+ "\n",
267
+ " if not res:\n",
268
+ " continue\n",
269
+ "\n",
270
+ " df, title, author = res\n",
271
+ " \n",
272
+ " title = add_to_dict(title, url)\n",
273
+ "\n",
274
+ " video_segments = get_segments(df, title, author, split_by_topic, segment_length)\n",
275
+ "\n",
276
+ " segments.extend(video_segments)\n",
277
+ " \n",
278
+ " if segments == []:\n",
279
+ " return \"Something wrong happened! Try specifying the YouTube videos or changing the query.\", \"\"\n",
280
+ "\n",
281
+ " print(\"Segments generated successfully!\")\n",
282
+ "\n",
283
+ " if fit_searcher(segments, n_neighbours):\n",
284
+ " print(\"Searcher fit successfully!\")\n",
285
+ " answer, references = generate_answer(question, model, token_budget, temperature)\n",
286
+ "\n",
287
+ " print(answer)\n",
288
+ "\n",
289
+ " return answer, references"
290
  ]
291
  },
292
  {
293
  "cell_type": "code",
294
+ "execution_count": 20,
295
  "metadata": {},
296
+ "outputs": [
297
+ {
298
+ "name": "stderr",
299
+ "output_type": "stream",
300
+ "text": [
301
+ "[nltk_data] Downloading package stopwords to\n",
302
+ "[nltk_data] C:\\Users\\andrew\\AppData\\Roaming\\nltk_data...\n",
303
+ "[nltk_data] Package stopwords is already up-to-date!\n"
304
+ ]
305
+ }
306
+ ],
307
  "source": [
308
+ "nltk.download('stopwords')\n",
309
+ "tt = TextTilingTokenizer()\n",
310
+ "searcher = SemanticSearch()\n",
311
+ "\n",
312
+ "# Initialize a counter for duplicate titles\n",
313
+ "title_counter = {}\n",
314
+ "\n",
315
+ "# One to one mapping from titles to urls\n",
316
+ "titles_to_urls = {}"
317
  ]
318
  },
319
  {
320
  "cell_type": "code",
321
+ "execution_count": 21,
322
  "metadata": {},
323
  "outputs": [
324
+ {
325
+ "name": "stdout",
326
+ "output_type": "stream",
327
+ "text": [
328
+ "How much money do youtubers make?\n",
329
+ "\n",
330
+ "['https://www.youtube.com/watch?v=G4qOiwIE_o0', 'https://www.youtube.com/watch?v=ABG-1iy07NM', 'https://www.youtube.com/watch?v=-cz4in1WY-o', 'https://www.youtube.com/watch?v=KA5FmevAzTc', 'https://www.youtube.com/watch?v=JZ7GHzkBvzs']\n",
331
+ "Segments generated successfully!\n",
332
+ "Fitting with n=5...\n",
333
+ "Searcher fit successfully!\n",
334
+ "Segment from 'I Asked YouTube Millionaires How Much They Make' by Jordan Welch\n",
335
+ "Timestamp: (00:00, 01:04)\n",
336
+ "\n",
337
+ "what's the most you've generated in a month from your YouTube channel oh man oh the best month uh I'll just say our biggest one in earnings I've always wondered how much the biggest YouTubers make so today I'm here at the most exclusive YouTube conference in the world to ask them myself we're here with my brother phidius how long did it take you to make your first dollar from YouTube Oh two years what is the most amount of money that you spent on a single video oh I spent 50 000 on a video and I made back like 25. I'm very bad in business so how much money I make I make around seventy thousand dollars per month per month around 15K that we get and 15 chaos and sold thirty thousand dollars per month congratulations bro I wish you the best when you say it I have to say that this guy that's the best AdSense in the world so he makes more money than most of the people that he asks us questions let me share how much I make in the video yeah yeah tell us maybe and then dude I love you\n",
338
+ "\n",
339
+ "\n",
340
+ "\n",
341
+ "Segment from 'HOW MUCH MONEY DO YOUTUBERS MAKE?' by MrBeast\n",
342
+ "Timestamp: (02:48, 03:22)\n",
343
+ "\n",
344
+ "if you see a video with 1 million views they probably made around two thousand dollars if it had you know sponsorship or a brand deal or something they probably made a couple hundred to a couple thousand depending on their size and yeah that's the video make sure you like make sure you subscribe this is an iPad I just dropped the iPad why is the outro not playing Oh Oh\n",
345
+ "\n",
346
+ "\n",
347
+ "\n",
348
+ "Segment from 'How Much YouTube Pays You For 1,000 Views In 2023' by Make Money Matt\n",
349
+ "Timestamp: (00:00, 00:55)\n",
350
+ "\n",
351
+ "YouTube shows ads on the videos that you upload at the beginning sometimes in the middle and at the end of your videos once you get 4 000 hours of watch time and a thousand subscribers then you can join What's called the YouTube Partner program then YouTube will pay you a cut of the revenue and that's what we're going to be talking about in this video how much you actually make now I personally run a ton of different YouTube channels and I did some research into both my own channels and other people who've uploaded videos talking about how much money they make per thousand views and the answer is that channels make between one dollar and thirty dollars per thousand views so the average Channel based on my research makes 15.50 per thousand views but with that being said most channels from my personal experience only get around five to ten dollars per thousand views that you make now in order to make more money per thousand views you can do a couple things first of all you can create a channel in what's called a high RPM Niche now RPM is how much you actually\n",
352
+ "\n",
353
+ "\n",
354
+ "\n",
355
+ "Segment from 'HOW MUCH MONEY DO YOUTUBERS MAKE?' by MrBeast\n",
356
+ "Timestamp: (00:00, 00:57)\n",
357
+ "\n",
358
+ "what's up guys today we're gonna be talking about what's up guys today we're gonna be talking about how much money youtubers make I see comments about YouTube money all the time and most people have no idea what they're talking about so the question I see asked the most is how much do youtubers make per view well instead of just making up a random number we're actually going to take a look at my analytics I'm going to pick one day we're gonna see how many views I got and how much money I made off those views that way you know the numbers I'm telling you are actually real so I picked the day from last month and on that day I got 240,000 views and made around six hundred and sixty-nine dollars basically if you do the math that means every thousand views I got around two dollars and seventy cents so I want you to remember that number that every thousand views is two dollars and seventy cents because that's not just a number I made up that straight out of my analytics and we're going to use that to see how much these big\n",
359
+ "\n",
360
+ "\n",
361
+ "\n",
362
+ "Segment from 'HOW MUCH MONEY DO YOUTUBERS MAKE?' by MrBeast\n",
363
+ "Timestamp: (00:57, 02:02)\n",
364
+ "\n",
365
+ "youtubers are making so if every youtuber made what I made per thousand views that means in the last 30 days nine youtubers have made over a million dollars in just the last 30 days PewDiePie has 13 billion billion with the BEA views and if he made two dollars and seventy cents per thousand views that means over the lifespan of his channel he's made 35 million dollars even better a channel called Ryan toy review has over 6 billion views which means he's potentially made over 16 million dollars off of reviewing toys what's up guys beast toy reviews here and today we're reviewing this toy Karambit as you'll see it's uh can I just get the 16 million dollars and guys it keeps getting better most big youtubers make an additional six figures offer brand deals and sponsorships and to prove that that's true here's the deal that I was offered an app which I'm not going to name offered me $400 to do a video on it and then an additional 50 cents for every one of you that downloaded the app basically if I had done a video on that app I would have got\n",
366
+ "\n",
367
+ "\n",
368
+ "\n",
369
+ "Youtubers can make between one dollar and thirty dollars per thousand views, with the average channel making around 15.50 dollars per thousand views. However, most channels only get around five to ten dollars per thousand views. Some big youtubers make over a million dollars in just 30 days, and PewDiePie has made 35 million dollars over the lifespan of his channel. Additionally, most big youtubers make an additional six figures from brand deals and sponsorships. [Make Money Matt, 00:00-00:55; MrBeast, 00:57-02:02]\n"
370
+ ]
371
+ },
372
  {
373
  "data": {
374
  "text/plain": [
375
+ "('Youtubers can make between one dollar and thirty dollars per thousand views, with the average channel making around 15.50 dollars per thousand views. However, most channels only get around five to ten dollars per thousand views. Some big youtubers make over a million dollars in just 30 days, and PewDiePie has made 35 million dollars over the lifespan of his channel. Additionally, most big youtubers make an additional six figures from brand deals and sponsorships. [Make Money Matt, 00:00-00:55; MrBeast, 00:57-02:02]',\n",
376
+ " 'Segments that might have been used to answer your question: (If you specified more segments than shown here, consider increasing your token budget)\\n\\n### Segment 1 (00:00 - 01:04):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://www.youtube.com/embed/JZ7GHzkBvzs?start=0&end=64&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 2 (02:48 - 03:22):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://www.youtube.com/embed/G4qOiwIE_o0?start=168&end=202&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 3 (00:00 - 00:55):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://www.youtube.com/embed/KA5FmevAzTc?start=0&end=55&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 4 (00:00 - 00:57):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://www.youtube.com/embed/G4qOiwIE_o0?start=0&end=57&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n### Segment 5 (00:57 - 02:02):\\n<iframe\\n width=\"400\"\\n height=\"240\"\\n src=\"https://www.youtube.com/embed/G4qOiwIE_o0?start=57&end=122&controls=0\"\\n frameborder=\"0\"\\n allow=\"accelerometer; autoplay; modestbranding; encrypted-media; gyroscope; picture-in-picture\"\\n allowfullscreen\\n >\\n </iframe>\\n\\n')"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  ]
378
  },
379
+ "execution_count": 21,
380
  "metadata": {},
381
  "output_type": "execute_result"
382
  }
383
  ],
384
  "source": [
385
+ "main(openAI_key=\"env\", \n",
386
+ " question=\"How much money do youtubers make?\", n_videos=5, \n",
387
+ " urls_text=\"\", \n",
388
+ " split_by_topic=False, \n",
389
+ " segment_length=200, \n",
390
+ " n_neighbours=5, \n",
391
+ " model=\"gpt-3.5-turbo\", \n",
392
+ " token_budget=2000, \n",
393
+ " temperature=0)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  ]
395
  }
396
  ],