Update app.py
app.py
CHANGED
@@ -1,11 +1,317 @@
import os
import re
import tempfile
import time
import traceback
from datetime import datetime, timedelta, timezone

import gradio as gr
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

# --- Constants ---
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
API_KEY = "YOUR_API_KEY_HERE"  # Replace with your actual YouTube Data API key
DEFAULT_KEYWORDS = "3d, blender, maya, 3ds max, cinema 4d, houdini, zbrush, unreal engine, unity, substance painter, substance designer, v-ray, arnold, rendering, texturing, rigging, vfx, cgi, autodesk, fusion 360"
DEFAULT_DAYS = 180  # Default to 6 months
DEFAULT_MAX_VIDEOS = 100  # Default to 100 videos
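
# NOTE: Hard-coding the key risks committing it to the repo. A common
# alternative (assuming an environment variable named YOUTUBE_API_KEY, a name
# chosen here purely for illustration) would be:
#   API_KEY = os.environ.get("YOUTUBE_API_KEY", "YOUR_API_KEY_HERE")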

# --- YouTube API Helper Functions ---

def get_youtube_service():
    """Initializes and returns the YouTube API service."""
    try:
        return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY, cache_discovery=False)
    except HttpError as e:
        raise ConnectionError(f"Could not connect to YouTube API: {e}")
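
# cache_discovery=False suppresses the client library's discovery-cache
# warning ("file_cache is only supported with oauth2client<4.0.0") in
# environments without the legacy cache support.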

def get_channel_id(service, handle):
    """Gets the channel ID from a handle or ID."""
    if not handle:
        raise ValueError("Channel handle or ID is required.")
    if handle.startswith("UC") and len(handle) == 24:
        return handle
    handle = handle if handle.startswith('@') else f"@{handle}"
    try:
        search_response = service.search().list(q=handle, part="id", type="channel", maxResults=1).execute()
        if not search_response.get("items"):
            raise ValueError(f"Channel '{handle}' not found.")
        return search_response["items"][0]["id"]["channelId"]
    except HttpError as e:
        raise ConnectionError(f"API error finding channel ID: {e.content}")
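
# Quota note (a sketch of an alternative, not what the code above does):
# search().list costs roughly 100 quota units per call, while channels().list
# supports a direct handle lookup for about 1 unit:
#   response = service.channels().list(part="id", forHandle=handle).execute()
#   channel_id = response["items"][0]["id"]
# The search-based lookup above also tolerates near-matches, which may be why
# it is used here.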

def get_uploads_playlist_id(service, channel_id):
    """Gets the uploads playlist ID."""
    try:
        response = service.channels().list(id=channel_id, part="contentDetails").execute()
        if not response.get("items"):
            raise ValueError(f"No channel details for ID '{channel_id}'.")
        return response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
    except HttpError as e:
        raise ConnectionError(f"API error getting uploads playlist: {e.content}")
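
# In practice the uploads playlist ID is the channel ID with its "UC" prefix
# swapped for "UU" (UCxxxx... -> UUxxxx...), but reading it from
# contentDetails as above avoids relying on that convention.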

def get_all_video_ids(service, playlist_id, keywords_set, days_filter, max_videos):
    """Fetches video IDs with pre-filtering by keywords, date, and max limit."""
    video_ids = []
    next_page_token = None
    # Use UTC so the cutoff matches the API's UTC "publishedAt" timestamps.
    cutoff_date = (datetime.now(timezone.utc) - timedelta(days=days_filter)).strftime("%Y-%m-%dT%H:%M:%SZ")

    while True:
        try:
            response = service.playlistItems().list(
                playlistId=playlist_id,
                part="snippet,contentDetails",
                maxResults=50,
                pageToken=next_page_token
            ).execute()
            for item in response.get("items", []):
                video_id = item["contentDetails"]["videoId"]
                snippet = item["snippet"]
                title = snippet["title"].lower()
                description = snippet.get("description", "").lower()
                published_at = snippet["publishedAt"]

                # Date filter
                if published_at < cutoff_date:
                    continue

                # Keyword pre-filter (title or description)
                if any(keyword in title or keyword in description for keyword in keywords_set):
                    video_ids.append(video_id)

                if len(video_ids) >= max_videos:
                    return video_ids[:max_videos]

            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break
        except HttpError as e:
            print(f"API Error fetching video IDs: {e.content}")
            break
    return video_ids[:max_videos]
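
# Uploads playlists are returned newest-first, so once an item falls before the
# cutoff, later items and pages will normally be older too; replacing the
# `continue` with an early `break` would save quota, at the cost of assuming
# strict ordering.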

def process_video(service, video_id, keywords_set):
    """Processes a video for keyword mentions and links."""
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    result = {
        "video_id": video_id,
        "video_url": video_url,
        "title": f"Video ID: {video_id}",
        "transcript_mentions": set(),
        "description_mentions": set(),
        "description_links": []
    }

    try:
        video_response = service.videos().list(id=video_id, part="snippet").execute()
        if video_response.get("items"):
            snippet = video_response["items"][0]["snippet"]
            result["title"] = snippet.get("title", f"Video ID: {video_id}")
            description = snippet.get("description", "").lower()
            for keyword in keywords_set:
                if keyword in description:
                    result["description_mentions"].add(keyword)
            result["description_links"] = re.findall(r'https?://\S+', snippet.get("description", ""))
    except HttpError as e:
        print(f"API error getting details for {video_id}: {e.resp.status}")

    if not result["description_mentions"]:
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            transcript = transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
            if transcript:
                full_transcript = transcript.fetch()
                transcript_text = " ".join(segment['text'] for segment in full_transcript).lower()
                for keyword in keywords_set:
                    if keyword in transcript_text:
                        result["transcript_mentions"].add(keyword)
        except (TranscriptsDisabled, NoTranscriptFound):
            print(f"No transcript available for {video_id}")
        except Exception as e:
            print(f"Error fetching transcript for {video_id}: {type(e).__name__}")

    if result["transcript_mentions"] or result["description_mentions"]:
        return result
    return None
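
# Dependency note: list_transcripts()/find_transcript() match the
# youtube-transcript-api interface prior to its 1.0 release; newer versions
# moved to an instance-based API, so pinning the package (or porting these
# calls) may be necessary.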

# --- Main Function ---

def scan_channel_videos(channel_handle, keywords_str, days_filter, max_videos, progress=gr.Progress(track_tqdm=True)):
    """Scans a YouTube channel for keyword mentions and links with user-defined filters."""
    start_time = time.time()
    status_log = []
    results = []

    def log_status(message):
        print(message)
        status_log.append(message)
        yield "\n".join(status_log), gr.Markdown("### Processing..."), None
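
    # log_status is itself a generator: each `yield from log_status(...)` below
    # forwards a (status_text, results_markdown, download_file) triple to
    # Gradio, which streams it into the three output components as the scan
    # progresses.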

    try:
        yield from log_status("1. Initializing YouTube Service...")
        service = get_youtube_service()

        yield from log_status(f"2. Finding Channel ID for '{channel_handle}'...")
        channel_id = get_channel_id(service, channel_handle)
        yield from log_status(f"   Found Channel ID: {channel_id}")

        yield from log_status("3. Fetching Uploads Playlist ID...")
        playlist_id = get_uploads_playlist_id(service, channel_id)
        yield from log_status(f"   Found Playlist ID: {playlist_id}")

        keywords_list = [k.strip().lower() for k in keywords_str.split(',') if k.strip()]
        if not keywords_list:
            raise ValueError("At least one keyword is required.")
        keywords_set = set(keywords_list)

        # Validate user inputs
        days_filter = int(days_filter) if days_filter else DEFAULT_DAYS
        max_videos = int(max_videos) if max_videos else DEFAULT_MAX_VIDEOS
        if days_filter < 1:
            raise ValueError("Days filter must be at least 1.")
        if max_videos < 1:
            raise ValueError("Max videos must be at least 1.")
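
        # gr.Number may deliver None when the field is cleared, so the
        # int(...) coercion above guarantees integers and falls back to the
        # defaults on empty input.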

        yield from log_status(f"4. Fetching Video IDs with filters (last {days_filter} days, max {max_videos} videos)...")
        video_ids = get_all_video_ids(service, playlist_id, keywords_set, days_filter, max_videos)
        if not video_ids:
            yield from log_status("   No videos found matching filters.")
            yield "\n".join(status_log), gr.Markdown("### Error\nNo videos found matching filters."), None
            return
        yield from log_status(f"   Found {len(video_ids)} videos after filtering.")

        yield from log_status(f"5. Scanning {len(video_ids)} videos for keywords: {', '.join(keywords_list)}...")
        for video_id in progress.tqdm(video_ids, desc="Scanning Videos"):
            result = process_video(service, video_id, keywords_set)
            if result:
                results.append(result)
                yield from log_status(f"   Found mentions in: {result['title']} - {result['video_url']} ({video_id})")
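
        # Quota note (an optimization sketch, not what the loop above does):
        # videos().list accepts up to 50 comma-separated IDs per call, so the
        # per-video metadata lookups inside process_video could be batched:
        #   service.videos().list(id=",".join(batch_of_50), part="snippet").execute()
        # The one-at-a-time loop is kept for simpler per-video progress updates.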

        yield from log_status("\n6. Formatting Results...")
        final_md = f"""
## Scan Results for {channel_handle}

**Searched Keywords**: {', '.join(keywords_list)}
**Videos Found**: {len(results)} out of {len(video_ids)} scanned (filtered from channel total)
**Scan Duration**: {time.time() - start_time:.2f} seconds
**Filters Applied**: Last {days_filter} days, max {max_videos} videos

---

"""
        final_text = f"Scan Results for {channel_handle}\n\n"
        final_text += f"Searched Keywords: {', '.join(keywords_list)}\n"
        final_text += f"Videos Found: {len(results)} out of {len(video_ids)} scanned (filtered from channel total)\n"
        final_text += f"Scan Duration: {time.time() - start_time:.2f} seconds\n"
        final_text += f"Filters Applied: Last {days_filter} days, max {max_videos} videos\n\n"

        if not results:
            final_md += "\n**No mentions found for the specified keywords.**"
            final_text += "No mentions found for the specified keywords.\n"
        else:
            for res in sorted(results, key=lambda x: x['title']):
                final_md += f"""
### {res['title']}

- **Video URL**: [{res['video_url']}]({res['video_url']})
"""
                final_text += f"Video: {res['title']}\n"
                final_text += f"Video URL: {res['video_url']}\n"

                if res['transcript_mentions']:
                    mentions = ', '.join(sorted(res['transcript_mentions']))
                    final_md += f"- **Transcript Mentions**: {mentions}\n"
                    final_text += f"Transcript Mentions: {mentions}\n"
                if res['description_mentions']:
                    mentions = ', '.join(sorted(res['description_mentions']))
                    final_md += f"- **Description Mentions**: {mentions}\n"
                    final_text += f"Description Mentions: {mentions}\n"
                if res['description_links']:
                    final_md += "- **Links in Description**:\n"
                    final_text += "Links in Description:\n"
                    for link in res['description_links']:
                        final_md += f"  - [{link}]({link})\n"
                        final_text += f"  - {link}\n"
                final_md += "\n---\n"
                final_text += "\n---\n"
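
        # final_md and final_text are built in lockstep so the on-screen
        # Markdown report and the downloadable plain-text report stay in sync.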

        # tempfile.NamedTemporaryFile cannot choose the visible filename, and
        # gr.File has no `filename` argument, so write the report into a fresh
        # temp directory under the intended name instead.
        temp_dir = tempfile.mkdtemp()
        temp_file_path = os.path.join(temp_dir, "youtube_scan_results.txt")
        with open(temp_file_path, "w") as temp_file:
            temp_file.write(final_text)

        yield "\n".join(status_log), gr.Markdown(final_md), gr.File(value=temp_file_path, label="Download Results", visible=True)

    except ValueError as ve:
        yield from log_status(f"Error: {ve}")
        yield "\n".join(status_log), gr.Markdown(f"### Error\n**Input Error:** {ve}"), None
    except ConnectionError as ce:
        yield from log_status(f"Error: {ce}")
        yield "\n".join(status_log), gr.Markdown(f"### Error\n**API Connection Error:** {ce}"), None
    except Exception as e:
        traceback.print_exc()
        yield from log_status(f"Error: {e}")
        yield "\n".join(status_log), gr.Markdown(f"### Error\n**Unexpected Error:** {e}"), None

# --- Gradio Interface ---

with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# YouTube Keyword Scanner")
    gr.Markdown("Search for keywords in YouTube video transcripts and descriptions, with customizable filters and downloadable results.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Settings")
            channel_input = gr.Textbox(
                label="Channel Handle or ID",
                placeholder="e.g., @theAIsearch or UCxxxxxxxxxxxxxx",
                value="@theAIsearch"
            )
            keywords_input = gr.Textbox(
                label="Keywords (comma-separated)",
                placeholder="e.g., 3d, blender, maya",
                value=DEFAULT_KEYWORDS
            )
            days_filter_input = gr.Number(
                label="Days to Look Back",
                value=DEFAULT_DAYS,
                minimum=1,
                precision=0,
                info="Filter videos from the last X days"
            )
            max_videos_input = gr.Number(
                label="Max Videos to Scan",
                value=DEFAULT_MAX_VIDEOS,
                minimum=1,
                precision=0,
                info="Limit the number of videos scanned"
            )
            submit_button = gr.Button("Submit", variant="primary")
            clear_button = gr.Button("Clear")

        with gr.Column(scale=2):
            gr.Markdown("## Status & Logs")
            status_output = gr.Textbox(
                label="Scan Progress",
                lines=10,
                max_lines=20,
                interactive=False,
                autoscroll=True
            )
            gr.Markdown("## Results")
            results_output = gr.Markdown(value="Results will appear here.")
            download_output = gr.File(label="Download Results", visible=False)
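
    # download_output starts hidden; scan_channel_videos reveals it by
    # yielding gr.File(value=..., visible=True) as its final update, and the
    # Clear handler resets its value by sending None.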

    submit_button.click(
        fn=scan_channel_videos,
        inputs=[channel_input, keywords_input, days_filter_input, max_videos_input],
        outputs=[status_output, results_output, download_output]
    )
    clear_button.click(
        fn=lambda: ("", "Results cleared.", "", DEFAULT_KEYWORDS, DEFAULT_DAYS, DEFAULT_MAX_VIDEOS, None),
        inputs=[],
        outputs=[status_output, results_output, channel_input, keywords_input, days_filter_input, max_videos_input, download_output]
    )

    gr.Markdown("**Note:** Requires a valid YouTube Data API key. Filters help optimize performance.")

# --- Run the App ---
if __name__ == "__main__":
    app.launch(debug=False)
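
# Generator handlers like scan_channel_videos only stream intermediate updates
# when Gradio's queue is active. Recent Gradio releases enable it by default;
# on older 3.x versions an explicit `app.queue()` before launch() may be
# needed.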