Spaces:
Running
Running
Update index.js
Browse files
index.js
CHANGED
@@ -1,132 +1,96 @@
|
|
1 |
-
const express = require(
|
2 |
-
const { chromium } = require(
|
3 |
-
const bodyParser = require('body-parser');
|
4 |
-
const cors = require('cors');
|
5 |
|
6 |
const app = express();
|
7 |
-
|
8 |
-
app.use(bodyParser.json());
|
9 |
-
app.use(cors());
|
10 |
|
11 |
-
|
12 |
-
<!DOCTYPE html>
|
13 |
-
<html lang="en">
|
14 |
-
<head>
|
15 |
-
<meta charset="UTF-8">
|
16 |
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
17 |
-
<title>YouTube Transcript Generator (Playwright)</title>
|
18 |
-
<style>
|
19 |
-
body {
|
20 |
-
font-family: Arial, sans-serif;
|
21 |
-
max-width: 800px;
|
22 |
-
margin: 0 auto;
|
23 |
-
padding: 20px;
|
24 |
-
}
|
25 |
-
form {
|
26 |
-
display: flex;
|
27 |
-
flex-direction: column;
|
28 |
-
}
|
29 |
-
input, button {
|
30 |
-
margin: 10px 0;
|
31 |
-
padding: 5px;
|
32 |
-
}
|
33 |
-
#result {
|
34 |
-
white-space: pre-wrap;
|
35 |
-
background-color: #f0f0f0;
|
36 |
-
padding: 10px;
|
37 |
-
border-radius: 5px;
|
38 |
-
}
|
39 |
-
</style>
|
40 |
-
</head>
|
41 |
-
<body>
|
42 |
-
<h1>YouTube Transcript Generator (Playwright)</h1>
|
43 |
-
<form id="transcriptForm">
|
44 |
-
<input type="text" id="videoUrl" name="videoUrl" placeholder="YouTube Video URL" required>
|
45 |
-
<input type="text" id="videoTitle" name="videoTitle" placeholder="Video Title" required>
|
46 |
-
<button type="submit">Generate Transcript</button>
|
47 |
-
</form>
|
48 |
-
<div id="result"></div>
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
headers: {
|
63 |
-
'Content-Type': 'application/json',
|
64 |
-
},
|
65 |
-
body: JSON.stringify({ videoUrl, videoTitle }),
|
66 |
-
});
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
83 |
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
});
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
// Wait for the "Show transcript" button and click it
|
108 |
-
await page.click('button[aria-label="Show transcript"]');
|
109 |
-
|
110 |
-
// Wait for the transcript container to appear
|
111 |
-
await page.waitForSelector('ytd-transcript-segment-list-renderer');
|
112 |
-
|
113 |
-
// Extract the transcript text
|
114 |
-
const transcript = await page.evaluate(() => {
|
115 |
-
const elements = Array.from(document.querySelectorAll('ytd-transcript-segment-renderer .segment-text'));
|
116 |
-
return elements.map(element => element.innerText).join('\n');
|
117 |
-
});
|
118 |
-
|
119 |
-
res.json({ transcript });
|
120 |
-
|
121 |
-
} catch (error) {
|
122 |
-
console.error('Error extracting transcript:', error);
|
123 |
-
res.status(500).send('Error extracting transcript');
|
124 |
-
} finally {
|
125 |
-
await browser.close();
|
126 |
-
}
|
127 |
});
|
128 |
|
129 |
-
const PORT = 7860;
|
130 |
app.listen(PORT, () => {
|
131 |
-
|
132 |
-
});
|
|
|
1 |
+
const express = require("express");
|
2 |
+
const { chromium } = require("playwright");
|
|
|
|
|
3 |
|
4 |
const app = express();
const PORT = process.env.PORT || 7860;

// Shared Playwright browser instance. It stays undefined until the startup
// launch below resolves, so code that uses it must tolerate that window.
let browser;

// Launch Playwright browser once at startup.
// The launch is awaited inside a try/catch so a failure is reported with a
// clear message instead of surfacing as an anonymous unhandled rejection.
(async () => {
  try {
    browser = await chromium.launch({
      headless: true,
      // NOTE(review): sandbox / shared-memory flags look like the usual set for
      // running Chromium inside a container — confirm they are still required.
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
      ],
    });
  } catch (err) {
    console.error("Failed to launch Playwright browser:", err);
    // Without a browser no scrape can succeed, so stop instead of serving errors.
    process.exit(1);
  }
})();
|
20 |
|
21 |
+
/**
 * Core scraping function: opens a channel's "/videos" page in a fresh browser
 * context and extracts data for the first three `#dismissible` entries.
 *
 * @param {string} channelName - Channel handle, inserted after "@" in the URL.
 * @returns {Promise<Array<{
 *   title: string,
 *   videoId: (string|null),
 *   url: string,
 *   thumbnail: (string|null),
 *   published: string
 * }>>} Up to three entries; `videoId` and `thumbnail` are null when the link
 *   has no `v=` query parameter, and `published` is the trimmed text of the
 *   first `#metadata-line span` (empty string if absent).
 * @throws {Error} If the shared browser has not been launched yet, or if page
 *   creation, navigation, or the selector wait fails or times out.
 */
async function scrapeChannelVideos(channelName) {
  // The browser is launched asynchronously at startup; fail with a clear
  // message rather than a TypeError on `undefined.newContext`.
  if (!browser) {
    throw new Error("Browser is not ready yet; retry shortly");
  }

  // Encode the handle so characters like "/", "?" or "#" cannot alter the path.
  const url = `https://www.youtube.com/@${encodeURIComponent(channelName)}/videos`;
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  });

  try {
    // Page creation lives inside the try so the context is released even if it fails.
    const page = await context.newPage();
    await page.goto(url, { waitUntil: "networkidle", timeout: 20000 });
    await page.waitForSelector('a#video-title-link', { timeout: 10000 });

    // This callback runs in the page, so it must not reference outer variables.
    return await page.$$eval("#dismissible", (els) =>
      els.slice(0, 3).map((el) => {
        const anchor = el.querySelector('a#video-title-link');
        const meta = el.querySelector("#metadata-line span");
        const vidUrl = anchor?.href || "";
        const vidIdMatch = vidUrl.match(/v=([^&]+)/);
        return {
          title: anchor?.title || anchor?.textContent.trim() || "",
          videoId: vidIdMatch?.[1] || null,
          url: vidUrl,
          thumbnail:
            vidIdMatch && `https://img.youtube.com/vi/${vidIdMatch[1]}/maxresdefault.jpg`,
          published: meta?.textContent.trim() || "",
        };
      })
    );
  } finally {
    // Single cleanup point for both the success and the failure path.
    await context.close();
  }
}
|
60 |
+
|
61 |
+
// Root endpoint: answers with a small JSON greeting that points callers at the
// documentation route and shows an example scrape URL.
app.get("/", function sendWelcome(_req, res) {
  const welcome = {
    message: "Welcome to the YouTube Video Scraper API",
    docs: "/api",
    example: "/api/video/MrBeast",
  };
  res.json(welcome);
});
|
69 |
|
70 |
+
// Documentation endpoint: lists the routes this API exposes as JSON.
app.get("/api", function sendEndpointList(_req, res) {
  const videoEndpoint = {
    route: "/api/video/:channelName",
    method: "GET",
    description: "Scrape the latest 3 videos for a given YouTube channel",
  };
  res.json({ endpoints: [videoEndpoint] });
});
|
82 |
|
83 |
+
// Dynamic video-scraping endpoint.
// Responds with { channel, videos, timestamp } on success, or HTTP 500 with
// { error } when scrapeChannelVideos rejects.
app.get("/api/video/:channelName", async (req, res) => {
  const channel = req.params.channelName;
  try {
    const videos = await scrapeChannelVideos(channel);
    res.json({ channel, videos, timestamp: new Date().toISOString() });
  } catch (error) {
    // Log server-side as well; otherwise the failure is only visible to the client.
    console.error(`Error scraping channel "${channel}":`, error);
    res.status(500).json({ error: error.message });
  }
});
|
93 |
|
|
|
94 |
// Start the HTTP server on PORT and log the port from the listen callback.
function announceListening() {
  console.log(`Server listening on port ${PORT}`);
}

app.listen(PORT, announceListening);
|