deepak191z committed on
Commit
d751bce
·
verified ·
1 Parent(s): 45bc5e7

Update index.js

Browse files
Files changed (1) hide show
  1. index.js +82 -118
index.js CHANGED
@@ -1,132 +1,96 @@
1
- const express = require('express');
2
- const { chromium } = require('playwright');
3
- const bodyParser = require('body-parser');
4
- const cors = require('cors');
5
 
6
  const app = express();
7
- app.use(bodyParser.urlencoded({ extended: true }));
8
- app.use(bodyParser.json());
9
- app.use(cors());
10
 
11
- const html = `
12
- <!DOCTYPE html>
13
- <html lang="en">
14
- <head>
15
- <meta charset="UTF-8">
16
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
17
- <title>YouTube Transcript Generator (Playwright)</title>
18
- <style>
19
- body {
20
- font-family: Arial, sans-serif;
21
- max-width: 800px;
22
- margin: 0 auto;
23
- padding: 20px;
24
- }
25
- form {
26
- display: flex;
27
- flex-direction: column;
28
- }
29
- input, button {
30
- margin: 10px 0;
31
- padding: 5px;
32
- }
33
- #result {
34
- white-space: pre-wrap;
35
- background-color: #f0f0f0;
36
- padding: 10px;
37
- border-radius: 5px;
38
- }
39
- </style>
40
- </head>
41
- <body>
42
- <h1>YouTube Transcript Generator (Playwright)</h1>
43
- <form id="transcriptForm">
44
- <input type="text" id="videoUrl" name="videoUrl" placeholder="YouTube Video URL" required>
45
- <input type="text" id="videoTitle" name="videoTitle" placeholder="Video Title" required>
46
- <button type="submit">Generate Transcript</button>
47
- </form>
48
- <div id="result"></div>
49
 
50
- <script>
51
- document.getElementById('transcriptForm').addEventListener('submit', async (e) => {
52
- e.preventDefault();
53
- const videoUrl = document.getElementById('videoUrl').value;
54
- const videoTitle = document.getElementById('videoTitle').value;
55
- const resultDiv = document.getElementById('result');
 
 
 
 
 
56
 
57
- resultDiv.textContent = 'Generating transcript...';
 
 
 
 
 
 
 
 
58
 
59
- try {
60
- const response = await fetch('/extract-transcript', {
61
- method: 'POST',
62
- headers: {
63
- 'Content-Type': 'application/json',
64
- },
65
- body: JSON.stringify({ videoUrl, videoTitle }),
66
- });
67
 
68
- if (response.ok) {
69
- const data = await response.json();
70
- resultDiv.textContent = data.transcript;
71
- } else {
72
- resultDiv.textContent = 'Error generating transcript. Please try again.';
73
- }
74
- } catch (error) {
75
- console.error('Error:', error);
76
- resultDiv.textContent = 'An error occurred. Please try again.';
77
- }
78
- });
79
- </script>
80
- </body>
81
- </html>
82
- `;
 
 
83
 
84
- app.get('/', (req, res) => {
85
- res.send(html);
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  });
87
 
88
- app.post('/extract-transcript', async (req, res) => {
89
- const { videoUrl, videoTitle } = req.body;
90
- if (!videoUrl || !videoTitle) {
91
- return res.status(400).send('videoUrl and videoTitle are required');
92
- }
93
-
94
- const browser = await chromium.launch();
95
- const context = await browser.newContext();
96
- const page = await context.newPage();
 
 
 
97
 
98
- try {
99
- await page.goto(videoUrl, { waitUntil: 'networkidle' });
100
-
101
- // Set viewport size
102
- await page.setViewportSize({ width: 1920, height: 1080 });
103
-
104
- // Click the "Expand" button to expand the video description
105
- await page.click('tp-yt-paper-button#expand');
106
-
107
- // Wait for the "Show transcript" button and click it
108
- await page.click('button[aria-label="Show transcript"]');
109
-
110
- // Wait for the transcript container to appear
111
- await page.waitForSelector('ytd-transcript-segment-list-renderer');
112
-
113
- // Extract the transcript text
114
- const transcript = await page.evaluate(() => {
115
- const elements = Array.from(document.querySelectorAll('ytd-transcript-segment-renderer .segment-text'));
116
- return elements.map(element => element.innerText).join('\n');
117
- });
118
-
119
- res.json({ transcript });
120
-
121
- } catch (error) {
122
- console.error('Error extracting transcript:', error);
123
- res.status(500).send('Error extracting transcript');
124
- } finally {
125
- await browser.close();
126
- }
127
  });
128
 
129
- const PORT = 7860;
130
  app.listen(PORT, () => {
131
- console.log(`Server is running on port ${PORT}`);
132
- });
 
1
+ const express = require("express");
2
+ const { chromium } = require("playwright");
 
 
3
 
4
const app = express();
const PORT = process.env.PORT || 7860;

// Shared Playwright browser instance. It stays undefined until the startup
// launch below resolves, so code that uses it must tolerate the gap.
let browser;

// Launch Playwright browser once at startup
(async () => {
  try {
    browser = await chromium.launch({
      headless: true,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
      ],
    });
  } catch (err) {
    // Without a browser no scrape can ever succeed; report the cause and stop
    // instead of leaving an anonymous unhandled promise rejection.
    console.error("Failed to launch Playwright browser:", err);
    process.exit(1);
  }
})();
20
 
21
/**
 * Core scraping function: read the first three video cards from a YouTube
 * channel's "Videos" tab.
 *
 * Opens a fresh context on the module-level `browser`, loads
 * `https://www.youtube.com/@<channelName>/videos`, and maps the first three
 * `#dismissible` elements to plain objects. The context is always closed,
 * whether the scrape succeeds or fails.
 *
 * @param {string} channelName - Channel handle without the leading "@".
 * @returns {Promise<Array<{title: string, videoId: (string|null), url: string,
 *   thumbnail: (string|null), published: string}>>} At most three entries.
 *   `videoId` and `thumbnail` are null when the link has no `v=` parameter.
 * @throws {Error} If the shared browser has not been launched yet, or if
 *   navigation or the selector wait fails or times out.
 */
async function scrapeChannelVideos(channelName) {
  // The browser is launched asynchronously at startup, so a request can get
  // here first; fail with a clear message rather than a TypeError.
  if (!browser) {
    throw new Error("Browser is not ready yet");
  }

  // Encode the handle so characters such as "/", "?" or "#" in caller-supplied
  // input cannot change which YouTube path is requested.
  const url = `https://www.youtube.com/@${encodeURIComponent(channelName)}/videos`;
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  });

  try {
    const page = await context.newPage();
    await page.goto(url, { waitUntil: "networkidle", timeout: 20000 });
    await page.waitForSelector('a#video-title-link', { timeout: 10000 });

    // This callback is evaluated inside the page, so it must be self-contained
    // and must not reference variables from this module.
    return await page.$$eval("#dismissible", (els) =>
      els.slice(0, 3).map((el) => {
        const anchor = el.querySelector('a#video-title-link');
        // NOTE(review): this takes the first span under #metadata-line;
        // confirm against live markup that it is the publish date.
        const meta = el.querySelector("#metadata-line span");
        const vidUrl = anchor?.href || "";
        const vidIdMatch = vidUrl.match(/v=([^&]+)/);
        return {
          title: anchor?.title || anchor?.textContent.trim() || "",
          videoId: vidIdMatch?.[1] || null,
          url: vidUrl,
          thumbnail: vidIdMatch
            ? `https://img.youtube.com/vi/${vidIdMatch[1]}/maxresdefault.jpg`
            : null,
          published: meta?.textContent.trim() || "",
        };
      })
    );
  } finally {
    // Single cleanup point: runs on success, on error, and if newPage() fails.
    await context.close();
  }
}
60
+
61
// Home route: responds with a short JSON welcome that points at the docs
// route and shows an example scrape URL.
app.get("/", (_request, response) => {
  const welcome = {
    message: "Welcome to the YouTube Video Scraper API",
    docs: "/api",
    example: "/api/video/MrBeast",
  };
  response.json(welcome);
});
69
 
70
// API landing/documentation route: returns a JSON list describing the
// endpoints this server exposes.
app.get("/api", (_request, response) => {
  const videoEndpoint = {
    route: "/api/video/:channelName",
    method: "GET",
    description: "Scrape the latest 3 videos for a given YouTube channel",
  };
  response.json({ endpoints: [videoEndpoint] });
});
82
 
83
// Dynamic video-scraping endpoint.
// Responds with { channel, videos, timestamp } on success, or with HTTP 500
// and { error } when scrapeChannelVideos rejects.
app.get("/api/video/:channelName", async (req, res) => {
  const { channelName: channel } = req.params;
  try {
    const videos = await scrapeChannelVideos(channel);
    res.json({ channel, videos, timestamp: new Date().toISOString() });
  } catch (error) {
    // Keep a server-side record with the stack trace; the client only
    // receives the message text.
    console.error(`Failed to scrape channel "${channel}":`, error);
    res.status(500).json({ error: error.message });
  }
});
93
 
 
94
// Start the HTTP server on PORT and log once it is accepting connections.
function announceListening() {
  console.log(`Server listening on port ${PORT}`);
}

app.listen(PORT, announceListening);