Spaces:
Sleeping
Sleeping
Create extract_m3u8.py
Browse files- extract_m3u8.py +147 -0
extract_m3u8.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
from selenium import webdriver
|
5 |
+
from selenium.webdriver.chrome.options import Options
|
6 |
+
from selenium.webdriver.chrome.service import Service
|
7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
8 |
+
|
9 |
+
def _build_chrome_options(headers=None):
    """Build the headless Chrome options used for the capture session.

    Args:
        headers (dict | None): Optional custom headers. Only a
            ``User-Agent`` entry is consumed here (as a Chrome switch);
            the remaining headers are applied by the caller.

    Returns:
        Options: Chrome options with performance logging enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    # Chrome parses the window size as "width,height"; "1920x1080" is ignored.
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    for key, value in (headers or {}).items():
        if key.lower() == "user-agent":
            chrome_options.add_argument(f"--user-agent={value}")

    # Enable the performance log so network events can be read back later.
    chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
    return chrome_options


def _m3u8_urls_from_performance_logs(logs):
    """Collect request/response URLs containing ``.m3u8`` from performance logs.

    Args:
        logs (iterable): Entries as returned by ``driver.get_log("performance")``;
            each entry is expected to carry a JSON string under ``"message"``.

    Returns:
        set: URLs containing ``.m3u8``. Malformed entries are skipped.
    """
    found = set()
    for entry in logs:
        try:
            message = json.loads(entry["message"])["message"]
            method = message["method"]
            params = message["params"]
        except (KeyError, TypeError, ValueError):
            # Best effort: ignore entries that do not have the expected shape.
            continue

        if (
            "Network.responseReceived" not in method
            and "Network.requestWillBeSent" not in method
        ):
            continue
        if not isinstance(params, dict):
            continue

        for section in ("request", "response"):
            details = params.get(section)
            if not isinstance(details, dict):
                continue
            candidate = details.get("url")
            if isinstance(candidate, str) and ".m3u8" in candidate:
                found.add(candidate)
    return found


def extract_m3u8_urls(url, wait_time=10, headers=None):
    """Load a web page in headless Chrome and return the m3u8 URLs it exposes.

    URLs are gathered from three places: the browser's network performance
    log, the page HTML / ``<script>`` contents, and string-valued properties
    of ``window``.

    Args:
        url (str): URL of the page to analyse.
        wait_time (int): Seconds to sleep after navigation before collecting.
        headers (dict): Optional custom headers. ``User-Agent`` is passed as
            a Chrome switch; every other header is sent with all requests via
            the DevTools ``Network.setExtraHTTPHeaders`` command.

    Returns:
        list: Sorted list of the distinct m3u8 URLs found.
    """
    chrome_options = _build_chrome_options(headers)

    service = Service()
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Apply custom headers BEFORE navigating so they also cover the
        # initial document request. Values are passed as data (no string-built
        # JavaScript), so quotes in a header cannot break or inject script.
        extra_headers = {
            key: str(value)
            for key, value in (headers or {}).items()
            if key.lower() != "user-agent"
        }
        if extra_headers:
            driver.execute_cdp_cmd("Network.enable", {})
            driver.execute_cdp_cmd(
                "Network.setExtraHTTPHeaders", {"headers": extra_headers}
            )

        driver.get(url)

        # Give the page time to issue its media requests.
        time.sleep(wait_time)

        # 1. Network requests/responses recorded in the performance log.
        m3u8_urls = _m3u8_urls_from_performance_logs(
            driver.get_log("performance")
        )

        # 2. Page content and scripts.
        m3u8_pattern = re.compile(r'https?://[^"\'\s]+\.m3u8[^"\'\s]*')

        m3u8_urls.update(m3u8_pattern.findall(driver.page_source))

        for script in driver.find_elements("tag name", "script"):
            # get_attribute may return None; treat that as empty content.
            content = script.get_attribute("innerHTML") or ""
            m3u8_urls.update(m3u8_pattern.findall(content))

        # 3. String-valued global JavaScript variables mentioning ".m3u8".
        js_variables = driver.execute_script("""
            var allVars = {};
            for (var key in window) {
                try {
                    if (typeof window[key] === 'string' && window[key].includes('.m3u8')) {
                        allVars[key] = window[key];
                    }
                } catch(e) {}
            }
            return allVars;
        """)

        for value in (js_variables or {}).values():
            m3u8_urls.update(m3u8_pattern.findall(value))

        return sorted(m3u8_urls)

    finally:
        # Always close the browser, even if collection failed.
        driver.quit()
|