|
import os |
|
import re |
|
import sys |
|
import subprocess |
|
import importlib.util |
|
import shutil |
|
import tempfile |
|
|
|
|
|
def install_if_missing(package_name, import_name=None):
    """Install *package_name* with pip if *import_name* is not importable.

    Args:
        package_name: Name of the distribution on PyPI.
        import_name: Module name to probe for; falls back to *package_name*
            when not given (or falsy), since the two usually coincide.
    """
    module_to_check = import_name or package_name
    # find_spec() returns None when the module cannot be located, without
    # actually importing it.
    if importlib.util.find_spec(module_to_check) is not None:
        return
    print(f"Installation de {package_name}...")
    # Use the current interpreter's pip so the install lands in the right env.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
|
|
|
# Bootstrap third-party dependencies before the real imports below.
# Each entry is (pip distribution name, importable module name) — the module
# name defaults to the distribution name when omitted.
for package in [
    ("requests",),
    # The import name is "bs4" but the canonical PyPI distribution is
    # "beautifulsoup4"; the "bs4" package on PyPI is only a shim.
    ("beautifulsoup4", "bs4"),
    ("gradio",),
]:
    install_if_missing(*package)
|
|
|
|
|
import requests |
|
import gradio as gr |
|
from bs4 import BeautifulSoup |
|
|
|
|
|
def slugify(text, max_length=50):
    """Build a filesystem-friendly slug from *text*.

    Lowercases, drops punctuation, collapses whitespace/dashes into single
    underscores, and truncates to *max_length* characters (trimming any
    underscores left at the edges).
    """
    lowered = text.lower()
    # Keep only word characters, whitespace and dashes...
    cleaned = re.sub(r'[^\w\s-]', '', lowered)
    # ...then fold runs of whitespace/dashes into one underscore.
    slug = re.sub(r'[-\s]+', '_', cleaned)
    return slug[:max_length].strip('_')
|
|
|
def get_episode_links(main_url):
    """Collect unique episode-page URLs linked from the podcast's main page.

    Args:
        main_url: URL of the series' landing page on radiofrance.fr.

    Returns:
        list[str]: Absolute episode-page URLs, in first-seen order, with
        duplicates removed and the landing page itself excluded.

    Raises:
        requests.HTTPError: If the landing page returns an error status.
    """
    # Timeout so a stalled server cannot hang the whole UI forever.
    response = requests.get(main_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    seen = set()  # O(1) membership test instead of scanning the list
    for a in soup.find_all('a', href=True):
        href = a['href']
        # Heuristic: episode pages live under /podcasts/; skip self-links.
        if "/podcasts/" not in href or href == main_url:
            continue
        # Relative hrefs are resolved against the radiofrance.fr host.
        full_url = href if href.startswith("http") else f"https://www.radiofrance.fr{href}"
        if full_url not in seen:
            seen.add(full_url)
            links.append(full_url)

    return links
|
|
|
def extract_mp3_from_episode(url):
    """Extract the MP3 URL from an episode page.

    Args:
        url: URL of a single episode page.

    Returns:
        str | None: The src of the page's first ``<audio>`` tag when it ends
        in ``.mp3``; ``None`` when the page is unreachable or has no such tag.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort: a single unreachable episode must not abort the batch,
        # but only network/HTTP errors are swallowed — not programming bugs.
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    audio_tag = soup.find("audio")
    if audio_tag and audio_tag.get("src", "").endswith(".mp3"):
        return audio_tag["src"]
    return None
|
|
|
def get_podcast_title(url):
    """Extract the podcast's overall title as a slug.

    Args:
        url: URL of the series' landing page.

    Returns:
        str: Slugified text of the page's first ``<h1>`` (or ``<title>``),
        or the fallback ``"podcast"`` when the page cannot be fetched or has
        neither tag.
    """
    try:
        response = requests.get(url, timeout=30)
        # Without this, an error page's title would silently become the slug.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('h1') or soup.find('title')
        return slugify(title_tag.get_text()) if title_tag else "podcast"
    except Exception:
        # Deliberate best-effort boundary: any failure degrades to a generic
        # name rather than crashing the whole download.
        return "podcast"
|
|
|
def download_and_zip_podcast_series(main_url):
    """Download every episode MP3 of a series and bundle them into a ZIP.

    Args:
        main_url: URL of the series' landing page on radiofrance.fr.

    Returns:
        tuple[str, str | None]: A status message (French, user-facing) and
        the path to the resulting ZIP file, or ``None`` when nothing was
        downloaded or an error occurred.
    """
    try:
        # Guard against an empty slug (e.g. a title made only of punctuation).
        title = get_podcast_title(main_url) or "podcast"
        episode_pages = get_episode_links(main_url)

        if not episode_pages:
            return "Aucune page d’épisode trouvée.", None

        mp3_links = []
        for ep_url in episode_pages:
            mp3 = extract_mp3_from_episode(ep_url)
            if mp3:
                mp3_links.append(mp3)

        if not mp3_links:
            return "Aucun fichier MP3 trouvé dans les épisodes.", None

        mp3_dir = tempfile.mkdtemp()
        for i, mp3_url in enumerate(mp3_links, start=1):
            filename = f"{title}_{i:02}.mp3"
            filepath = os.path.join(mp3_dir, filename)
            # Stream to disk in chunks so large episodes never sit in memory.
            with requests.get(mp3_url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(filepath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

        # Build the archive in a *separate* temp directory: make_archive
        # creates the .zip before walking the tree, so placing it inside the
        # directory being zipped can make the archive include itself.
        zip_base = os.path.join(tempfile.mkdtemp(), title)
        zip_path = shutil.make_archive(zip_base, 'zip', mp3_dir)

        # The MP3s now live inside the ZIP; reclaim the intermediate files.
        shutil.rmtree(mp3_dir, ignore_errors=True)

        return f"{len(mp3_links)} épisode(s) téléchargé(s).", zip_path

    except Exception as e:
        # Top-level boundary for the UI callback: surface the error as text.
        return f"Erreur : {str(e)}", None
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as app:
    gr.Markdown("# Téléchargeur de Podcasts MP3 - France Culture")

    with gr.Row():
        url_input = gr.Textbox(label="URL de la série", placeholder="https://www.radiofrance.fr/...")

    download_button = gr.Button("Télécharger et compresser")
    output_text = gr.Textbox(label="Message")
    file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])

    def process(url):
        # Thin adapter between the button callback and the download helper:
        # forwards the (message, zip_path) pair straight to the two outputs.
        return download_and_zip_podcast_series(url)

    download_button.click(fn=process, inputs=[url_input], outputs=[output_text, file_output])
|
|
|
|
|
# Entry point: launch the Gradio app. share=True also exposes a temporary
# public gradio.live URL in addition to the local server.
if __name__ == "__main__":

    app.launch(share=True)