|
import logging
import os
import re
import shutil
import tempfile
import zipfile
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from mutagen.id3 import ID3
from mutagen.mp3 import MP3
|
|
|
def get_clean_title(filepath):
    """Return a filesystem-safe title for the MP3 file at *filepath*.

    The ID3 frames ``TIT2``, ``TIT3`` and ``TALB`` are tried in that order.
    The first frame whose text is still non-empty after sanitising is
    returned, truncated to at most 100 characters.

    Args:
        filepath: Path of the MP3 file to inspect.

    Returns:
        The sanitised title, or the file name without its extension when no
        usable tag is found or the file cannot be read.
    """
    # splitext only drops the final extension, so "a.b.mp3" falls back to
    # "a.b" instead of being cut at the first dot.
    fallback = os.path.splitext(os.path.basename(filepath))[0]

    try:
        audio = MP3(filepath, ID3=ID3)
        for tag in ('TIT2', 'TIT3', 'TALB'):
            if tag not in audio:
                continue
            texts = audio[tag].text
            if not texts:
                # Frame is present but carries no text; try the next one.
                continue
            # Strip characters that are invalid in file names, plus control
            # characters such as embedded newlines.
            title = re.sub(r'[\\/*?:"<>|\x00-\x1f]', '', str(texts[0])).strip()
            title = title[:100].strip()
            if title:
                return title
    except Exception:
        # Best effort: any failure while reading the tags must not abort the
        # caller, so the name derived from the path is used instead.
        return fallback

    return fallback
|
|
|
def _remove_quietly(path):
    """Delete *path* if possible, ignoring filesystem errors."""
    try:
        os.remove(path)
    except OSError:
        # Cleanup only: a file that is missing or locked is not worth failing for.
        pass


def _extract_mp3_links(soup, base_url):
    """Return the distinct MP3 URLs found in *soup*, in page order."""
    # Dict keys give ordered de-duplication without a linear scan per link.
    links = {}

    for script in soup.find_all('script', type='application/ld+json'):
        if not script.string:
            continue
        for match in re.findall(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script.string):
            # JSON may escape "/" as "\/"; undo that so the URL is usable.
            raw_url = match.replace('\\/', '/')
            links.setdefault(urljoin(base_url, raw_url.split('?')[0]))

    # Plain <a href="...mp3"> links are only used when the structured data
    # yielded nothing.
    if not links:
        for anchor in soup.find_all('a', href=re.compile(r'\.mp3')):
            links.setdefault(urljoin(base_url, anchor['href'].split('?')[0]))

    return list(links)


def _download_episode(mp3_url, headers, temp_dir, idx):
    """Download one episode into *temp_dir*; return its final path, or None on failure."""
    temp_path = os.path.join(temp_dir, f"temp_{idx}.mp3")
    try:
        with requests.get(mp3_url, headers=headers, stream=True, timeout=20) as r:
            r.raise_for_status()
            with open(temp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        title = get_clean_title(temp_path)
        final_path = os.path.join(temp_dir, f"{idx:02d} - {title}.mp3")
        os.rename(temp_path, final_path)
    except Exception:
        # Per-episode best effort: one broken episode must not abort the whole
        # series, but the failure is logged instead of being dropped silently.
        logging.getLogger(__name__).warning(
            "Failed to download %s", mp3_url, exc_info=True
        )
        # Do not leave a partially written file behind.
        _remove_quietly(temp_path)
        return None
    return final_path


def process_url(url):
    """Download every MP3 episode linked from *url* and bundle them in a zip.

    The page is fetched and scanned for MP3 URLs, first in its
    ``application/ld+json`` scripts and, if none are found there, in its
    ``<a href>`` links. Each episode is downloaded into a fresh temporary
    directory, renamed to ``"NN - <title>.mp3"`` and added to ``podcast.zip``.

    Args:
        url: Address of the page listing the episodes.

    Returns:
        A ``(zip_path, error)`` tuple. On success ``zip_path`` is the path of
        the archive and ``error`` is ``None``; on failure ``zip_path`` is
        ``None`` and ``error`` is a user-facing message.
    """
    if not isinstance(url, str) or not url.strip():
        return None, "Erreur : URL manquante"
    # Pasted URLs often carry stray whitespace or a trailing newline.
    url = url.strip()

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except Exception as e:
        # Boundary of the (zip_path, error) contract: report, never raise.
        return None, f"Erreur : {e}"

    soup = BeautifulSoup(response.text, 'html.parser')
    mp3_links = _extract_mp3_links(soup, url)
    if not mp3_links:
        return None, "Aucun épisode trouvé"

    temp_dir = tempfile.mkdtemp()
    filenames = []
    for idx, mp3_url in enumerate(mp3_links, 1):
        episode_path = _download_episode(mp3_url, headers, temp_dir, idx)
        if episode_path is not None:
            filenames.append(episode_path)

    if not filenames:
        # Nothing usable was downloaded: do not leak the temporary directory.
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, "Échec des téléchargements"

    zip_path = os.path.join(temp_dir, 'podcast.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in filenames:
            zipf.write(file, arcname=os.path.basename(file))

    # The archive now holds every episode; drop the loose copies so the
    # temporary directory does not keep the audio on disk twice.
    for file in filenames:
        _remove_quietly(file)

    return zip_path, None
|
|
|
def download_podcast(url):
    """Gradio callback: return the zip archive built for the series at *url*.

    Delegates to ``process_url`` and turns any error message it reports into
    a ``gr.Error``.

    Args:
        url: Address of the page listing the episodes.

    Returns:
        Path of the generated zip archive.

    Raises:
        gr.Error: When ``process_url`` reports a failure.
    """
    archive, failure = process_url(url)
    if not failure:
        return archive
    raise gr.Error(failure)
|
|
|
# Series URL offered as a clickable example under the form.
_EXAMPLE_URL = "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"

# UI definition: a URL field and a button, plus a file component that
# receives the generated archive.
with gr.Blocks() as app:
    gr.Markdown(value="## 🎙️ Téléchargeur Radio France")

    with gr.Row():
        url = gr.Textbox(
            label="URL de la série",
            placeholder="Collez l'URL ici...",
        )
        btn = gr.Button(value="Télécharger", variant="primary")
    # NOTE(review): the original indentation was lost; `output` is assumed to
    # sit below the row rather than inside it — confirm the intended layout.
    output = gr.File(label="Épisodes")

    examples = gr.Examples(examples=[[_EXAMPLE_URL]], inputs=[url])

    # Clicking the button runs the download and hands the zip to `output`.
    btn.click(fn=download_podcast, inputs=url, outputs=output)

# Starts the Gradio server as soon as this module is executed.
app.launch()