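# PodMagic / app.py
# NOTE: the lines below are page-header residue from the web file viewer this
# source was copied from; they are kept as comments so the module stays valid.
# Ribot's picture
# Update app.py
# 31f35d4 verified
# raw
# history blame
# 4.5 kB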
import gradio as gr
import requests
import re
import os
import zipfile
import tempfile
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, TIT3
def get_clean_title(filepath):
    """Return a filesystem-safe episode title for the MP3 at *filepath*.

    The title is read from the file's ID3 tags, preferring the TIT3
    (subtitle) frame and falling back to TIT2. A trailing
    "- Radio France" suffix and characters that are invalid in file names
    are removed, and the result is capped at 100 characters.

    Args:
        filepath: Path to the MP3 file to inspect.

    Returns:
        The cleaned title, or the file's base name without its extension
        when no usable title can be read (missing tags, unreadable file,
        or a title that is empty after cleaning). The result never carries
        a file extension, so callers can append one themselves.
    """
    # Extension-less file name, used whenever the tags give nothing usable.
    fallback = os.path.splitext(os.path.basename(filepath))[0]
    try:
        audio = MP3(filepath, ID3=ID3)
        # TIT3 (subtitle) often holds the episode number, so it is preferred.
        if 'TIT3' in audio:
            title = audio['TIT3'].text[0]
        elif 'TIT2' in audio:
            title = audio['TIT2'].text[0]
        else:
            return fallback
        # Drop the broadcaster suffix, then characters not allowed in file names.
        title = re.sub(r'\s*-\s*Radio France$', '', title, flags=re.IGNORECASE)
        title = re.sub(r'[\\/*?:"<>|]', '', title).strip()
        # Cap the length of the file name; an empty title falls back to the
        # file's own name so the caller never builds a nameless file.
        return title[:100].strip() or fallback
    except Exception:
        # Best effort: any tag-reading failure falls back to the file name.
        return fallback
def _find_mp3_links(html, base_url):
    """Return the unique MP3 URLs found in *html*, in order of appearance."""
    soup = BeautifulSoup(html, 'html.parser')
    main_content = soup.find('main') or soup
    episodes = main_content.find_all(
        ['article', 'div'],
        class_=re.compile(r'episode|podcast|card', re.IGNORECASE),
    )
    links = []
    for episode in episodes:
        script_tag = episode.find('script', type='application/ld+json')
        if not script_tag:
            continue
        # `.string` may be None (e.g. an empty tag); search an empty string
        # instead of letting re.search raise TypeError.
        match = re.search(
            r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script_tag.string or ''
        )
        if match:
            links.append(urljoin(base_url, match.group(1).split('?')[0]))
    # Fallback when the structured detection finds nothing: scan the raw HTML.
    if not links:
        links = re.findall(
            r'(https://media\.radiofrance-podcast\.net[^"\']+?\.mp3)', html
        )
    # dict.fromkeys removes duplicates while keeping the original order.
    return list(dict.fromkeys(links))


def process_url(url):
    """Download every episode linked from a podcast page and zip them.

    The page at *url* is fetched, MP3 links are extracted, each file is
    downloaded into a fresh temporary directory and renamed after its
    cleaned tag title, and all downloaded files are packed into
    ``podcast.zip`` in that directory.

    Args:
        url: Address of the podcast series page.

    Returns:
        A ``(zip_path, error)`` tuple. On success ``zip_path`` is the path
        of the archive and ``error`` is None; on failure ``zip_path`` is
        None and ``error`` is a user-facing message.
    """
    import shutil  # only needed here, for temporary-directory cleanup

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Referer': 'https://www.radiofrance.fr/'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except Exception as e:
        return None, f"Erreur de connexion : {str(e)}"

    mp3_links = _find_mp3_links(response.text, url)
    if not mp3_links:
        return None, "Aucun épisode trouvé - Vérifiez l'URL"

    temp_dir = tempfile.mkdtemp()
    filenames = []
    for idx, mp3_url in enumerate(mp3_links, 1):
        # Download under a temporary name; the final name depends on the tags.
        temp_path = os.path.join(temp_dir, f"temp_{idx}.mp3")
        try:
            with requests.get(mp3_url, headers=headers, stream=True, timeout=20) as r:
                r.raise_for_status()
                with open(temp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            clean_title = get_clean_title(temp_path)
            final_path = os.path.join(temp_dir, f"{idx:02d} - {clean_title}.mp3")
            os.rename(temp_path, final_path)
        except Exception:
            # Best effort: skip an episode that fails, but do not leave a
            # partially written file behind.
            if os.path.exists(temp_path):
                os.remove(temp_path)
            continue
        filenames.append(final_path)

    if not filenames:
        # Nothing was downloaded, so the temporary directory is of no use.
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, "Échec du téléchargement"

    # Build the ZIP archive.
    zip_path = os.path.join(temp_dir, 'podcast.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in filenames:
            zipf.write(file, arcname=os.path.basename(file))
    # Only the archive is returned, so the individual files can go.
    for file in filenames:
        os.remove(file)
    return zip_path, None
def download_podcast(url):
    """Gradio callback: build the podcast archive for *url*.

    Delegates to ``process_url`` and returns the path of the resulting ZIP.
    When ``process_url`` reports an error message, it is raised as a
    ``gr.Error`` instead.
    """
    archive_path, message = process_url(url)
    if not message:
        return archive_path
    raise gr.Error(message)
# User interface: a URL textbox, a button that triggers download_podcast,
# and a file component that receives the generated ZIP.
# NOTE(review): the original indentation was lost; the nesting below (textbox
# and button inside the row, everything else directly in the Blocks context,
# launch() outside it) is the presumed layout - confirm against the real file.
with gr.Blocks(title="RadioFrance Podcaster Pro") as app:
    gr.Markdown("## 🎧 Téléchargeur Intelligent Radio France")
    with gr.Row():
        # Single-line input for the podcast series page address.
        url_input = gr.Textbox(
            label="URL de la série podcast",
            placeholder="Ex: https://www.radiofrance.fr/...",
            max_lines=1
        )
        btn = gr.Button("Générer le ZIP", variant="primary")
    # Receives the value returned by download_podcast (the ZIP path).
    output = gr.File(label="Épisodes téléchargés")
    # One sample URL that fills url_input.
    examples = gr.Examples(
        examples=[[
            "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
        ]],
        inputs=[url_input]
    )
    # Wire the button: url_input -> download_podcast -> output,
    # registered under the API name "download".
    btn.click(
        fn=download_podcast,
        inputs=url_input,
        outputs=output,
        api_name="download"
    )
# Runs at import time: start the app with show_error enabled.
app.launch(show_error=True)