PodMagic / app.py
Ribot's picture
Update app.py
52a320c verified
raw
history blame
4.13 kB
import os
import re
import shutil
import tempfile
import zipfile
from html.parser import HTMLParser
from urllib.parse import urljoin

import gradio as gr
import requests
from mutagen.id3 import ID3, TIT2
from mutagen.mp3 import MP3
def get_clean_title(filepath):
    """Return a file-name-safe episode title for the MP3 at *filepath*.

    The ID3 ``TIT2`` frame is used when it can be read. The characters
    backslash, ``/``, ``*``, ``?``, ``:``, double quote, ``<``, ``>`` and
    ``|`` are removed from it and surrounding whitespace is stripped.

    When the tag is missing, unreadable, or empty once cleaned, the base
    name of *filepath* without its extension is returned instead, so the
    caller never receives an empty title.
    """
    try:
        audio = MP3(filepath, ID3=ID3)
        if 'TIT2' in audio:
            title = audio['TIT2'].text[0]
            # Strip characters that cannot be used in a file name.
            title = re.sub(r'[\\/*?:"<>|]', "", title).strip()
            # A title made only of stripped characters would produce a
            # nameless file; fall through to the file-name fallback.
            if title:
                return title
    except Exception as e:
        # Best effort: any metadata problem falls back to the file name.
        print(f"Erreur lecture métadonnées : {str(e)}")
    # splitext keeps dots that are part of the name ("ep.12.mp3" -> "ep.12").
    return os.path.splitext(os.path.basename(filepath))[0]
# Captures the MP3 address stored under "contentUrl" in a JSON-LD payload.
_CONTENT_URL_RE = re.compile(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)')


class _EpisodeScriptParser(HTMLParser):
    """Collect the first JSON-LD ``<script>`` of each episode ``<article>``.

    An article is kept when its ``class`` attribute contains ``episode`` or
    ``podcast``. If the page has a ``<main>`` element, only articles opened
    inside the first one are reported; otherwise the whole page is used.
    """

    _CLASS_RE = re.compile(r'episode|podcast')

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self._saw_main = False
        self._main_depth = 0
        self._main_closed = False
        # One entry per open <article>: index into _articles, or None when
        # the article does not match the class filter.
        self._open_articles = []
        # One dict per matching article, in document order.
        self._articles = []
        # Text chunks of the JSON-LD script being read, or None.
        self._script_chunks = None

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if tag == 'main':
            # Only the first <main> element scopes the search.
            if not self._main_closed:
                self._saw_main = True
                self._main_depth += 1
        elif tag == 'article':
            css_class = attributes.get('class') or ''
            if self._CLASS_RE.search(css_class):
                self._articles.append(
                    {'in_main': self._main_depth > 0, 'script': None}
                )
                self._open_articles.append(len(self._articles) - 1)
            else:
                self._open_articles.append(None)
        elif tag == 'script':
            if attributes.get('type') == 'application/ld+json':
                self._script_chunks = []

    def handle_data(self, data):
        if self._script_chunks is not None:
            self._script_chunks.append(data)

    def handle_endtag(self, tag):
        if tag == 'main':
            if self._main_depth:
                self._main_depth -= 1
                if not self._main_depth:
                    self._main_closed = True
        elif tag == 'article':
            if self._open_articles:
                self._open_articles.pop()
        elif tag == 'script' and self._script_chunks is not None:
            text = ''.join(self._script_chunks)
            self._script_chunks = None
            # The script belongs to every enclosing episode article that
            # has not met a JSON-LD script yet (only the first one counts).
            for index in self._open_articles:
                if index is not None and self._articles[index]['script'] is None:
                    self._articles[index]['script'] = text

    def scripts(self):
        """Return the collected non-empty script texts in document order."""
        return [
            article['script']
            for article in self._articles
            if article['script'] and (article['in_main'] or not self._saw_main)
        ]


def _extract_mp3_links(page_html, page_url, limit):
    """Return up to *limit* unique absolute MP3 URLs found in *page_html*."""
    parser = _EpisodeScriptParser()
    parser.feed(page_html)
    parser.close()
    links = []
    for script_text in parser.scripts():
        match = _CONTENT_URL_RE.search(script_text)
        if match:
            # JSON may escape slashes as "\/"; undo that before resolving.
            raw_url = match.group(1).replace('\\/', '/')
            links.append(urljoin(page_url, raw_url.split('?')[0]))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(links))[:limit]


def _download_episode(mp3_url, headers, temp_dir, idx):
    """Download one episode into *temp_dir* and return its final path.

    The file is first written under a temporary name, then renamed to
    ``"NN - <title>.mp3"`` using its metadata. Any partial file is removed
    before the error is re-raised.
    """
    original_name = os.path.basename(mp3_url).split('?')[0]
    temp_path = os.path.join(temp_dir, f"temp_{idx}_{original_name}")
    try:
        with requests.get(mp3_url, headers=headers, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(temp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        clean_title = get_clean_title(temp_path)
        final_path = os.path.join(temp_dir, f"{idx:02d} - {clean_title}.mp3")
        os.rename(temp_path, final_path)
    except Exception:
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    return final_path


def process_url(url, max_episodes=4):
    """Download the episodes listed on a podcast page and zip them.

    Args:
        url: Address of the podcast series page.
        max_episodes: Maximum number of episodes to download, counted from
            the top of the page. ``None`` removes the limit.

    Returns:
        ``(zip_path, None)`` on success, where ``zip_path`` is the archive
        created in a fresh temporary directory, or ``(None, message)`` when
        the page cannot be fetched, no episode is found, or every download
        fails. A failed episode is skipped as long as another one succeeds.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Referer': 'https://www.radiofrance.fr/'
    }
    try:
        # A timeout keeps a stalled server from blocking the request forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        return None, f"Erreur de connexion : {str(e)}"

    mp3_links = _extract_mp3_links(response.text, url, max_episodes)
    if not mp3_links:
        return None, "Aucun épisode principal trouvé"

    temp_dir = tempfile.mkdtemp()
    filenames = []
    for idx, mp3_url in enumerate(mp3_links, 1):
        try:
            filenames.append(_download_episode(mp3_url, headers, temp_dir, idx))
        except Exception as e:
            # Best effort: report the failure and carry on with the rest.
            print(f"Échec du téléchargement de {mp3_url} : {e}")
            continue

    if not filenames:
        # Nothing to return, so do not leave the empty directory behind.
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, "Échec du téléchargement des épisodes"

    zip_path = os.path.join(temp_dir, 'podcast_episodes.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in filenames:
            zipf.write(file, arcname=os.path.basename(file))

    # The archive now holds the episodes; drop the loose copies.
    for file in filenames:
        try:
            os.remove(file)
        except OSError:
            pass
    return zip_path, None
def download_podcast(url):
    """Gradio callback: build the episode archive for *url*.

    Returns the path of the ZIP produced by ``process_url``. When that
    function reports an error message, it is raised as a ``gr.Error``.
    """
    archive, failure = process_url(url)
    if not failure:
        return archive
    raise gr.Error(failure)
# Gradio interface: one URL field and a button that returns the ZIP archive.
with gr.Blocks(title="Podcast Clean Downloader") as app:
    gr.Markdown("## 🎙️ Téléchargeur Intelligent de Podcasts")
    with gr.Row():
        url_input = gr.Textbox(
            label="URL Radio France",
            placeholder="Collez ici l'URL de la série podcast...",
            max_lines=1
        )
        btn = gr.Button("Générer le ZIP des épisodes", variant="primary")
    output = gr.File(label="Télécharger les épisodes")
    examples = gr.Examples(
        examples=[[
            "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
        ]],
        inputs=[url_input],
        label="Exemple fonctionnel"
    )
    # Clicking the button runs the download and feeds the file component.
    btn.click(
        fn=download_podcast,
        inputs=url_input,
        outputs=output,
        api_name="download"
    )

# Start the server only when run as a script, so importing this module
# (for tests or tooling) does not launch it.
if __name__ == "__main__":
    app.launch(show_error=True, share=False)