Update app.py
Browse files
app.py
CHANGED
@@ -32,29 +32,65 @@ def slugify(text, max_length=50):
|
|
32 |
text = re.sub(r'[-\s]+', '_', text)
|
33 |
return text[:max_length].strip('_')
|
34 |
|
35 |
-
def
|
36 |
-
|
|
|
37 |
response.raise_for_status()
|
38 |
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
podcast_title = slugify(title_tag.get_text()) if title_tag else "podcast"
|
43 |
-
|
44 |
-
# Liens MP3
|
45 |
-
mp3_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.mp3')]
|
46 |
|
47 |
-
|
|
|
|
|
|
|
|
|
48 |
|
49 |
-
def download_and_zip_podcast(url):
|
50 |
-
try:
|
51 |
-
podcast_title, mp3_links = extract_mp3_links_and_title(url)
|
52 |
if not mp3_links:
|
53 |
-
return "Aucun fichier MP3
|
54 |
|
55 |
temp_dir = tempfile.mkdtemp()
|
56 |
for i, mp3_url in enumerate(mp3_links, start=1):
|
57 |
-
filename = f"{
|
58 |
filepath = os.path.join(temp_dir, filename)
|
59 |
with requests.get(mp3_url, stream=True) as r:
|
60 |
r.raise_for_status()
|
@@ -62,29 +98,29 @@ def download_and_zip_podcast(url):
|
|
62 |
for chunk in r.iter_content(chunk_size=8192):
|
63 |
f.write(chunk)
|
64 |
|
65 |
-
zip_path = os.path.join(temp_dir, f"{
|
66 |
shutil.make_archive(zip_path.replace('.zip', ''), 'zip', temp_dir)
|
67 |
|
68 |
-
return f"{len(mp3_links)}
|
69 |
|
70 |
except Exception as e:
|
71 |
return f"Erreur : {str(e)}", None
|
72 |
|
73 |
# === INTERFACE GRADIO ===
|
74 |
with gr.Blocks() as app:
|
75 |
-
gr.Markdown("# Téléchargeur de Podcasts MP3")
|
76 |
with gr.Row():
|
77 |
-
url_input = gr.Textbox(label="URL de la
|
78 |
download_button = gr.Button("Télécharger et compresser")
|
79 |
output_text = gr.Textbox(label="Message")
|
80 |
file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])
|
81 |
|
82 |
def process(url):
|
83 |
-
message, zip_file =
|
84 |
return message, zip_file
|
85 |
|
86 |
download_button.click(fn=process, inputs=[url_input], outputs=[output_text, file_output])
|
87 |
|
88 |
-
# === LANCEMENT
|
89 |
if __name__ == "__main__":
|
90 |
-
app.launch(share=True)
|
|
|
32 |
text = re.sub(r'[-\s]+', '_', text)
|
33 |
return text[:max_length].strip('_')
|
34 |
|
35 |
+
def get_episode_links(main_url, timeout=30):
    """Return the episode page URLs linked from a podcast series page.

    Args:
        main_url: URL of the series page to scan.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of absolute URLs, in page order and without duplicates, for
        every anchor whose ``href`` contains ``"/podcasts/"``. The series
        page itself is excluded.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an error status.
    """
    from urllib.parse import urljoin

    response = requests.get(main_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Resolve every href against the page URL so that root-relative,
    # document-relative and protocol-relative links all become absolute.
    candidates = (
        urljoin(main_url, a['href'])
        for a in soup.find_all('a', href=True)
        if "/podcasts/" in a['href']
    )

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(link for link in candidates if link != main_url))
|
50 |
+
|
51 |
+
def extract_mp3_from_episode(url, timeout=30):
    """Return the MP3 URL of an episode page, or ``None`` if there is none.

    Args:
        url: URL of the episode page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The absolute URL found in the ``src`` attribute of the first
        ``<audio>`` tag when it ends with ``.mp3``; ``None`` when the page
        cannot be fetched or has no such tag.
    """
    from urllib.parse import urljoin

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: an unreachable episode is skipped, not fatal.
        return None

    audio_tag = BeautifulSoup(response.text, 'html.parser').find("audio")
    if audio_tag is None:
        return None

    src = audio_tag.get("src", "")
    if not src.endswith(".mp3"):
        return None

    # A relative src would be unusable by the downloader; make it absolute.
    return urljoin(url, src)
|
63 |
+
|
64 |
+
def get_podcast_title(url, timeout=30):
    """Return a slugified title for the podcast series at ``url``.

    Args:
        url: URL of the series page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The slugified text of the first ``<h1>`` (or, failing that, the
        ``<title>``) of the page. Falls back to ``"podcast"`` when the page
        cannot be fetched, has neither tag, or the slug comes out empty.
    """
    fallback = "podcast"

    try:
        response = requests.get(url, timeout=timeout)
        # Without this check the heading of an error page would be used.
        response.raise_for_status()
    except requests.RequestException:
        return fallback

    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('h1') or soup.find('title')
    if title_tag is None:
        return fallback

    # An empty slug would produce file names such as "_01.mp3".
    return slugify(title_tag.get_text()) or fallback
|
73 |
+
|
74 |
+
def download_and_zip_podcast_series(main_url, timeout=60):
    """Download every episode of a podcast series and bundle them in a ZIP.

    Args:
        main_url: URL of the series page.
        timeout: Seconds to wait on each MP3 download connection.

    Returns:
        A ``(message, zip_path)`` tuple. ``zip_path`` is the path of the
        created archive, or ``None`` when nothing was downloaded or an
        error occurred; ``message`` is the user-facing status text.
    """
    try:
        title = get_podcast_title(main_url)
        episode_pages = get_episode_links(main_url)

        if not episode_pages:
            return "Aucune page d’épisode trouvée.", None

        mp3_links = [
            mp3 for mp3 in map(extract_mp3_from_episode, episode_pages) if mp3
        ]

        if not mp3_links:
            return "Aucun fichier MP3 trouvé dans les épisodes.", None

        # Downloads go to a scratch directory that is removed afterwards,
        # so the raw MP3 files do not pile up on disk.
        with tempfile.TemporaryDirectory() as download_dir:
            for i, mp3_url in enumerate(mp3_links, start=1):
                filepath = os.path.join(download_dir, f"{title}_{i:02}.mp3")
                with requests.get(mp3_url, stream=True, timeout=timeout) as r:
                    r.raise_for_status()
                    with open(filepath, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)

            # The archive is written to a separate directory: creating it
            # inside the directory being archived can make it include itself.
            output_dir = tempfile.mkdtemp()
            zip_path = shutil.make_archive(
                os.path.join(output_dir, title), 'zip', download_dir
            )

        return f"{len(mp3_links)} épisode(s) téléchargé(s).", zip_path

    except Exception as e:
        # UI boundary: report any failure as a message instead of crashing.
        return f"Erreur : {str(e)}", None
|
108 |
|
109 |
# === GRADIO INTERFACE ===
# NOTE(review): widget nesting was reconstructed from an unindented diff;
# confirm which components belong inside the row.
with gr.Blocks() as app:
    gr.Markdown("# Téléchargeur de Podcasts MP3 - France Culture")

    with gr.Row():
        url_input = gr.Textbox(
            label="URL de la série",
            placeholder="https://www.radiofrance.fr/...",
        )
        download_button = gr.Button("Télécharger et compresser")

    output_text = gr.Textbox(label="Message")
    file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])

    def process(url):
        """Run the download for ``url`` and return ``(message, zip_file)``."""
        return download_and_zip_podcast_series(url)

    download_button.click(
        fn=process,
        inputs=[url_input],
        outputs=[output_text, file_output],
    )

# === LAUNCH ===
if __name__ == "__main__":
    app.launch(share=True)
|