Spaces:

Ribot
/

PodMagic

Running

App Files Files Community

Ribot committed 18 days ago

Commit

c0153c3

verified ·

1 Parent(s): 1f317db

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -42

app.py CHANGED Viewed

@@ -10,17 +10,11 @@ import tempfile
 def install_if_missing(package_name, import_name=None):
     import_name = import_name or package_name
     if importlib.util.find_spec(import_name) is None:
-        print(f"Installation de {package_name}...")
         subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
-for package in [
-    ("requests",),
-    ("bs4", "bs4"),
-    ("gradio",),
-]:
     install_if_missing(*package)
-# === IMPORTS ===
 import requests
 import gradio as gr
 from bs4 import BeautifulSoup
@@ -32,61 +26,54 @@ def slugify(text, max_length=50):
     text = re.sub(r'[-\s]+', '_', text)
     return text[:max_length].strip('_')
-def get_episode_links(main_url):
-    """Récupère toutes les URL des pages d’épisodes depuis la page principale."""
     response = requests.get(main_url)
     response.raise_for_status()
     soup = BeautifulSoup(response.text, 'html.parser')
-    links = []
     for a in soup.find_all('a', href=True):
         href = a['href']
-        if "/podcasts/" in href and href != main_url:
             full_url = href if href.startswith("http") else f"https://www.radiofrance.fr{href}"
-            if full_url not in links:
-                links.append(full_url)
-    return list(dict.fromkeys(links))  # dédoublonner
-def extract_mp3_from_episode(url):
-    """Extrait le lien MP3 d’un épisode."""
     try:
-        response = requests.get(url)
         response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-        audio_tag = soup.find("audio")
-        if audio_tag and audio_tag.get("src", "").endswith(".mp3"):
-            return audio_tag["src"]
     except Exception:
-        pass
-    return None
-def get_podcast_title(url):
-    """Extrait le titre général du podcast."""
     try:
-        response = requests.get(url)
         soup = BeautifulSoup(response.text, 'html.parser')
         title_tag = soup.find('h1') or soup.find('title')
         return slugify(title_tag.get_text()) if title_tag else "podcast"
-    except Exception:
         return "podcast"
-def download_and_zip_podcast_series(main_url):
     try:
         title = get_podcast_title(main_url)
-        episode_pages = get_episode_links(main_url)
         if not episode_pages:
             return "Aucune page d’épisode trouvée.", None
         mp3_links = []
-        for ep_url in episode_pages:
-            mp3 = extract_mp3_from_episode(ep_url)
             if mp3:
                 mp3_links.append(mp3)
         if not mp3_links:
-            return "Aucun fichier MP3 trouvé dans les épisodes.", None
         temp_dir = tempfile.mkdtemp()
         for i, mp3_url in enumerate(mp3_links, start=1):
@@ -100,27 +87,24 @@ def download_and_zip_podcast_series(main_url):
         zip_path = os.path.join(temp_dir, f"{title}.zip")
         shutil.make_archive(zip_path.replace('.zip', ''), 'zip', temp_dir)
-        return f"{len(mp3_links)} épisode(s) téléchargé(s).", zip_path
     except Exception as e:
         return f"Erreur : {str(e)}", None
 # === INTERFACE GRADIO ===
 with gr.Blocks() as app:
-    gr.Markdown("# Téléchargeur de Podcasts MP3 - France Culture")
-    with gr.Row():
-        url_input = gr.Textbox(label="URL de la série", placeholder="https://www.radiofrance.fr/...")
     download_button = gr.Button("Télécharger et compresser")
     output_text = gr.Textbox(label="Message")
     file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])
     def process(url):
-        message, zip_file = download_and_zip_podcast_series(url)
-        return message, zip_file
     download_button.click(fn=process, inputs=[url_input], outputs=[output_text, file_output])
-# === LANCEMENT ===
 if __name__ == "__main__":
     app.launch(share=True)

 def install_if_missing(package_name, import_name=None):
     """Install ``package_name`` with pip when ``import_name`` cannot be found.

     Args:
         package_name: Distribution name handed to ``pip install``.
         import_name: Module name probed with ``importlib.util.find_spec``.
             Falls back to ``package_name`` when omitted or empty.

     Raises:
         subprocess.CalledProcessError: If the ``pip install`` command exits
             with a non-zero status.
     """
     # Import the submodule explicitly: a bare ``import importlib`` does not
     # guarantee that ``importlib.util`` has been loaded.
     import importlib.util

     import_name = import_name or package_name
     try:
         spec = importlib.util.find_spec(import_name)
     except ModuleNotFoundError:
         # For a dotted name, find_spec imports the parent first and raises
         # (instead of returning None) when that parent is missing.
         spec = None
     if spec is None:
         subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+for package in [("requests",), ("bs4", "bs4"), ("gradio",)]:
     install_if_missing(*package)
 import requests
 import gradio as gr
 from bs4 import BeautifulSoup
     text = re.sub(r'[-\s]+', '_', text)
     return text[:max_length].strip('_')
def get_episode_pages(main_url):
    """Collect the France Culture podcast page URLs linked from ``main_url``.

    Every ``<a href>`` whose target contains ``/franceculture/podcasts/`` and
    does not end in ``/serie`` is kept and resolved against
    ``https://www.radiofrance.fr``.

    Args:
        main_url: URL of the page to scan for links.

    Returns:
        A list of absolute URLs, de-duplicated, in order of first appearance.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Local import so the block does not depend on the file's import header.
    from urllib.parse import urljoin

    # Without a timeout a stalled server would block this call indefinitely.
    response = requests.get(main_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    episode_urls = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if "/franceculture/podcasts/" in href and not href.endswith('/serie'):
            # urljoin leaves absolute URLs untouched and also resolves
            # protocol-relative ("//host/...") and "../" style hrefs, which
            # plain string concatenation would mangle.
            episode_urls.append(urljoin("https://www.radiofrance.fr", href))

    # dict.fromkeys removes duplicates while preserving document order.
    return list(dict.fromkeys(episode_urls))
def get_mp3_link_with_ithema(page_url):
    """Return the first ``.mp3`` URL containing ``ithema`` found on a page.

    The raw HTML is searched rather than the parsed DOM, so a link is found
    wherever it appears in the markup.

    Args:
        page_url: URL of the page to fetch.

    Returns:
        The first matching URL as a string, or ``None`` when nothing matches
        or when fetching the page fails for any reason.
    """
    try:
        # Without a timeout one stalled page would block the whole run.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        # re.search stops at the first hit; only that one is ever used.
        match = re.search(r'https://[^"]*ithema[^"]*\.mp3', response.text)
        return match.group(0) if match else None
    except Exception:
        # Deliberate best effort: a page that cannot be read is treated as
        # having no MP3 instead of aborting the caller's loop.
        return None
def get_podcast_title(main_url):
    """Derive a slug for the podcast from the page's ``<h1>`` or ``<title>``.

    Args:
        main_url: URL of the podcast page.

    Returns:
        The slugified heading text, or ``"podcast"`` when the page cannot be
        fetched, has neither tag, or the slug comes out empty.
    """
    try:
        # Without a timeout a stalled server would block this call indefinitely.
        response = requests.get(main_url, timeout=30)
        # Do not derive a title from an HTTP error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('h1') or soup.find('title')
        if not title_tag:
            return "podcast"
        # An empty slug would produce an archive named just ".zip".
        return slugify(title_tag.get_text()) or "podcast"
    except Exception:
        # ``except Exception`` (not a bare ``except``) so KeyboardInterrupt
        # and SystemExit still propagate.
        return "podcast"
+def download_and_zip_podcast(main_url):
     try:
         title = get_podcast_title(main_url)
+        episode_pages = get_episode_pages(main_url)
         if not episode_pages:
             return "Aucune page d’épisode trouvée.", None
         mp3_links = []
+        for page in episode_pages:
+            mp3 = get_mp3_link_with_ithema(page)
             if mp3:
                 mp3_links.append(mp3)
         if not mp3_links:
+            return "Aucun fichier MP3 contenant 'ithema' trouvé.", None
         temp_dir = tempfile.mkdtemp()
         for i, mp3_url in enumerate(mp3_links, start=1):
         zip_path = os.path.join(temp_dir, f"{title}.zip")
         shutil.make_archive(zip_path.replace('.zip', ''), 'zip', temp_dir)
+        return f"{len(mp3_links)} fichier(s) téléchargé(s).", zip_path
     except Exception as e:
         return f"Erreur : {str(e)}", None
 # === INTERFACE GRADIO ===
 with gr.Blocks() as app:
     # Page layout, top to bottom: heading, URL field, action button,
     # status message, downloadable archive.
     gr.Markdown("# Téléchargeur de Podcasts MP3 (France Culture)")
     url_input = gr.Textbox(
         placeholder="https://www.radiofrance.fr/...",
         label="URL de la série",
     )
     download_button = gr.Button("Télécharger et compresser")
     output_text = gr.Textbox(label="Message")
     file_output = gr.File(file_types=[".zip"], label="Fichier ZIP")

     def process(url):
         """Run the download for ``url`` and return (message, zip path)."""
         status, archive = download_and_zip_podcast(url)
         return status, archive

     # Wire the button: one input (the URL), two outputs (message, file).
     download_button.click(
         fn=process,
         inputs=[url_input],
         outputs=[output_text, file_output],
     )

 # Start the web UI only when the file is executed as a script.
 if __name__ == "__main__":
     app.launch(share=True)