Ribot committed
Commit 1f317db · verified · 1 Parent(s): 11fd592

Update app.py

Files changed (1)
  1. app.py +57 -21
app.py CHANGED
@@ -32,29 +32,65 @@ def slugify(text, max_length=50):
     text = re.sub(r'[-\s]+', '_', text)
     return text[:max_length].strip('_')
 
-def extract_mp3_links_and_title(url):
-    response = requests.get(url)
+def get_episode_links(main_url):
+    """Collect all episode page URLs from the main series page."""
+    response = requests.get(main_url)
     response.raise_for_status()
     soup = BeautifulSoup(response.text, 'html.parser')
+    links = []
+
+    for a in soup.find_all('a', href=True):
+        href = a['href']
+        if "/podcasts/" in href and href != main_url:
+            full_url = href if href.startswith("http") else f"https://www.radiofrance.fr{href}"
+            if full_url not in links:
+                links.append(full_url)
+
+    return list(dict.fromkeys(links))  # deduplicate
+
+def extract_mp3_from_episode(url):
+    """Extract the MP3 link from an episode page."""
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        audio_tag = soup.find("audio")
+        if audio_tag and audio_tag.get("src", "").endswith(".mp3"):
+            return audio_tag["src"]
+    except Exception:
+        pass
+    return None
+
+def get_podcast_title(url):
+    """Extract the overall podcast title."""
+    try:
+        response = requests.get(url)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        title_tag = soup.find('h1') or soup.find('title')
+        return slugify(title_tag.get_text()) if title_tag else "podcast"
+    except Exception:
+        return "podcast"
+
+def download_and_zip_podcast_series(main_url):
+    try:
+        title = get_podcast_title(main_url)
+        episode_pages = get_episode_links(main_url)
 
-    # Podcast title
-    title_tag = soup.find('h1') or soup.find('title')
-    podcast_title = slugify(title_tag.get_text()) if title_tag else "podcast"
-
-    # MP3 links
-    mp3_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.mp3')]
+        if not episode_pages:
+            return "Aucune page d’épisode trouvée.", None
 
-    return podcast_title, mp3_links
+        mp3_links = []
+        for ep_url in episode_pages:
+            mp3 = extract_mp3_from_episode(ep_url)
+            if mp3:
+                mp3_links.append(mp3)
 
-def download_and_zip_podcast(url):
-    try:
-        podcast_title, mp3_links = extract_mp3_links_and_title(url)
         if not mp3_links:
-            return "Aucun fichier MP3 trouvé.", None
+            return "Aucun fichier MP3 trouvé dans les épisodes.", None
 
         temp_dir = tempfile.mkdtemp()
         for i, mp3_url in enumerate(mp3_links, start=1):
-            filename = f"{podcast_title}_{i:02}.mp3"
+            filename = f"{title}_{i:02}.mp3"
             filepath = os.path.join(temp_dir, filename)
             with requests.get(mp3_url, stream=True) as r:
                 r.raise_for_status()
@@ -62,29 +98,29 @@ def download_and_zip_podcast(url):
                 for chunk in r.iter_content(chunk_size=8192):
                     f.write(chunk)
 
-        zip_path = os.path.join(temp_dir, f"{podcast_title}.zip")
+        zip_path = os.path.join(temp_dir, f"{title}.zip")
         shutil.make_archive(zip_path.replace('.zip', ''), 'zip', temp_dir)
 
-        return f"{len(mp3_links)} fichiers téléchargés avec succès.", zip_path
+        return f"{len(mp3_links)} épisode(s) téléchargé(s).", zip_path
 
     except Exception as e:
         return f"Erreur : {str(e)}", None
 
 # === GRADIO INTERFACE ===
 with gr.Blocks() as app:
-    gr.Markdown("# Téléchargeur de Podcasts MP3")
+    gr.Markdown("# Téléchargeur de Podcasts MP3 - France Culture")
     with gr.Row():
-        url_input = gr.Textbox(label="URL de la page série", placeholder="https://www.radiofrance.fr/...")
+        url_input = gr.Textbox(label="URL de la série", placeholder="https://www.radiofrance.fr/...")
         download_button = gr.Button("Télécharger et compresser")
     output_text = gr.Textbox(label="Message")
     file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])
 
     def process(url):
-        message, zip_file = download_and_zip_podcast(url)
+        message, zip_file = download_and_zip_podcast_series(url)
         return message, zip_file
 
     download_button.click(fn=process, inputs=[url_input], outputs=[output_text, file_output])
 
-# === LOCAL OR HUGGING FACE LAUNCH ===
+# === LAUNCH ===
 if __name__ == "__main__":
-    app.launch(share=True)  # `share=True` is useful on Hugging Face
+    app.launch(share=True)
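
The helpers added in this commit can also be exercised outside the Gradio UI with a short driver script. Below is a minimal sketch, assuming the script sits next to app.py so the functions import cleanly, and that the series URL is a placeholder to be swapped for a real Radio France series page; the file name try_scraper.py is hypothetical, not part of the commit.

# try_scraper.py: hypothetical driver for manually exercising the helpers in app.py.
# Assumes it lives in the same folder as app.py; the series URL below is a placeholder.
from app import (
    get_episode_links,
    extract_mp3_from_episode,
    download_and_zip_podcast_series,
)

series_url = "https://www.radiofrance.fr/franceculture/podcasts/..."  # placeholder, replace with a real series page

# List the episode pages discovered on the series page.
episodes = get_episode_links(series_url)
print(f"{len(episodes)} episode page(s) found")

# Probe the first few episodes for a direct MP3 URL.
for ep_url in episodes[:3]:
    print(ep_url, "->", extract_mp3_from_episode(ep_url))

# Full run: download every episode found and build the ZIP archive.
message, zip_path = download_and_zip_podcast_series(series_url)
print(message, zip_path)

Importing from app is safe here because app.launch() is guarded by the __main__ check, so the Gradio interface is built but not started.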