Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -38,6 +38,7 @@ from moviepy import VideoFileClip
|
|
38 |
import yt_dlp
|
39 |
from youtube_transcript_api import YouTubeTranscriptApi
|
40 |
from urllib.parse import urlparse, parse_qs
|
|
|
41 |
from ratelimit import limits, sleep_and_retry
|
42 |
import time
|
43 |
import fasttext
|
@@ -51,6 +52,7 @@ from PyPDF2 import PdfReader
|
|
51 |
from pptx import Presentation
|
52 |
import trafilatura
|
53 |
from bs4 import BeautifulSoup
|
|
|
54 |
from dotenv import load_dotenv
|
55 |
|
56 |
load_dotenv()
|
@@ -1084,6 +1086,42 @@ def process_document_with_password(file, password: str, doc_processor: DocumentP
|
|
1084 |
st.error(str(e))
|
1085 |
return None
|
1086 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1087 |
def process_web():
|
1088 |
"""Traitement des contenus web"""
|
1089 |
url = st.text_input("URL du site web")
|
@@ -1096,6 +1134,18 @@ def process_web():
|
|
1096 |
auth = {"username": username, "password": password}
|
1097 |
|
1098 |
if url and st.button("Analyser"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1099 |
try:
|
1100 |
doc_processor = DocumentProcessor(
|
1101 |
st.session_state.audio_processor.llm.model_name,
|
|
|
38 |
import yt_dlp
|
39 |
from youtube_transcript_api import YouTubeTranscriptApi
|
40 |
from urllib.parse import urlparse, parse_qs
|
41 |
+
import mimetypes
|
42 |
from ratelimit import limits, sleep_and_retry
|
43 |
import time
|
44 |
import fasttext
|
|
|
52 |
from pptx import Presentation
|
53 |
import trafilatura
|
54 |
from bs4 import BeautifulSoup
|
55 |
+
|
56 |
from dotenv import load_dotenv
|
57 |
|
58 |
load_dotenv()
|
|
|
1086 |
st.error(str(e))
|
1087 |
return None
|
1088 |
|
1089 |
+
|
1090 |
+
|
1091 |
+
|
1092 |
+
def is_text_content(url: str) -> bool:
    """Return True when the URL responds with a textual Content-Type.

    A GET request is issued and the ``Content-Type`` response header is
    inspected; the content is considered textual when the header contains
    ``text``, ``html`` or ``application/json`` (case-insensitive).

    Args:
        url: Absolute URL to probe.

    Returns:
        True if the response advertises a textual content type, False
        otherwise, including when the request fails or the URL is malformed.
    """
    try:
        # NOTE: JavaScript-rendered pages would need Selenium or Playwright;
        # only the HTTP response headers are inspected here.
        # stream=True avoids downloading the body, and the timeout keeps a
        # stalled server from blocking the application indefinitely.
        with requests.get(url, timeout=10, stream=True) as response:
            content_type = response.headers.get('content-type', '').lower()
    except (requests.RequestException, ValueError):
        # Network failures and malformed URLs are treated as "not textual"
        # rather than propagated, so callers get a plain boolean.
        return False

    return any(
        marker in content_type
        for marker in ('text', 'html', 'application/json')
    )
|
1101 |
+
|
1102 |
+
def is_valid_content_url(url: str) -> bool:
    """Check whether the URL is eligible for text-content extraction.

    A URL is rejected when its host is one of the excluded video, image or
    social-media domains (or a subdomain of one), or when its path ends with
    an excluded media/document file extension.

    Args:
        url: URL to check.

    Returns:
        True if the URL may be processed, False if it is excluded.
    """
    parsed = urlparse(url)

    excluded_domains = (
        'youtube.com', 'vimeo.com', 'dailymotion.com',
        'imgur.com', 'flickr.com', 'instagram.com',
        'facebook.com', 'fb.com', 'twitter.com', 'x.com',
        'tiktok.com', 'linkedin.com', 'pinterest.com',
        'snapchat.com', 'reddit.com', 'tumblr.com',
        'whatsapp.com', 'telegram.org', 'discord.com',
    )

    excluded_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.mp4', '.mp3', '.pdf')

    # hostname (unlike netloc) excludes userinfo and port and is lowercased;
    # a trailing dot (fully-qualified form) is dropped before comparing.
    host = (parsed.hostname or '').rstrip('.')
    path = parsed.path.lower()

    # Match the domain exactly or as a dot-delimited parent domain. A plain
    # substring test would wrongly reject unrelated hosts: 'x.com' is a
    # substring of 'dropbox.com' and 'netflix.com'.
    is_excluded_domain = any(
        host == domain or host.endswith('.' + domain)
        for domain in excluded_domains
    )

    # str.endswith accepts a tuple, so one call covers every extension.
    return not (is_excluded_domain or path.endswith(excluded_extensions))
|
1124 |
+
|
1125 |
def process_web():
|
1126 |
"""Traitement des contenus web"""
|
1127 |
url = st.text_input("URL du site web")
|
|
|
1134 |
auth = {"username": username, "password": password}
|
1135 |
|
1136 |
if url and st.button("Analyser"):
|
1137 |
+
if not url.startswith(('http://', 'https://')):
|
1138 |
+
st.error("L'URL doit commencer par 'http://' ou 'https://'")
|
1139 |
+
return
|
1140 |
+
|
1141 |
+
if not is_valid_content_url(url):
|
1142 |
+
st.error(f"Cette URL ({url}) ne peut pas être traitée (vidéo, image ou autre contenu non supporté)")
|
1143 |
+
return
|
1144 |
+
|
1145 |
+
if not is_text_content(url):
|
1146 |
+
st.error(f"Cette URL ({url}) ne contient pas de contenu textuel analysable")
|
1147 |
+
return
|
1148 |
+
|
1149 |
try:
|
1150 |
doc_processor = DocumentProcessor(
|
1151 |
st.session_state.audio_processor.llm.model_name,
|