import os
import tempfile
import gradio as gr
import re
import sys
import shutil
import importlib.util
def check_requirements():
    """Check if all required packages are installed and return status"""
    requirements_status = []
    packages = [
        ('gradio', 'gradio'),
        ('yt-dlp', 'yt_dlp'),
        ('openai-whisper', 'whisper'),
        ('torch', 'torch'),
        ('torchaudio', 'torchaudio'),
        ('numpy', 'numpy'),
        ('regex', 'regex'),
    ]
    for package_name, import_name in packages:
        try:
            spec = importlib.util.find_spec(import_name)
            if spec is None:
                requirements_status.append(f"❌ {package_name}: Not found")
                continue
            module = importlib.import_module(import_name)
            version = getattr(module, '__version__', 'Unknown version')
            requirements_status.append(f"✅ {package_name}: {version}")
        except ImportError as e:
            requirements_status.append(f"❌ {package_name}: Import error - {str(e)}")
        except Exception as e:
            requirements_status.append(f"⚠️ {package_name}: Found but error - {str(e)}")
    # Add Python info
    requirements_status.append(f"\n🐍 Python: {sys.version}")
    requirements_status.append(f"📍 Python executable: {sys.executable}")
    return "\n".join(requirements_status)
# Try to import required packages with error handling
try:
    from yt_dlp import YoutubeDL
    YT_DLP_AVAILABLE = True
except ImportError as e:
    YT_DLP_AVAILABLE = False
    print(f"yt-dlp import error: {e}")
# Try multiple whisper import methods
WHISPER_AVAILABLE = False
WHISPER_TYPE = None
try:
    import whisper
    WHISPER_AVAILABLE = True
    WHISPER_TYPE = "openai-whisper"
    print("Using OpenAI Whisper")
except ImportError as e:
    print(f"OpenAI Whisper import error: {e}")
    try:
        from transformers import pipeline
        WHISPER_AVAILABLE = True
        WHISPER_TYPE = "transformers"
        print("Using Transformers Whisper")
    except ImportError as e2:
        print(f"Transformers Whisper import error: {e2}")
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")
print(f"yt-dlp available: {YT_DLP_AVAILABLE}")
print(f"whisper available: {WHISPER_AVAILABLE} (type: {WHISPER_TYPE})")
# Additional diagnostics
if YT_DLP_AVAILABLE:
    try:
        import yt_dlp
        print(f"yt-dlp version: {yt_dlp.version.__version__}")
    except Exception:
        pass
if WHISPER_AVAILABLE and WHISPER_TYPE == "openai-whisper":
    try:
        import whisper
        print(f"whisper version: {whisper.__version__}")
    except Exception:
        pass
def download_audio(url, cookies_file_path=None):
    """Download audio from YouTube URL and return the file path"""
    if not YT_DLP_AVAILABLE:
        raise Exception("yt-dlp is not available. Please check the installation.")
    try:
        # Create a temporary directory for downloads
        temp_dir = tempfile.mkdtemp()
        output_path = os.path.join(temp_dir, "audio")
        # Base yt-dlp options
        ydl_opts = {
            'format': 'bestaudio[ext=m4a]/bestaudio/best',
            'outtmpl': output_path + '.%(ext)s',
            'quiet': True,
            'no_warnings': True,
            'force_ipv4': True,
            'referer': 'https://www.youtube.com/',
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'extractor_retries': 3,
            'fragment_retries': 3,
            'retry_sleep_functions': {'http': lambda n: 2 ** n},
        }
        # Add cookies file if provided
        if cookies_file_path and os.path.exists(cookies_file_path):
            print(f"✅ Using cookies file: {cookies_file_path}")
            ydl_opts['cookiefile'] = cookies_file_path
        else:
            print("⚠️ No valid cookies file provided; likely to hit 403 Forbidden.")
        # Extra headers to mimic a real browser
        ydl_opts['http_headers'] = {
            'User-Agent': ydl_opts['user_agent'],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://www.youtube.com/',
        }
        print(f"🔧 yt-dlp options:\n{ydl_opts}")
        with YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            filename = ydl.prepare_filename(info_dict)  # not used directly; the real extension is resolved below
        # Search for the downloaded audio file
        for ext in ['.m4a', '.webm', '.mp4', '.mp3']:
            potential_file = output_path + ext
            if os.path.exists(potential_file):
                print(f"✅ Audio file downloaded: {potential_file}")
                return potential_file
        raise FileNotFoundError("Downloaded audio file not found.")
    except Exception as e:
        import traceback
        traceback.print_exc()  # For debugging
        if "403" in str(e) or "Forbidden" in str(e):
            raise Exception(f"YouTube blocked the request (403 Forbidden). Please upload a valid cookies.txt file. Original error: {str(e)}")
        else:
            raise Exception(f"Failed to download audio: {str(e)}")
def transcribe_audio(file_path):
    """Transcribe audio file using Whisper"""
    if not WHISPER_AVAILABLE:
        raise Exception("Whisper is not available. Please install it using: pip install openai-whisper (or pip install transformers)")
    try:
        if WHISPER_TYPE == "openai-whisper":
            # Use OpenAI Whisper
            model = whisper.load_model("tiny")
            result = model.transcribe(file_path)
            return result["text"]
        elif WHISPER_TYPE == "transformers":
            # Use Transformers Whisper
            from transformers import pipeline
            transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
            result = transcriber(file_path)
            return result["text"]
        else:
            raise Exception("No compatible Whisper installation found")
    except Exception as e:
        raise Exception(f"Failed to transcribe audio: {str(e)}")
def extract_stock_info_simple(text):
    """Extract stock information using simple pattern matching"""
    try:
        stock_info = []
        # Simple patterns to look for stock-related information (currently unused; kept for reference)
        stock_patterns = [
            r'\b[A-Z]{1,5}\b(?:\s+stock|\s+shares|\s+symbol)',  # Stock symbols
            r'(?:buy|sell|target|price)\s+[A-Z]{1,5}',
            r'\$\d+(?:\.\d{2})?',  # Dollar amounts
            r'\b(?:bullish|bearish|buy|sell|hold)\b',
        ]
        # Look for company names and stock mentions
        companies = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s+(?:Inc|Corp|Company|Ltd)\.?)?', text)
        symbols = re.findall(r'\b[A-Z]{2,5}\b', text)
        prices = re.findall(r'\$\d+(?:\.\d{2})?', text)
        actions = re.findall(r'\b(?:buy|sell|hold|bullish|bearish|target|stop\s+loss)\b', text, re.IGNORECASE)
        # Format the extracted information
        result = "=== EXTRACTED STOCK INFORMATION ===\n\n"
        if companies:
            result += f"🏢 Mentioned Companies: {', '.join(set(companies[:10]))}\n\n"
        if symbols:
            result += f"🔤 Potential Stock Symbols: {', '.join(set(symbols[:10]))}\n\n"
        if prices:
            result += f"💰 Price Mentions: {', '.join(set(prices[:10]))}\n\n"
        if actions:
            result += f"📊 Trading Actions: {', '.join(set(actions[:10]))}\n\n"
        # Look for specific recommendation patterns
        recommendations = []
        sentences = text.split('.')
        for sentence in sentences:
            if any(word in sentence.lower() for word in ['buy', 'sell', 'target', 'recommendation']):
                if any(symbol in sentence for symbol in symbols[:5]):
                    recommendations.append(sentence.strip())
        if recommendations:
            result += "🎯 Potential Recommendations:\n"
            for rec in recommendations[:5]:
                result += f"• {rec}\n"
        if not any([companies, symbols, prices, actions]):
            result += "⚠️ No clear stock recommendations found in the transcript.\n"
            result += "This might be because:\n"
            result += "• The video doesn't contain stock recommendations\n"
            result += "• The audio quality was poor\n"
            result += "• The content is not in English\n"
        return result
    except Exception as e:
        return f"Error extracting stock info: {str(e)}"
def cleanup_file(file_path):
    """Clean up temporary files"""
    try:
        if file_path and os.path.exists(file_path):
            os.remove(file_path)
            # Also try to remove the directory if it's empty
            try:
                os.rmdir(os.path.dirname(file_path))
            except OSError:
                pass
    except Exception:
        pass
def process_cookies_file(cookies_file):
    """Process uploaded cookies file and return the path"""
    if cookies_file is None:
        return None
    try:
        # Create a temporary file for cookies (mkstemp avoids the race condition of the deprecated mktemp)
        fd, temp_cookies_path = tempfile.mkstemp(suffix='.txt')
        os.close(fd)
        # Copy the uploaded file to the temp location
        shutil.copy2(cookies_file.name, temp_cookies_path)
        return temp_cookies_path
    except Exception as e:
        print(f"Error processing cookies file: {e}")
        return None
def process_video(url, cookies_file, progress=gr.Progress()):
    """Main function to process YouTube video"""
    # Check if required packages are available
    if not YT_DLP_AVAILABLE:
        return "Error: yt-dlp is not installed properly. Please install it using: pip install yt-dlp", "", "❌ Error: Missing yt-dlp"
    if not WHISPER_AVAILABLE:
        return "Error: OpenAI Whisper is not installed properly. Please install it using: pip install openai-whisper", "", "❌ Error: Missing Whisper"
    if not url or not url.strip():
        return "Please provide a valid YouTube URL", "", "❌ Error: Invalid URL"
    audio_path = None
    cookies_temp_path = None
    try:
        # Validate URL
        if not any(domain in url.lower() for domain in ['youtube.com', 'youtu.be']):
            return "Please provide a valid YouTube URL", "", "❌ Error: Invalid URL"
        # Process cookies file if provided
        progress(0.05, desc="Processing cookies...")
        cookies_temp_path = process_cookies_file(cookies_file)
        status_msg = "✅ Cookies loaded" if cookies_temp_path else "⚠️ No cookies (may encounter bot detection)"
        # Download audio
        progress(0.2, desc="Downloading audio...")
        audio_path = download_audio(url, cookies_temp_path)
        # Transcribe audio
        progress(0.6, desc="Transcribing audio...")
        transcript = transcribe_audio(audio_path)
        if not transcript.strip():
            return "No speech detected in the video", "", "❌ No speech detected"
        # Extract stock information
        progress(0.9, desc="Extracting stock information...")
        stock_details = extract_stock_info_simple(transcript)
        progress(1.0, desc="Complete!")
        return transcript, stock_details, "✅ Processing completed successfully"
    except Exception as e:
        error_msg = f"Error processing video: {str(e)}"
        return error_msg, "", f"❌ Error: {str(e)}"
    finally:
        # Clean up temporary files
        cleanup_file(audio_path)
        cleanup_file(cookies_temp_path)
# Create Gradio interface
with gr.Blocks(
    title="Stock Recommendation Extractor",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1400px;
        margin: auto;
    }
    .status-box {
        padding: 10px;
        border-radius: 5px;
        margin: 10px 0;
    }
    """
) as demo:
    gr.Markdown("""
    # 📈 Stock Recommendation Extractor from YouTube
    Extract stock recommendations and trading information from YouTube videos using AI transcription.
    **How it works:**
    1. Upload your cookies.txt file (optional but recommended to avoid bot detection)
    2. Paste the YouTube video URL
    3. The app downloads the audio from the video
    4. The audio is transcribed with OpenAI Whisper
    5. Stock-related information is extracted from the transcript (see the sketch below)
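    Under the hood, the UI simply chains the helper functions defined in this file. A minimal sketch (illustrative only; the real `process_video` flow adds progress updates, error handling, and cleanup):
    ```python
    audio_path = download_audio("https://www.youtube.com/watch?v=...", "cookies.txt")  # cookies.txt is optional
    transcript = transcribe_audio(audio_path)
    print(extract_stock_info_simple(transcript))
    cleanup_file(audio_path)
    ```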
    **⚠️ Disclaimer:** This is for educational purposes only. Always do your own research!
    """)
    with gr.Row():
        with gr.Column(scale=1):
            # Requirements check button
            gr.Markdown("### 🔍 System Check")
            check_req_btn = gr.Button(
                "Check Requirements",
                variant="secondary",
                size="sm"
            )
            requirements_output = gr.Textbox(
                label="📋 Requirements Status",
                lines=10,
                interactive=False,
                visible=False
            )
            # Cookies file upload
            cookies_input = gr.File(
                label="🍪 Upload Cookies File (cookies.txt)",
                file_types=[".txt"],
                file_count="single"
            )
            gr.Markdown("""
            **How to get cookies.txt to fix 403 Forbidden errors:**
            1. Install a browser extension such as "Get cookies.txt LOCALLY"
            2. Visit YouTube in your browser (while logged in)
            3. Click the extension icon and export cookies for youtube.com
            4. Upload the downloaded cookies.txt file here
            **Alternative extensions:**
            - "cookies.txt" (Chrome/Firefox)
            - "Export Cookies" (Chrome)
            ⚠️ **Important**: Without cookies, you'll likely get 403 Forbidden errors. A sample of the expected file format is shown below.
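            yt-dlp expects the Netscape cookie file format: a header line followed by one tab-separated line per cookie (domain, include-subdomains flag, path, secure flag, expiry, name, value). An illustrative sample with placeholder values, not real cookies:
            ```
            # Netscape HTTP Cookie File
            .youtube.com	TRUE	/	TRUE	1767225600	PREF	hl=en
            ```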
""") | |
            url_input = gr.Textbox(
                label="📺 YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                lines=2
            )
            process_btn = gr.Button(
                "🚀 Extract Stock Information",
                variant="primary",
                size="lg"
            )
            # Status display
            status_output = gr.Textbox(
                label="📊 Status",
                lines=1,
                interactive=False
            )
            gr.Markdown("""
            ### 💡 Tips:
            - **MUST upload cookies.txt** to avoid 403 Forbidden errors
            - Works best with financial YouTube channels
            - Ensure the video has clear audio
            - English content works best
            - Try shorter videos first (under 10 minutes)
            """)
    with gr.Row():
        with gr.Column():
            transcript_output = gr.Textbox(
                label="📝 Full Transcript",
                lines=15,
                max_lines=20,
                show_copy_button=True
            )
        with gr.Column():
            stock_info_output = gr.Textbox(
                label="📈 Extracted Stock Information",
                lines=15,
                max_lines=20,
                show_copy_button=True
            )
    # Event handlers
    def show_requirements():
        status = check_requirements()
        return gr.update(value=status, visible=True)
    check_req_btn.click(
        fn=show_requirements,
        outputs=[requirements_output]
    )
    process_btn.click(
        fn=process_video,
        inputs=[url_input, cookies_input],
        outputs=[transcript_output, stock_info_output, status_output],
        show_progress=True
    )
    # Example section
    gr.Markdown("### 📋 Example URLs (Replace with actual financial videos)")
    gr.Examples(
        examples=[
            ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
        ],
        inputs=[url_input],
        label="Click to try example"
    )
    gr.Markdown("""
    ### 🔧 Installation & Troubleshooting:
    **Step 1: Click the "Check Requirements" button above to see what's missing**
    **If you get a "Whisper Missing" error:**
    ```bash
    pip install openai-whisper
    ```
    **If you get a "yt-dlp Missing" error:**
    ```bash
    pip install yt-dlp
    ```
    **Install all requirements at once:**
    ```bash
    pip install gradio==4.44.0 yt-dlp==2023.12.30 openai-whisper==20231117 torch==2.1.0 torchaudio==2.1.0 numpy==1.24.3 regex==2023.8.8
    ```
    **Alternative Whisper installation (Transformers backend):**
    ```bash
    pip install transformers torch torchaudio
    ```
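    If you go the Transformers route, the app's fallback path boils down to the snippet below (illustrative sketch; `audio.m4a` is a placeholder file name):
    ```python
    from transformers import pipeline

    # Same model the app uses in its Transformers fallback
    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
    print(transcriber("audio.m4a")["text"])
    ```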
    **If using a virtual environment:**
    ```bash
    # Create and activate a virtual environment first
    python -m venv myenv
    # Windows: myenv\\Scripts\\activate
    # Mac/Linux: source myenv/bin/activate
    # Then install the packages
    pip install -r requirements.txt
    ```
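    A requirements.txt consistent with the pins listed above would look like this (illustrative; adjust versions as needed):
    ```
    gradio==4.44.0
    yt-dlp==2023.12.30
    openai-whisper==20231117
    torch==2.1.0
    torchaudio==2.1.0
    numpy==1.24.3
    regex==2023.8.8
    ```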
    **Other Issues:**
    - **Bot Detection Error**: Upload your cookies.txt file
    - **No Audio Found**: Check whether the video has an audio track
    - **Transcription Failed**: The video might be too long or the audio quality too poor
    - **No Stock Info**: The video might not contain financial content
    """)
if __name__ == "__main__":
    demo.launch()