Spaces:

fizzarif7
/

assistant

Running

App Files Community

fizzarif7 committed on Jun 24

Commit

1285a73

verified ·

1 Parent(s): b5201bc

Update legal.py

Browse files

Files changed (1) hide show

legal.py +67 -57

legal.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from flask import Flask, request, jsonify, send_from_directory
 import speech_recognition as sr
-import threading
 import datetime
 import pyttsx3
 from langdetect import detect
@@ -16,18 +15,15 @@ from pydub import AudioSegment
 import os
 from werkzeug.utils import secure_filename
 import tempfile
 app = Flask(__name__, static_folder='.') # Serve static files from the current directory
 # Load Hugging Face API key from environment variable
-hf_token = os.environ.get("api")
 if not hf_token:
-    # Attempt to load from .env file if not set in environment
-    from dotenv import load_dotenv
-    load_dotenv()
-    hf_token = os.environ.get("API_KEY")
-    if not hf_token:
-        raise ValueError("Hugging Face API key not found. Please set 'API_KEY' as an environment variable or in a .env file.")
 login(token=hf_token)
@@ -44,12 +40,19 @@ summarizer_pipeline = pipeline("summarization", model=summarizer_model, tokenize
 embed_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
 # Load both datasets
-df_parquet = pd.read_parquet("ibtehaj dataset.parquet")
-corpus_parquet = df_parquet["text"].dropna().tolist()
-# Load the JSON dataset
-with open("pdf_data.json", "r", encoding="utf-8") as f:
-    json_data = json.load(f)
 # Extract text from JSON
 corpus_json = []
@@ -63,6 +66,7 @@ for entry in json_data:
 corpus = corpus_parquet + corpus_json
 # Compute embeddings
 embeddings = embed_model.encode(corpus, show_progress_bar=True, batch_size=16)
 # Build FAISS index
@@ -107,7 +111,7 @@ def init_tts_engine():
                 tts_engine.setProperty('voice', v.id)
                 break
-init_tts_engine()
 # Global variables for managing state (simplify for web context)
 conversation_history = []
@@ -120,8 +124,12 @@ def serve_index():
 @app.route('/<path:path>')
 def serve_static_files(path):
     return send_from_directory('.', path)
 @app.route('/answer', methods=['POST'])
 def generate_answer_endpoint():
     global last_question_text, last_answer_text, conversation_history
@@ -143,6 +151,8 @@ def generate_answer_endpoint():
 @app.route('/read-aloud', methods=['POST'])
 def read_aloud_endpoint():
     data = request.get_json()
     text_to_read = data.get('text', '').strip()
@@ -157,20 +167,13 @@ def read_aloud_endpoint():
         tts_engine.save_to_file(text_to_read, temp_audio_path)
         tts_engine.runAndWait()
-        # You would typically serve this file or stream it.
-        # For simplicity, let's just confirm it was generated.
-        # In a real app, you might use Flask's send_file for audio playback.
-        # For now, let's just return success.
-        # This approach is suitable if the browser requests the audio file directly after this.
-        # For direct playback, you might stream it or serve it immediately.
-        # For web, it's more common to have the frontend's SpeechSynthesis API handle this.
-        # The frontend `readAloud` function already does this.
-        # So, this endpoint might not be strictly necessary unless for server-side TTS.
         return jsonify({"status": "TTS audio generated (server-side)."})
     except Exception as e:
         return jsonify({"status": f"Error during TTS: {str(e)}"}), 500
     finally:
-        if os.path.exists(temp_audio_path):
             os.remove(temp_audio_path)
@@ -186,38 +189,45 @@ def upload_mp3_endpoint():
     if file:
         filename = secure_filename(file.filename)
         # Create a temporary directory to save the uploaded file and its WAV conversion
-        with tempfile.TemporaryDirectory() as tmpdir:
-            mp3_path = os.path.join(tmpdir, filename)
-            file.save(mp3_path)
-            wav_path = os.path.join(tmpdir, filename.replace(".mp3", ".wav"))
-            try:
-                sound = AudioSegment.from_mp3(mp3_path)
-                sound.export(wav_path, format="wav")
-            except Exception as e:
-                return jsonify({"message": f"Error converting MP3 to WAV: {e}"}), 500
-            try:
-                recognizer = sr.Recognizer()
-                with sr.AudioFile(wav_path) as src:
-                    audio = recognizer.record(src)
-                    text = recognizer.recognize_google(audio)
-            except sr.UnknownValueError:
-                return jsonify({"message": "Speech not understood."}), 400
-            except sr.RequestError as e:
-                return jsonify({"message": f"Speech recognition service error: {e}"}), 500
-            # Store transcription temporarily (can be handled differently)
-            transcript_path = os.path.join(tmpdir, "transcription.txt")
-            with open(transcript_path, "w", encoding="utf-8") as f:
-                f.write(text)
-            # Option to summarize or generate answer from transcription
-            # For this web integration, we'll return the transcription and let frontend decide
-            return jsonify({
-                "message": "MP3 transcribed successfully.",
-                "transcription": text
-            })
 @app.route('/summarize', methods=['POST'])
 def summarize_endpoint():

 from flask import Flask, request, jsonify, send_from_directory
 import speech_recognition as sr
 import datetime
 import pyttsx3
 from langdetect import detect
 import os
 from werkzeug.utils import secure_filename
 import tempfile
+from dotenv import load_dotenv # Ensure dotenv is imported for .env loading
 app = Flask(__name__, static_folder='.') # Serve static files from the current directory
 # Load Hugging Face API key from environment variable
+load_dotenv() # Load environment variables from .env file
+hf_token = os.environ.get("API_KEY")
 if not hf_token:
+    raise ValueError("Hugging Face API key not found. Please set 'API_KEY' as an environment variable or in a .env file.")
 login(token=hf_token)
 embed_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
 # Load both datasets
+try:
+    df_parquet = pd.read_parquet("ibtehaj dataset.parquet")
+    corpus_parquet = df_parquet["text"].dropna().tolist()
+except FileNotFoundError:
+    raise FileNotFoundError("ibtehaj dataset.parquet not found. Make sure it's in the same directory as app.py")
+try:
+    with open("pdf_data.json", "r", encoding="utf-8") as f:
+        json_data = json.load(f)
+except FileNotFoundError:
+    raise FileNotFoundError("pdf_data.json not found. Make sure it's in the same directory as app.py")
+except json.JSONDecodeError as e:
+    raise ValueError(f"Error decoding pdf_data.json: {e}")
 # Extract text from JSON
 corpus_json = []
 corpus = corpus_parquet + corpus_json
 # Compute embeddings
+# This can take a while. Consider pre-computing and saving the index if corpus is large.
 embeddings = embed_model.encode(corpus, show_progress_bar=True, batch_size=16)
 # Build FAISS index
                 tts_engine.setProperty('voice', v.id)
                 break
+init_tts_engine() # Initialize TTS engine once on startup
 # Global variables for managing state (simplify for web context)
 conversation_history = []
 @app.route('/<path:path>')
 def serve_static_files(path):
+    # This route serves static files like CSS, JS, and images
+    # It must be specific to paths that exist as files, otherwise it might catch API calls
+    # For now, it's fine, but in complex apps, static files are often served by Nginx/Apache.
     return send_from_directory('.', path)
 @app.route('/answer', methods=['POST'])
 def generate_answer_endpoint():
     global last_question_text, last_answer_text, conversation_history
 @app.route('/read-aloud', methods=['POST'])
 def read_aloud_endpoint():
+    # This endpoint is generally not needed if client-side SpeechSynthesis API is used.
+    # Keeping it for completeness if server-side TTS is desired.
     data = request.get_json()
     text_to_read = data.get('text', '').strip()
         tts_engine.save_to_file(text_to_read, temp_audio_path)
         tts_engine.runAndWait()
+        # You would typically serve this file or stream it for client playback.
+        # For this setup, we'll confirm generation. The frontend handles playback.
         return jsonify({"status": "TTS audio generated (server-side)."})
     except Exception as e:
         return jsonify({"status": f"Error during TTS: {str(e)}"}), 500
     finally:
+        if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path):
             os.remove(temp_audio_path)
     if file:
         filename = secure_filename(file.filename)
         # Create a temporary directory to save the uploaded file and its WAV conversion
+        # Ensure that the temp directory is managed for cleanup.
+        try:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                mp3_path = os.path.join(tmpdir, filename)
+                file.save(mp3_path)
+                wav_path = os.path.join(tmpdir, filename.replace(".mp3", ".wav"))
+                try:
+                    sound = AudioSegment.from_mp3(mp3_path)
+                    sound.export(wav_path, format="wav")
+                except Exception as e:
+                    # Catch pydub/ffmpeg related errors
+                    return jsonify({"message": f"Error converting MP3 to WAV. Ensure FFmpeg is installed and in your system's PATH. Details: {e}"}), 500
+                try:
+                    recognizer = sr.Recognizer()
+                    with sr.AudioFile(wav_path) as src:
+                        audio = recognizer.record(src)
+                        text = recognizer.recognize_google(audio)
+                except sr.UnknownValueError:
+                    return jsonify({"message": "Speech not understood. Please try again."}), 400
+                except sr.RequestError as e:
+                    return jsonify({"message": f"Could not request results from speech recognition service; {e}"}), 500
+                except Exception as e: # Catch any other unexpected SR errors
+                    return jsonify({"message": f"An unexpected error occurred during speech recognition: {e}"}), 500
+                # In a web app, the server does not save the transcription to a .txt file or ask the user where to store it.
+                # The transcription is returned to the client, which can then decide whether to save it.
+                return jsonify({
+                    "message": "MP3 transcribed successfully.",
+                    "transcription": text
+                })
+        except Exception as e:
+            # Catch any errors related to temporary directory creation or file saving
+            return jsonify({"message": f"An error occurred during file upload or temporary processing: {e}"}), 500
+    # This point should not be reached if 'if file' condition is handled.
+    return jsonify({"message": "An unknown file processing error occurred."}), 500
 @app.route('/summarize', methods=['POST'])
 def summarize_endpoint():