Update legal.py
legal.py
CHANGED
@@ -1,6 +1,5 @@
 from flask import Flask, request, jsonify, send_from_directory
 import speech_recognition as sr
-import threading
 import datetime
 import pyttsx3
 from langdetect import detect
@@ -16,18 +15,15 @@ from pydub import AudioSegment
 import os
 from werkzeug.utils import secure_filename
 import tempfile
+from dotenv import load_dotenv # Ensure dotenv is imported for .env loading
 
 app = Flask(__name__, static_folder='.') # Serve static files from the current directory
 
 # Load Hugging Face API key from environment variable
-
+load_dotenv() # Load environment variables from .env file
+hf_token = os.environ.get("API_KEY")
 if not hf_token:
-
-    from dotenv import load_dotenv
-    load_dotenv()
-    hf_token = os.environ.get("API_KEY")
-    if not hf_token:
-        raise ValueError("Hugging Face API key not found. Please set 'API_KEY' as an environment variable or in a .env file.")
+    raise ValueError("Hugging Face API key not found. Please set 'API_KEY' as an environment variable or in a .env file.")
 
 login(token=hf_token)
 
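
Note on the token setup this hunk assumes (the .env layout and the source of login() are not shown in the diff and are assumptions here): load_dotenv() looks for a .env file in the current directory and its parents and merges its entries into os.environ without overriding variables that are already set, so the key must be stored under the exact name API_KEY. A minimal sketch:

# .env (assumed to sit next to legal.py; the value is a placeholder)
# API_KEY=hf_your_token_here

import os
from dotenv import load_dotenv
from huggingface_hub import login  # assumed source of the login() used in the hunk

load_dotenv()                          # merge .env entries into os.environ
hf_token = os.environ.get("API_KEY")   # same variable name the new code uses
if not hf_token:
    raise ValueError("API_KEY is missing from both the environment and .env")
login(token=hf_token)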
@@ -44,12 +40,19 @@ summarizer_pipeline = pipeline("summarization", model=summarizer_model, tokenize
 embed_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
 
 # Load both datasets
-
-
-
-
-
-
+try:
+    df_parquet = pd.read_parquet("ibtehaj dataset.parquet")
+    corpus_parquet = df_parquet["text"].dropna().tolist()
+except FileNotFoundError:
+    raise FileNotFoundError("ibtehaj dataset.parquet not found. Make sure it's in the same directory as app.py")
+
+try:
+    with open("pdf_data.json", "r", encoding="utf-8") as f:
+        json_data = json.load(f)
+except FileNotFoundError:
+    raise FileNotFoundError("pdf_data.json not found. Make sure it's in the same directory as app.py")
+except json.JSONDecodeError as e:
+    raise ValueError(f"Error decoding pdf_data.json: {e}")
 
 # Extract text from JSON
 corpus_json = []
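
The two corpus files are referenced only by name, so a sketch of the shapes this loading code expects may help: a Parquet file exposing a "text" column, and a JSON document that json.load can parse (the loop over json_data appears in the next hunk). File names and field values below are made up for illustration:

import json
import pandas as pd

# Parquet side: anything with a "text" column satisfies
# df_parquet["text"].dropna().tolist() in the hunk above.
pd.DataFrame({"text": ["Article 1 ...", None, "Article 2 ..."]}).to_parquet("sample.parquet")
corpus_parquet = pd.read_parquet("sample.parquet")["text"].dropna().tolist()

# JSON side: a list of entries; the "for entry in json_data" loop in the
# next hunk decides which field of each entry holds the text.
with open("sample.json", "w", encoding="utf-8") as f:
    json.dump([{"text": "Clause A ..."}, {"text": "Clause B ..."}], f)
with open("sample.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)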
@@ -63,6 +66,7 @@ for entry in json_data:
 corpus = corpus_parquet + corpus_json
 
 # Compute embeddings
+# This can take a while. Consider pre-computing and saving the index if corpus is large.
 embeddings = embed_model.encode(corpus, show_progress_bar=True, batch_size=16)
 
 # Build FAISS index
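
The new comment suggests pre-computing and saving the index when the corpus is large. One way to do that, reusing the embed_model and corpus defined earlier in legal.py, is to cache the FAISS index and the embedding matrix on disk and rebuild only when the cache is missing; the file names and the IndexFlatL2 choice are assumptions for illustration:

import os
import faiss
import numpy as np

INDEX_PATH = "legal_corpus.faiss"    # assumed cache file names
EMB_PATH = "legal_corpus_emb.npy"

if os.path.exists(INDEX_PATH) and os.path.exists(EMB_PATH):
    # Reuse the index from a previous run instead of re-encoding the corpus.
    index = faiss.read_index(INDEX_PATH)
    embeddings = np.load(EMB_PATH)
else:
    embeddings = embed_model.encode(corpus, show_progress_bar=True, batch_size=16)
    embeddings = np.asarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])  # flat L2 index over the embedding dim
    index.add(embeddings)
    faiss.write_index(index, INDEX_PATH)
    np.save(EMB_PATH, embeddings)

The obvious caveat is staleness: the cache has to be deleted (or keyed on a corpus hash) whenever either dataset changes.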
@@ -107,7 +111,7 @@ def init_tts_engine():
             tts_engine.setProperty('voice', v.id)
             break
 
-init_tts_engine()
+init_tts_engine() # Initialize TTS engine once on startup
 
 # Global variables for managing state (simplify for web context)
 conversation_history = []
@@ -120,8 +124,12 @@ def serve_index():
 
 @app.route('/<path:path>')
 def serve_static_files(path):
+    # This route serves static files like CSS, JS, and images
+    # It must be specific to paths that exist as files, otherwise it might catch API calls
+    # For now, it's fine, but in complex apps, static files are often served by Nginx/Apache.
     return send_from_directory('.', path)
 
+
 @app.route('/answer', methods=['POST'])
 def generate_answer_endpoint():
     global last_question_text, last_answer_text, conversation_history
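
The new comments warn that the catch-all /<path:path> route could shadow API paths and that production setups usually hand static files to Nginx/Apache. A defensive variant, not part of this commit, is to serve only paths that really exist as files and return 404 for everything else:

import os
from flask import abort, send_from_directory

@app.route('/<path:path>')
def serve_static_files(path):
    # Serve only paths that exist as real files under the app directory;
    # anything else gets a 404 instead of falling into the catch-all.
    full_path = os.path.join(app.static_folder, path)
    if not os.path.isfile(full_path):
        abort(404)
    return send_from_directory(app.static_folder, path)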
@@ -143,6 +151,8 @@ def generate_answer_endpoint():
 
 @app.route('/read-aloud', methods=['POST'])
 def read_aloud_endpoint():
+    # This endpoint is generally not needed if client-side SpeechSynthesis API is used.
+    # Keeping it for completeness if server-side TTS is desired.
     data = request.get_json()
     text_to_read = data.get('text', '').strip()
 
@@ -157,20 +167,13 @@ def read_aloud_endpoint():
         tts_engine.save_to_file(text_to_read, temp_audio_path)
         tts_engine.runAndWait()
 
-        # You would typically serve this file or stream it.
-        # For
-        # In a real app, you might use Flask's send_file for audio playback.
-        # For now, let's just return success.
-        # This approach is suitable if the browser requests the audio file directly after this.
-        # For direct playback, you might stream it or serve it immediately.
-        # For web, it's more common to have the frontend's SpeechSynthesis API handle this.
-        # The frontend `readAloud` function already does this.
-        # So, this endpoint might not be strictly necessary unless for server-side TTS.
+        # You would typically serve this file or stream it for client playback.
+        # For this setup, we'll confirm generation. The frontend handles playback.
         return jsonify({"status": "TTS audio generated (server-side)."})
     except Exception as e:
         return jsonify({"status": f"Error during TTS: {str(e)}"}), 500
     finally:
-        if os.path.exists(temp_audio_path):
+        if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path):
             os.remove(temp_audio_path)
 
 
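
The deleted comments mentioned Flask's send_file; the endpoint that remains only confirms the WAV was generated and then removes it. If server-side audio were actually wanted, a sketch along these lines could return the file for the browser to play (the /read-aloud-audio path is invented here and temp-file cleanup is left out for brevity; this is not part of the commit):

import tempfile
import pyttsx3
from flask import jsonify, request, send_file

@app.route('/read-aloud-audio', methods=['POST'])
def read_aloud_audio():
    # Hypothetical variant: synthesize to a temp WAV and return it so the
    # client can play the response body directly.
    text = (request.get_json() or {}).get('text', '').strip()
    if not text:
        return jsonify({"status": "No text provided."}), 400
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    engine = pyttsx3.init()
    engine.save_to_file(text, tmp.name)
    engine.runAndWait()
    return send_file(tmp.name, mimetype="audio/wav")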
@@ -186,38 +189,45 @@ def upload_mp3_endpoint():
     if file:
         filename = secure_filename(file.filename)
         # Create a temporary directory to save the uploaded file and its WAV conversion
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Ensure that the temp directory is managed for cleanup.
+        try:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                mp3_path = os.path.join(tmpdir, filename)
+                file.save(mp3_path)
+
+                wav_path = os.path.join(tmpdir, filename.replace(".mp3", ".wav"))
+                try:
+                    sound = AudioSegment.from_mp3(mp3_path)
+                    sound.export(wav_path, format="wav")
+                except Exception as e:
+                    # Catch pydub/ffmpeg related errors
+                    return jsonify({"message": f"Error converting MP3 to WAV. Ensure FFmpeg is installed and in your system's PATH. Details: {e}"}), 500
+
+                try:
+                    recognizer = sr.Recognizer()
+                    with sr.AudioFile(wav_path) as src:
+                        audio = recognizer.record(src)
+                    text = recognizer.recognize_google(audio)
+                except sr.UnknownValueError:
+                    return jsonify({"message": "Speech not understood. Please try again."}), 400
+                except sr.RequestError as e:
+                    return jsonify({"message": f"Could not request results from speech recognition service; {e}"}), 500
+                except Exception as e: # Catch any other unexpected SR errors
+                    return jsonify({"message": f"An unexpected error occurred during speech recognition: {e}"}), 500
+
+
+                # For web, you don't typically "save that file in .txt format and asks the user where to store that" server-side.
+                # The transcription is returned to the client. The client can then decide to save it.
+                return jsonify({
+                    "message": "MP3 transcribed successfully.",
+                    "transcription": text
+                })
+        except Exception as e:
+            # Catch any errors related to temporary directory creation or file saving
+            return jsonify({"message": f"An error occurred during file upload or temporary processing: {e}"}), 500
+        # This point should not be reached if 'if file' condition is handled.
+        return jsonify({"message": "An unknown file processing error occurred."}), 500
+
 
 @app.route('/summarize', methods=['POST'])
 def summarize_endpoint():
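
Since the transcription is now returned in the JSON response instead of being written to a .txt file server-side, the client decides what to do with it. A quick Python check of the endpoint (the /upload-mp3 path and the "file" form field are assumptions; the @app.route decorator sits above this hunk and is not shown):

import requests

# Hypothetical client; adjust the URL, route, and field name to match the
# actual @app.route decorator in legal.py.
with open("question.mp3", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:5000/upload-mp3",
        files={"file": ("question.mp3", f, "audio/mpeg")},
    )

body = resp.json()
print(resp.status_code, body.get("transcription") or body.get("message"))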