fizzarif7 committed on
Commit
1285a73
·
verified ·
1 Parent(s): b5201bc

Update legal.py

Browse files
Files changed (1) hide show
  1. legal.py +67 -57
legal.py CHANGED
@@ -1,6 +1,5 @@
1
  from flask import Flask, request, jsonify, send_from_directory
2
  import speech_recognition as sr
3
- import threading
4
  import datetime
5
  import pyttsx3
6
  from langdetect import detect
@@ -16,18 +15,15 @@ from pydub import AudioSegment
16
  import os
17
  from werkzeug.utils import secure_filename
18
  import tempfile
 
19
 
20
  app = Flask(__name__, static_folder='.') # Serve static files from the current directory
21
 
22
  # Load Hugging Face API key from environment variable
23
- hf_token = os.environ.get("api")
 
24
  if not hf_token:
25
- # Attempt to load from .env file if not set in environment
26
- from dotenv import load_dotenv
27
- load_dotenv()
28
- hf_token = os.environ.get("API_KEY")
29
- if not hf_token:
30
- raise ValueError("Hugging Face API key not found. Please set 'API_KEY' as an environment variable or in a .env file.")
31
 
32
  login(token=hf_token)
33
 
@@ -44,12 +40,19 @@ summarizer_pipeline = pipeline("summarization", model=summarizer_model, tokenize
44
  embed_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
45
 
46
  # Load both datasets
47
- df_parquet = pd.read_parquet("ibtehaj dataset.parquet")
48
- corpus_parquet = df_parquet["text"].dropna().tolist()
49
-
50
- # Load the JSON dataset
51
- with open("pdf_data.json", "r", encoding="utf-8") as f:
52
- json_data = json.load(f)
 
 
 
 
 
 
 
53
 
54
  # Extract text from JSON
55
  corpus_json = []
@@ -63,6 +66,7 @@ for entry in json_data:
63
  corpus = corpus_parquet + corpus_json
64
 
65
  # Compute embeddings
 
66
  embeddings = embed_model.encode(corpus, show_progress_bar=True, batch_size=16)
67
 
68
  # Build FAISS index
@@ -107,7 +111,7 @@ def init_tts_engine():
107
  tts_engine.setProperty('voice', v.id)
108
  break
109
 
110
- init_tts_engine()
111
 
112
  # Global variables for managing state (simplify for web context)
113
  conversation_history = []
@@ -120,8 +124,12 @@ def serve_index():
120
 
121
  @app.route('/<path:path>')
122
  def serve_static_files(path):
 
 
 
123
  return send_from_directory('.', path)
124
 
 
125
  @app.route('/answer', methods=['POST'])
126
  def generate_answer_endpoint():
127
  global last_question_text, last_answer_text, conversation_history
@@ -143,6 +151,8 @@ def generate_answer_endpoint():
143
 
144
  @app.route('/read-aloud', methods=['POST'])
145
  def read_aloud_endpoint():
 
 
146
  data = request.get_json()
147
  text_to_read = data.get('text', '').strip()
148
 
@@ -157,20 +167,13 @@ def read_aloud_endpoint():
157
  tts_engine.save_to_file(text_to_read, temp_audio_path)
158
  tts_engine.runAndWait()
159
 
160
- # You would typically serve this file or stream it.
161
- # For simplicity, let's just confirm it was generated.
162
- # In a real app, you might use Flask's send_file for audio playback.
163
- # For now, let's just return success.
164
- # This approach is suitable if the browser requests the audio file directly after this.
165
- # For direct playback, you might stream it or serve it immediately.
166
- # For web, it's more common to have the frontend's SpeechSynthesis API handle this.
167
- # The frontend `readAloud` function already does this.
168
- # So, this endpoint might not be strictly necessary unless for server-side TTS.
169
  return jsonify({"status": "TTS audio generated (server-side)."})
170
  except Exception as e:
171
  return jsonify({"status": f"Error during TTS: {str(e)}"}), 500
172
  finally:
173
- if os.path.exists(temp_audio_path):
174
  os.remove(temp_audio_path)
175
 
176
 
@@ -186,38 +189,45 @@ def upload_mp3_endpoint():
186
  if file:
187
  filename = secure_filename(file.filename)
188
  # Create a temporary directory to save the uploaded file and its WAV conversion
189
- with tempfile.TemporaryDirectory() as tmpdir:
190
- mp3_path = os.path.join(tmpdir, filename)
191
- file.save(mp3_path)
192
-
193
- wav_path = os.path.join(tmpdir, filename.replace(".mp3", ".wav"))
194
- try:
195
- sound = AudioSegment.from_mp3(mp3_path)
196
- sound.export(wav_path, format="wav")
197
- except Exception as e:
198
- return jsonify({"message": f"Error converting MP3 to WAV: {e}"}), 500
199
-
200
- try:
201
- recognizer = sr.Recognizer()
202
- with sr.AudioFile(wav_path) as src:
203
- audio = recognizer.record(src)
204
- text = recognizer.recognize_google(audio)
205
- except sr.UnknownValueError:
206
- return jsonify({"message": "Speech not understood."}), 400
207
- except sr.RequestError as e:
208
- return jsonify({"message": f"Speech recognition service error: {e}"}), 500
209
-
210
- # Store transcription temporarily (can be handled differently)
211
- transcript_path = os.path.join(tmpdir, "transcription.txt")
212
- with open(transcript_path, "w", encoding="utf-8") as f:
213
- f.write(text)
214
-
215
- # Option to summarize or generate answer from transcription
216
- # For this web integration, we'll return the transcription and let frontend decide
217
- return jsonify({
218
- "message": "MP3 transcribed successfully.",
219
- "transcription": text
220
- })
 
 
 
 
 
 
 
221
 
222
  @app.route('/summarize', methods=['POST'])
223
  def summarize_endpoint():
 
1
  from flask import Flask, request, jsonify, send_from_directory
2
  import speech_recognition as sr
 
3
  import datetime
4
  import pyttsx3
5
  from langdetect import detect
 
15
  import os
16
  from werkzeug.utils import secure_filename
17
  import tempfile
18
+ from dotenv import load_dotenv # Ensure dotenv is imported for .env loading
19
 
20
  app = Flask(__name__, static_folder='.') # Serve static files from the current directory
21
 
22
  # Load Hugging Face API key from environment variable
23
+ load_dotenv() # Load environment variables from .env file
24
+ hf_token = os.environ.get("API_KEY")
25
  if not hf_token:
26
+ raise ValueError("Hugging Face API key not found. Please set 'API_KEY' as an environment variable or in a .env file.")
 
 
 
 
 
27
 
28
  login(token=hf_token)
29
 
 
40
  embed_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
41
 
42
  # Load both datasets
43
+ try:
44
+ df_parquet = pd.read_parquet("ibtehaj dataset.parquet")
45
+ corpus_parquet = df_parquet["text"].dropna().tolist()
46
+ except FileNotFoundError:
47
+ raise FileNotFoundError("ibtehaj dataset.parquet not found. Make sure it's in the same directory as app.py")
48
+
49
+ try:
50
+ with open("pdf_data.json", "r", encoding="utf-8") as f:
51
+ json_data = json.load(f)
52
+ except FileNotFoundError:
53
+ raise FileNotFoundError("pdf_data.json not found. Make sure it's in the same directory as app.py")
54
+ except json.JSONDecodeError as e:
55
+ raise ValueError(f"Error decoding pdf_data.json: {e}")
56
 
57
  # Extract text from JSON
58
  corpus_json = []
 
66
  corpus = corpus_parquet + corpus_json
67
 
68
  # Compute embeddings
69
+ # This can take a while. Consider pre-computing and saving the index if corpus is large.
70
  embeddings = embed_model.encode(corpus, show_progress_bar=True, batch_size=16)
71
 
72
  # Build FAISS index
 
111
  tts_engine.setProperty('voice', v.id)
112
  break
113
 
114
+ init_tts_engine() # Initialize TTS engine once on startup
115
 
116
  # Global variables for managing state (simplify for web context)
117
  conversation_history = []
 
124
 
125
  @app.route('/<path:path>')
126
  def serve_static_files(path):
127
+ # This route serves static files like CSS, JS, and images
128
+ # It must be specific to paths that exist as files, otherwise it might catch API calls
129
+ # For now, it's fine, but in complex apps, static files are often served by Nginx/Apache.
130
  return send_from_directory('.', path)
131
 
132
+
133
  @app.route('/answer', methods=['POST'])
134
  def generate_answer_endpoint():
135
  global last_question_text, last_answer_text, conversation_history
 
151
 
152
  @app.route('/read-aloud', methods=['POST'])
153
  def read_aloud_endpoint():
154
+ # This endpoint is generally not needed if client-side SpeechSynthesis API is used.
155
+ # Keeping it for completeness if server-side TTS is desired.
156
  data = request.get_json()
157
  text_to_read = data.get('text', '').strip()
158
 
 
167
  tts_engine.save_to_file(text_to_read, temp_audio_path)
168
  tts_engine.runAndWait()
169
 
170
+ # You would typically serve this file or stream it for client playback.
171
+ # For this setup, we'll confirm generation. The frontend handles playback.
 
 
 
 
 
 
 
172
  return jsonify({"status": "TTS audio generated (server-side)."})
173
  except Exception as e:
174
  return jsonify({"status": f"Error during TTS: {str(e)}"}), 500
175
  finally:
176
+ if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path):
177
  os.remove(temp_audio_path)
178
 
179
 
 
189
  if file:
190
  filename = secure_filename(file.filename)
191
  # Create a temporary directory to save the uploaded file and its WAV conversion
192
+ # Ensure that the temp directory is managed for cleanup.
193
+ try:
194
+ with tempfile.TemporaryDirectory() as tmpdir:
195
+ mp3_path = os.path.join(tmpdir, filename)
196
+ file.save(mp3_path)
197
+
198
+ wav_path = os.path.join(tmpdir, filename.replace(".mp3", ".wav"))
199
+ try:
200
+ sound = AudioSegment.from_mp3(mp3_path)
201
+ sound.export(wav_path, format="wav")
202
+ except Exception as e:
203
+ # Catch pydub/ffmpeg related errors
204
+ return jsonify({"message": f"Error converting MP3 to WAV. Ensure FFmpeg is installed and in your system's PATH. Details: {e}"}), 500
205
+
206
+ try:
207
+ recognizer = sr.Recognizer()
208
+ with sr.AudioFile(wav_path) as src:
209
+ audio = recognizer.record(src)
210
+ text = recognizer.recognize_google(audio)
211
+ except sr.UnknownValueError:
212
+ return jsonify({"message": "Speech not understood. Please try again."}), 400
213
+ except sr.RequestError as e:
214
+ return jsonify({"message": f"Could not request results from speech recognition service; {e}"}), 500
215
+ except Exception as e: # Catch any other unexpected SR errors
216
+ return jsonify({"message": f"An unexpected error occurred during speech recognition: {e}"}), 500
217
+
218
+
219
+ # In a web app, the server does not write the transcription to a .txt file or ask the user where to store it.
220
+ # The transcription is returned to the client. The client can then decide to save it.
221
+ return jsonify({
222
+ "message": "MP3 transcribed successfully.",
223
+ "transcription": text
224
+ })
225
+ except Exception as e:
226
+ # Catch any errors related to temporary directory creation or file saving
227
+ return jsonify({"message": f"An error occurred during file upload or temporary processing: {e}"}), 500
228
+ # This point should not be reached if 'if file' condition is handled.
229
+ return jsonify({"message": "An unknown file processing error occurred."}), 500
230
+
231
 
232
  @app.route('/summarize', methods=['POST'])
233
  def summarize_endpoint():