tejash300 committed on
Commit 71359f1 · verified · 1 Parent(s): 46a0c3e

Create app.py

Files changed (1)
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
+ import os
+ import io
+ import torch
+ import uvicorn
+ import spacy
+ import pdfplumber
+ import moviepy.editor as mp
+ import librosa
+ import soundfile as sf
+ from fastapi import FastAPI, UploadFile, File, HTTPException
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+ from sentence_transformers import SentenceTransformer, util
+
+ # ✅ Suppress TensorFlow warnings and restrict execution to GPU 0
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+ # ✅ Use the GPU if available, otherwise fall back to CPU
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # ✅ Load NLP Models
+ try:
+     if not spacy.util.is_package("en_core_web_sm"):
+         spacy.cli.download("en_core_web_sm")
+
+     nlp = spacy.load("en_core_web_sm")
+     summarizer = pipeline("summarization", model="nsi319/legal-pegasus", device=0 if torch.cuda.is_available() else -1)
+     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
+     ner_model = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
+     speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=0 if torch.cuda.is_available() else -1)
+
+ except Exception as e:
+     raise RuntimeError(f"Error loading models: {str(e)}")
+
+ # ✅ Load Falcon 7B for Chatbot
+ MODEL_NAME = "tiiuae/falcon-7b-instruct"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ chatbot_model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
+ )
+
+ # ✅ Initialize FastAPI
+ app = FastAPI()
+
+ # ✅ PDF Text Extraction
+ def extract_text_from_pdf(pdf_file):
+     """Extracts text from a PDF file using pdfplumber."""
+     try:
+         with pdfplumber.open(pdf_file) as pdf:
+             text = "\n".join([page.extract_text() or "" for page in pdf.pages])
+         if not text.strip():
+             raise ValueError("No readable text found in PDF. It may be a scanned document.")
+         return text
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
+
+ # ✅ Video-to-Audio Extraction
+ def extract_audio_from_video(video_path):
+     """Extracts audio from a video file."""
+     try:
+         video = mp.VideoFileClip(video_path)
+         audio_path = os.path.splitext(video_path)[0] + ".wav"  # works for any container, not just .mp4
+         video.audio.write_audiofile(audio_path, codec="pcm_s16le")
+         return audio_path
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Audio extraction failed: {str(e)}")
+
+ # ✅ Speech-to-Text Transcription (Fix for Long Audio)
+ def transcribe_audio(audio_path):
+     """Transcribes speech to text using Whisper model with chunking for long files."""
+     try:
+         audio, sr = librosa.load(audio_path, sr=16000)
+         duration = len(audio) / sr
+
+         if duration > 30:
+             chunk_size = 30 * sr  # 30-second chunks
+             chunks = [audio[i:i + chunk_size] for i in range(0, len(audio), chunk_size)]
+
+             transcripts = []
+             for idx, chunk in enumerate(chunks):
+                 temp_chunk_path = f"temp_chunk_{idx}.wav"
+                 sf.write(temp_chunk_path, chunk, sr)
+                 result = speech_to_text(temp_chunk_path)
+                 transcripts.append(result["text"])
+                 os.remove(temp_chunk_path)
+
+             return " ".join(transcripts)
+         else:
+             result = speech_to_text(audio_path)
+             return result["text"]
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Speech-to-text failed: {str(e)}")
+
+ # ✅ Legal Document Summarization
+ async def summarize_legal_document(text):
+     """Generates a summary of the legal document."""
+     try:
+         summary = summarizer(text[:1024], max_length=200, min_length=50, do_sample=False)
+         return summary[0]['summary_text']
+     except Exception as e:
+         return "Summarization failed due to an internal error."
+
+ # ✅ Legal Document Analysis API
+ @app.post("/analyze_legal_document")
+ async def analyze_legal_document(file: UploadFile = File(...)):
+     """Analyzes a legal document by extracting text, summarizing, and identifying entities."""
+     try:
+         content = await file.read()
+         text = extract_text_from_pdf(io.BytesIO(content))
+
+         summary = await summarize_legal_document(text)
+
+         return {"status": "success", "summary": summary}
+     except Exception as e:
+         return {"status": "error", "detail": str(e)}
+
+ # ✅ Chatbot API
+ @app.post("/chatbot")
+ async def chatbot_endpoint(query: dict):
+     """Handles chatbot queries using Falcon 7B."""
+     try:
+         input_text = query.get("query", "")
+         if not input_text:
+             raise HTTPException(status_code=400, detail="Query cannot be empty.")
+
+         inputs = tokenizer(input_text, return_tensors="pt").to(device)
+         outputs = chatbot_model.generate(**inputs, max_length=200)
+         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         return {"status": "success", "answer": response}
+     except Exception as e:
+         return {"status": "error", "message": str(e)}
+
+ # ✅ Video Upload & Analysis API
+ @app.post("/analyze_video")
+ async def analyze_video(file: UploadFile = File(...)):
+     """Extracts speech from video and analyzes it."""
+     try:
+         video_path = f"temp_{file.filename}"
+         with open(video_path, "wb") as f:
+             f.write(await file.read())
+
+         audio_path = extract_audio_from_video(video_path)
+         transcript = transcribe_audio(audio_path)
+
+         return {"status": "success", "transcript": transcript}
+     except Exception as e:
+         return {"status": "error", "message": str(e)}
+
+ # ✅ Run FastAPI Server
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
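A quick way to exercise the endpoints added in this commit, once the server is running. This is a minimal sketch, assuming the app is reachable at http://localhost:7860 (the host/port passed to uvicorn.run above); the file names sample_contract.pdf and hearing.mp4 are hypothetical placeholders.

import requests

BASE_URL = "http://localhost:7860"  # assumed from uvicorn.run(app, host="0.0.0.0", port=7860)

# Summarize a legal PDF via /analyze_legal_document (multipart field name "file")
with open("sample_contract.pdf", "rb") as f:
    r = requests.post(f"{BASE_URL}/analyze_legal_document", files={"file": f})
print(r.json())

# Ask the Falcon 7B chatbot a question via /chatbot (JSON body with a "query" key)
r = requests.post(f"{BASE_URL}/chatbot", json={"query": "What is consideration in contract law?"})
print(r.json())

# Transcribe speech from an uploaded video via /analyze_video
with open("hearing.mp4", "rb") as f:
    r = requests.post(f"{BASE_URL}/analyze_video", files={"file": f})
print(r.json())

Each endpoint returns a JSON object with a "status" field, so callers can check for "success" before reading the "summary", "answer", or "transcript" keys.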