Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import io
|
3 |
+
import torch
|
4 |
+
import uvicorn
|
5 |
+
import spacy
|
6 |
+
import pdfplumber
|
7 |
+
import moviepy.editor as mp
|
8 |
+
import librosa
|
9 |
+
import soundfile as sf
|
10 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException
|
11 |
+
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
|
12 |
+
from sentence_transformers import SentenceTransformer, util
|
13 |
+
|
14 |
+
# ✅ Suppress Warnings
|
15 |
+
# Quiet TensorFlow's C++ logging and expose only the first CUDA device.
os.environ.update({
    "TF_CPP_MIN_LOG_LEVEL": "3",
    "CUDA_VISIBLE_DEVICES": "0",
})

# ✅ Ensure GPU is Used
# Torch device string shared by the models below: "cuda" when available, else "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
|
20 |
+
|
21 |
+
# ✅ Load NLP Models
|
22 |
+
# Load every NLP model once at import time; any failure aborts startup with a
# RuntimeError that keeps the original exception as its cause.
try:
    # Fetch the spaCy English model on first run if it is not installed yet.
    if not spacy.util.is_package("en_core_web_sm"):
        spacy.cli.download("en_core_web_sm")

    # transformers pipelines take a device index: 0 selects the first GPU, -1 the CPU.
    _pipeline_device = 0 if torch.cuda.is_available() else -1

    nlp = spacy.load("en_core_web_sm")
    summarizer = pipeline("summarization", model="nsi319/legal-pegasus", device=_pipeline_device)
    embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
    ner_model = pipeline(
        "ner",
        model="dslim/bert-base-NER",
        tokenizer="dslim/bert-base-NER",
        device=_pipeline_device,
    )
    speech_to_text = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-base",
        device=_pipeline_device,
    )

except Exception as e:
    raise RuntimeError(f"Error loading models: {str(e)}") from e
|
34 |
+
|
35 |
+
# ✅ Load Falcon 7B for Chatbot
|
36 |
+
# Hugging Face model id of the instruction-tuned Falcon checkpoint used by /chatbot.
MODEL_NAME = "tiiuae/falcon-7b-instruct"

# Tokenizer and causal LM are loaded once at import time and shared by all requests.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
chatbot_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto"
)

# ✅ Initialize FastAPI
# ASGI application object; the route decorators below register against it.
app = FastAPI()
|
46 |
+
|
47 |
+
# ✅ PDF Text Extraction
|
48 |
+
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_file: Input handed straight to ``pdfplumber.open`` (the endpoint
            in this file passes an ``io.BytesIO``).

    Returns:
        The concatenated page text. Pages for which ``extract_text()`` yields
        nothing contribute an empty string.

    Raises:
        HTTPException: Status 400 if the PDF cannot be processed or contains
            no non-whitespace text.
    """
    try:
        page_texts = []
        with pdfplumber.open(pdf_file) as document:
            for page in document.pages:
                # extract_text() may return None; normalise to "".
                page_texts.append(page.extract_text() or "")

        combined = "\n".join(page_texts)
        if combined.strip():
            return combined
        raise ValueError("No readable text found in PDF. It may be a scanned document.")
    except Exception as err:
        raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(err)}")
|
58 |
+
|
59 |
+
# ✅ Video-to-Audio Extraction
|
60 |
+
def extract_audio_from_video(video_path):
    """Write a video's audio track to a 16-bit PCM WAV file beside it.

    Args:
        video_path: Path of the video file on disk.

    Returns:
        Path of the written WAV file: ``video_path`` with its extension
        replaced by ``.wav`` (``x.mp4`` -> ``x.wav``).

    Raises:
        HTTPException: Status 500 if the video cannot be opened, has no audio
            track, or the audio cannot be written.
    """
    video = None
    try:
        video = mp.VideoFileClip(video_path)
        if video.audio is None:
            raise ValueError("Video has no audio track.")

        # Swap the real extension rather than the literal ".mp4", so other
        # containers (.mov, .mkv, ...) do not map the output onto the input.
        base, _ = os.path.splitext(video_path)
        audio_path = f"{base}.wav"
        if audio_path == video_path:
            # Input already ends in .wav; never write over the file being read.
            audio_path = f"{base}_audio.wav"

        video.audio.write_audiofile(audio_path, codec="pcm_s16le")
        return audio_path
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Audio extraction failed: {str(e)}") from e
    finally:
        # Release the clip's reader resources whether or not extraction succeeded.
        if video is not None:
            video.close()
|
69 |
+
|
70 |
+
# ✅ Speech-to-Text Transcription (Fix for Long Audio)
|
71 |
+
def transcribe_audio(audio_path):
    """Transcribe an audio file with the Whisper pipeline, chunking long input.

    The audio is loaded with ``sr=16000``. Files of 30 seconds or less go to
    the ``speech_to_text`` pipeline in one call; longer files are split into
    consecutive 30-second chunks that are transcribed one by one and joined
    with single spaces.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript text.

    Raises:
        HTTPException: Status 500 if loading, chunking or transcription fails.
    """
    import tempfile

    try:
        audio, sr = librosa.load(audio_path, sr=16000)
        duration = len(audio) / sr

        if duration <= 30:
            return speech_to_text(audio_path)["text"]

        chunk_size = 30 * sr  # 30-second chunks
        transcripts = []
        # A private temporary directory keeps concurrent requests from
        # overwriting each other's chunk files and guarantees cleanup even
        # when transcription of a chunk raises.
        with tempfile.TemporaryDirectory() as chunk_dir:
            for idx, start in enumerate(range(0, len(audio), chunk_size)):
                chunk_path = os.path.join(chunk_dir, f"temp_chunk_{idx}.wav")
                sf.write(chunk_path, audio[start:start + chunk_size], sr)
                transcripts.append(speech_to_text(chunk_path)["text"])

        return " ".join(transcripts)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Speech-to-text failed: {str(e)}") from e
|
96 |
+
|
97 |
+
# ✅ Legal Document Summarization
|
98 |
+
async def summarize_legal_document(text):
    """Summarize the beginning of a legal document.

    Only the first 1024 characters of ``text`` are passed to the
    ``summarizer`` pipeline (``max_length=200``, ``min_length=50``, no
    sampling).

    Args:
        text: Full document text.

    Returns:
        The generated summary, or a fixed fallback message if summarization
        raises. The failure is logged with its traceback instead of being
        dropped silently.
    """
    import logging

    try:
        summary = summarizer(text[:1024], max_length=200, min_length=50, do_sample=False)
        return summary[0]['summary_text']
    except Exception:
        # Best-effort: callers still get a string, but the cause stays visible in logs.
        logging.getLogger(__name__).exception("Legal document summarization failed")
        return "Summarization failed due to an internal error."
|
105 |
+
|
106 |
+
# ✅ Legal Document Analysis API
|
107 |
+
@app.post("/analyze_legal_document")
async def analyze_legal_document(file: UploadFile = File(...)):
    """Extract the text of an uploaded PDF and return its summary.

    Args:
        file: Uploaded PDF document.

    Returns:
        ``{"status": "success", "summary": ...}`` on success, or
        ``{"status": "error", "detail": ...}`` for unexpected failures.

    Raises:
        HTTPException: Propagated unchanged from the helpers (for example the
            400 raised when the PDF has no readable text), so the client sees
            the intended status code instead of a 200 response.
    """
    try:
        content = await file.read()
        text = extract_text_from_pdf(io.BytesIO(content))

        summary = await summarize_legal_document(text)

        return {"status": "success", "summary": summary}
    except HTTPException:
        # Let FastAPI turn the helper's HTTP error into a real HTTP response.
        raise
    except Exception as e:
        return {"status": "error", "detail": str(e)}
|
119 |
+
|
120 |
+
# ✅ Chatbot API
|
121 |
+
@app.post("/chatbot")
async def chatbot_endpoint(query: dict):
    """Answer a chatbot query with the Falcon model.

    Args:
        query: JSON body; the prompt is read from its ``"query"`` key.

    Returns:
        ``{"status": "success", "answer": ...}`` with the decoded generation,
        or ``{"status": "error", "message": ...}`` if generation fails.

    Raises:
        HTTPException: Status 400 when the ``"query"`` value is missing or empty.
    """
    # Validate outside the try block: inside it, the 400 would be caught by
    # the generic handler below and returned as a 200 "error" payload.
    input_text = query.get("query", "")
    if not input_text:
        raise HTTPException(status_code=400, detail="Query cannot be empty.")

    try:
        # The model is loaded with device_map="auto", so send the inputs to
        # the device the model reports instead of assuming the global one.
        inputs = tokenizer(input_text, return_tensors="pt").to(chatbot_model.device)
        outputs = chatbot_model.generate(**inputs, max_length=200)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return {"status": "success", "answer": response}
    except Exception as e:
        return {"status": "error", "message": str(e)}
|
136 |
+
|
137 |
+
# ✅ Video Upload & Analysis API
|
138 |
+
@app.post("/analyze_video")
async def analyze_video(file: UploadFile = File(...)):
    """Transcribe the speech of an uploaded video.

    The upload is saved into a per-request temporary directory, its audio is
    extracted and transcribed, and the directory (video plus any audio files
    written beside it) is removed before returning.

    Args:
        file: Uploaded video file.

    Returns:
        ``{"status": "success", "transcript": ...}`` on success, or
        ``{"status": "error", "message": ...}`` for unexpected failures.

    Raises:
        HTTPException: Propagated unchanged from the extraction and
            transcription helpers, so their status codes reach the client.
    """
    import tempfile

    try:
        # The client controls file.filename; keep only its final path
        # component so it cannot point outside the working directory.
        safe_name = os.path.basename(file.filename or "") or "upload.mp4"

        with tempfile.TemporaryDirectory() as work_dir:
            video_path = os.path.join(work_dir, f"temp_{safe_name}")
            with open(video_path, "wb") as f:
                f.write(await file.read())

            audio_path = extract_audio_from_video(video_path)
            transcript = transcribe_audio(audio_path)

        return {"status": "success", "transcript": transcript}
    except HTTPException:
        # Let FastAPI turn the helper's HTTP error into a real HTTP response.
        raise
    except Exception as e:
        return {"status": "error", "message": str(e)}
|
152 |
+
|
153 |
+
# ✅ Run FastAPI Server
|
154 |
+
# Start the ASGI server only when this file is executed as a script;
# it listens on all interfaces on port 7860.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )
|