import os
import io
import sys
import torch
import uvicorn
import spacy
import pdfplumber
import ffmpeg  # βœ… Replaced moviepy with ffmpeg-python
import librosa
import soundfile as sf
import subprocess
from fastapi import FastAPI, UploadFile, File, HTTPException
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

# βœ… Suppress TensorFlow Log Noise & Pin the First GPU
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# βœ… Use GPU if Available, Otherwise Fall Back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# βœ… Fix Spacy Installation (Prevent Permission Errors)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm", "--user"], check=True)
    nlp = spacy.load("en_core_web_sm")

# βœ… Load NLP Models
try:
    summarizer = pipeline("summarization", model="nsi319/legal-pegasus", device=0 if torch.cuda.is_available() else -1)
    embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
    ner_model = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
    speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=0 if torch.cuda.is_available() else -1)
except Exception as e:
    raise RuntimeError(f"Error loading models: {str(e)}")

# βœ… Load Falcon 7B for Chatbot
MODEL_NAME = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
chatbot_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# βœ… Initialize FastAPI
app = FastAPI()

# βœ… PDF Text Extraction
def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file using pdfplumber."""
    try:
        with pdfplumber.open(pdf_file) as pdf:
            text = "\n".join([page.extract_text() or "" for page in pdf.pages])
        if not text.strip():
            raise ValueError("No readable text found in PDF. It may be a scanned document.")
        return text
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")

# βœ… Video-to-Audio Extraction (Using FFmpeg Instead of MoviePy)
def extract_audio_from_video(video_path):
    """Extracts audio from a video file using FFmpeg."""
    try:
        audio_path = os.path.splitext(video_path)[0] + ".wav"  # works for any video extension, not just .mp4
        ffmpeg.input(video_path).output(audio_path, format="wav").run(overwrite_output=True)
        return audio_path
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Audio extraction failed: {str(e)}")

# βœ… Speech-to-Text Transcription (Fix for Long Audio)
def transcribe_audio(audio_path):
    """Transcribes speech to text using Whisper model with chunking for long files."""
    try:
        audio, sr = librosa.load(audio_path, sr=16000)
        duration = len(audio) / sr

        if duration > 30:
            chunk_size = 30 * sr  # 30-second chunks
            chunks = [audio[i:i + chunk_size] for i in range(0, len(audio), chunk_size)]

            transcripts = []
            for idx, chunk in enumerate(chunks):
                temp_chunk_path = f"temp_chunk_{idx}.wav"
                sf.write(temp_chunk_path, chunk, sr)
                result = speech_to_text(temp_chunk_path)
                transcripts.append(result["text"])
                os.remove(temp_chunk_path)

            return " ".join(transcripts)
        else:
            result = speech_to_text(audio_path)
            return result["text"]

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Speech-to-text failed: {str(e)}")

# βœ… Legal Document Summarization
async def summarize_legal_document(text):
    """Generates a summary of the legal document (input is truncated to the first 1024 characters)."""
    try:
        summary = summarizer(text[:1024], max_length=200, min_length=50, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        return f"Summarization failed: {str(e)}"

# βœ… Legal Document Analysis API
@app.post("/analyze_legal_document")
async def analyze_legal_document(file: UploadFile = File(...)):
    """Analyzes a legal document by extracting text, summarizing, and identifying entities."""
    try:
        content = await file.read()
        text = extract_text_from_pdf(io.BytesIO(content))

        summary = await summarize_legal_document(text)

        return {"status": "success", "summary": summary}
    except HTTPException:
        raise
    except Exception as e:
        return {"status": "error", "detail": str(e)}

# βœ… Chatbot API
@app.post("/chatbot")
async def chatbot_endpoint(query: dict):
    """Handles chatbot queries using Falcon 7B."""
    try:
        input_text = query.get("query", "")
        if not input_text:
            raise HTTPException(status_code=400, detail="Query cannot be empty.")

        inputs = tokenizer(input_text, return_tensors="pt").to(chatbot_model.device)
        outputs = chatbot_model.generate(**inputs, max_new_tokens=200)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return {"status": "success", "answer": response}
    except HTTPException:
        raise
    except Exception as e:
        return {"status": "error", "message": str(e)}

# βœ… Video Upload & Analysis API
@app.post("/analyze_video")
async def analyze_video(file: UploadFile = File(...)):
    """Extracts speech from video and analyzes it."""
    video_path = f"temp_{file.filename}"
    audio_path = None
    try:
        with open(video_path, "wb") as f:
            f.write(await file.read())

        audio_path = extract_audio_from_video(video_path)
        transcript = transcribe_audio(audio_path)

        return {"status": "success", "transcript": transcript}
    except HTTPException:
        raise
    except Exception as e:
        return {"status": "error", "message": str(e)}
    finally:
        # βœ… Clean up temporary video/audio files regardless of outcome
        for path in (video_path, audio_path):
            if path and os.path.exists(path):
                os.remove(path)

# βœ… Run FastAPI Server
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
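
# βœ… Example Client Calls (illustrative sketch only — assumes the server above is
# running locally on port 7860; "contract.pdf" and "hearing.mp4" are placeholder
# file names, not part of the application):
#
#   curl -X POST -F "file=@contract.pdf" http://localhost:7860/analyze_legal_document
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"query": "Summarize the indemnification clause in plain English."}' \
#        http://localhost:7860/chatbot
#   curl -X POST -F "file=@hearing.mp4" http://localhost:7860/analyze_video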