Spaces:

Izza-shahzad-13
/

lawsumm

Sleeping

File size: 2,469 Bytes

803b5ef

import logging
import os
import pickle
import re

import faiss
import numpy as np
import pdfplumber
from sentence_transformers import SentenceTransformer

# Display title of each law book mapped to the location of its source PDF.
# Insertion order is the order in which the books are processed.
BOOKS = dict(
    (
        (
            "Pakistan Penal Code",
            r"D:\FYP WEBSITE\summerization-app\RAG books\pakistan penal code_removed.pdf",
        ),
        (
            "Code of Criminal Procedure",
            r"D:\FYP WEBSITE\summerization-app\RAG books\code of criminal procedure_removed.pdf",
        ),
        (
            "Code of Civil Procedure",
            r"D:\FYP WEBSITE\summerization-app\RAG books\code of civil procedure_removed_removed.pdf",
        ),
        (
            "Constitution of Pakistan",
            r"D:\FYP WEBSITE\summerization-app\RAG books\constitution of pakistan_removed.pdf",
        ),
        (
            "Rules of Business",
            r"D:\FYP WEBSITE\summerization-app\RAG books\rules of business_removed.pdf",
        ),
    )
)

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF joined by newlines.

    This is a best-effort helper: it never raises for a missing or
    unreadable file. Instead it logs a warning and returns an empty string
    so the caller can decide how to treat a book without text.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The page texts joined with ``"\\n"`` (pages without extractable text
        are skipped), or ``""`` when the file does not exist or cannot be
        parsed.
    """
    log = logging.getLogger(__name__)

    if not os.path.exists(pdf_path):
        log.warning("PDF not found, returning empty text: %s", pdf_path)
        return ""

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract each page exactly once; the result is reused for both
            # the emptiness check and the join below.
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception:
        # Catch Exception rather than using a bare except so Ctrl-C and
        # SystemExit still propagate; record why the book produced no text.
        log.exception("Could not extract text from %s", pdf_path)
        return ""

    return "\n".join(page_text for page_text in page_texts if page_text)

def clean_text(text):
    """Remove page-number artifacts from extracted text.

    Three kinds of marker are deleted, in this order: ``Page X of Y``,
    a bare ``Page X``, and ``X / Y`` number pairs. The result is then
    stripped of leading and trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The text with the markers removed and outer whitespace trimmed.
    """
    # Order matters: the "Page X of Y" form must go before the shorter
    # "Page X" form, which would otherwise leave a dangling "of Y".
    page_marker_patterns = (
        r'Page\s*\d+\s*of\s*\d+',
        r'\bPage\s*\d+\b',
        r'\b\d+\s*/\s*\d+\b',
    )
    cleaned = text
    for marker in page_marker_patterns:
        cleaned = re.sub(marker, '', cleaned)
    return cleaned.strip()

def split_sections_by_number(text):
    """Split legal text into numbered sections.

    A section header is a number, optionally followed by one capital letter
    and/or a parenthesised number, then a dot (``12.``, ``29A.``, ``5(1).``).
    A section's content runs until the next header that begins a line, or
    until the end of the text for the final section.

    Args:
        text: Cleaned text of a whole document.

    Returns:
        A list of ``{"section_id": str, "content": str}`` dicts in document
        order, with both values stripped of surrounding whitespace. Empty
        when the text contains no section header.
    """
    header = r"\d+[A-Z]?(?:\(\d+\))?"
    # "|\Z" lets the last section end at end-of-text; without it the final
    # section has no following header to stop at and is silently dropped.
    # No whitespace is consumed after the dot (strip() below trims it), so
    # the newline that introduces the next header can never be swallowed by
    # a section whose own content is empty.
    pattern = rf"({header})\.(.*?)(?=\n{header}\.|\Z)"
    return [
        {"section_id": match.group(1).strip(), "content": match.group(2).strip()}
        for match in re.finditer(pattern, text, re.DOTALL)
    ]

def create_faiss_index(structured_data, model):
    """Embed section contents and load them into a flat L2 FAISS index.

    Args:
        structured_data: Sequence of dicts that each have a ``"content"``
            entry holding the text to embed.
        model: Object exposing ``encode(texts, show_progress_bar=...)`` that
            returns a 2-D array of embeddings, one row per text.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per section, in the same
        order as ``structured_data``.

    Raises:
        ValueError: If ``structured_data`` is empty. Without sections there
            is no embedding matrix to take the vector dimension from.
    """
    corpus = [section['content'] for section in structured_data]
    if not corpus:
        # Fail early with a clear message instead of an IndexError on
        # shape[1] after encoding an empty batch.
        raise ValueError("cannot build a FAISS index from an empty list of sections")

    corpus_embeddings = model.encode(corpus, show_progress_bar=True)
    # FAISS works on contiguous float32 matrices; coerce once up front.
    vectors = np.ascontiguousarray(corpus_embeddings, dtype=np.float32)

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def main():
    """Build and save the section data and FAISS index for every book.

    For each entry in ``BOOKS`` the PDF text is extracted, cleaned, split
    into sections and embedded. All sections are pickled to
    ``data/legal_data.pkl`` and each book's index is written to
    ``data/<Book_Name>_faiss.index``. Books that yield no sections are
    reported and skipped.
    """
    # Create the output folder before the slow embedding work so a missing
    # directory cannot discard the results at the very end.
    os.makedirs("data", exist_ok=True)

    model = SentenceTransformer('all-MiniLM-L6-v2')

    all_data = {}
    all_indices = {}

    for name, path in BOOKS.items():
        print(f"Processing {name}...")
        sections = split_sections_by_number(clean_text(extract_text_from_pdf(path)))
        if not sections:
            # extract_text_from_pdf returns "" for a missing or unreadable
            # PDF; skip the book rather than abort the whole run.
            print(f"No sections found for {name}; skipping.")
            continue

        all_data[name] = sections
        all_indices[name] = create_faiss_index(sections, model)

    with open("data/legal_data.pkl", "wb") as f:
        pickle.dump(all_data, f)

    for name, index in all_indices.items():
        faiss.write_index(index, f"data/{name.replace(' ', '_')}_faiss.index")

    print("All data processed and saved.")


if __name__ == "__main__":
    main()