Spaces: Build error
import gradio as gr
import os
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Download the NLTK resources used below
# (newer NLTK releases may also require nltk.download('punkt_tab') for word_tokenize)
nltk.download('punkt')
nltk.download('stopwords')
# Directory containing the PDF corpus; '/content/...' is a Colab path,
# so on Spaces this must point at a folder that exists in the repo
dataset_dir = '/content/LegalData'
# Extract the raw text from every page of a PDF
def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            pdf_text += page.get_text()
    return pdf_text
# Lowercase, tokenize, and drop punctuation and stopwords
# (the stopword set is built once instead of on every call)
stop_words = set(stopwords.words('english'))

def clean_and_tokenize(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)
# Load and preprocess every PDF in the dataset, keeping the
# filenames so results can be labeled with them later
documents = []
filenames = []
for filename in os.listdir(dataset_dir):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(dataset_dir, filename)
        pdf_text = extract_text_from_pdf(pdf_path)
        documents.append(clean_and_tokenize(pdf_text))
        filenames.append(filename)
# Vectorize the corpus once at startup
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
# Rank the corpus against a query and return the top N documents
def perform_relevance_matching(query, uploaded_files):
    # uploaded_files is accepted from the UI but not used yet;
    # only the pre-indexed dataset is searched
    # Vectorize the query with the already-fitted vectorizer
    query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])
    # Cosine similarity between the query and every document
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
    # Rank documents by similarity score, highest first
    document_scores = list(enumerate(cosine_similarities[0]))
    sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
    # Collect the top N matches with a 500-character excerpt of each
    top_n = 5
    top_documents = []
    for doc_index, score in sorted_documents[:top_n]:
        excerpt = documents[doc_index][:500]
        top_documents.append([f"{filenames[doc_index]} (Similarity Score: {score:.4f})", excerpt])
    return top_documents
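# Optional sanity check outside the UI (hypothetical query string;
# None stands in for the unused file input):
# print(perform_relevance_matching("statute of limitations for contracts", None))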
# Create the Gradio interface. Note that gr.Table does not exist in
# Gradio and raises an AttributeError on startup; gr.Dataframe is
# the tabular output component.
iface = gr.Interface(
    fn=perform_relevance_matching,
    inputs=["text", gr.File()],
    outputs=gr.Dataframe(headers=["Document", "Excerpt"]),
    live=True,
    title="Legal Research Assistant",
    description="Enter your legal query and upload files for relevance matching.",
)

# Launch the Gradio interface
iface.launch()
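If the Space still reports a build error, check the dependency list: the fitz import is provided by the PyMuPDF package, and installing the unrelated fitz package from PyPI will break it. A minimal requirements.txt sketch, assuming only the imports above and leaving versions unpinned:

gradio
pymupdf
nltk
scikit-learn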