import os
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources needed for tokenization and stop-word removal
nltk.download('punkt')
nltk.download('stopwords')

# Build the stop-word set once rather than recomputing it for every token
STOP_WORDS = set(stopwords.words('english'))
# Define your dataset directory
dataset_dir = '/content/LegalData'

# The query to answer against the dataset
query = "What are the legal implications of intellectual property rights?"
# Extract the full text of a PDF using PyMuPDF (fitz)
def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            pdf_text += page.get_text()
    return pdf_text
# Lowercase, tokenize, and strip punctuation and stop words from text
def clean_and_tokenize(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in STOP_WORDS]
    return ' '.join(tokens)
# Process and tokenize the documents in your dataset, keeping each source
# path alongside its cleaned text so results can be traced back to files
documents = []
pdf_paths = []
for filename in os.listdir(dataset_dir):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(dataset_dir, filename)
        pdf_text = extract_text_from_pdf(pdf_path)
        documents.append(clean_and_tokenize(pdf_text))
        pdf_paths.append(pdf_path)
# Vectorize the documents
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Vectorize the query with the same preprocessing as the documents
query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])

# Calculate cosine similarities between the query and documents
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
# Rank documents by similarity score
document_scores = list(enumerate(cosine_similarities[0]))
sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)

# Print the top N relevant documents; slicing guards against datasets
# smaller than N, which would otherwise raise an IndexError
top_n = 5
for doc_index, score in sorted_documents[:top_n]:
    print(f"{os.path.basename(pdf_paths[doc_index])} (Similarity Score: {score:.4f})")
    print(documents[doc_index][:500])  # First 500 characters of the cleaned document
    print()
# Implement answer extraction and answer generation steps for the top N documents. | |
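
# A minimal extractive sketch of that answer-extraction step, assuming a
# TF-IDF sentence-ranking approach (the helper below and its name are
# illustrative, not part of the original pipeline): re-extract the raw text
# of a top-ranked PDF, split it into sentences, and score each sentence
# against the query with the vectorizer fitted above.
from nltk.tokenize import sent_tokenize

def extract_candidate_answer(pdf_path, query_vector, vectorizer):
    # Sentence-split the raw text; the cleaned entries in `documents` have
    # punctuation stripped and cannot be re-segmented into sentences.
    sentences = sent_tokenize(extract_text_from_pdf(pdf_path))
    if not sentences:
        return ""
    sentence_matrix = vectorizer.transform(
        [clean_and_tokenize(sentence) for sentence in sentences]
    )
    scores = cosine_similarity(query_vector, sentence_matrix)[0]
    # Return the sentence most similar to the query as a candidate answer
    return sentences[scores.argmax()]

# Example: pull a candidate answer from the single best-matching document
best_index, _ = sorted_documents[0]
print("Candidate answer:",
      extract_candidate_answer(pdf_paths[best_index], query_vector, tfidf_vectorizer))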