"""Rank the PDF documents in a directory against a query with TF-IDF cosine similarity."""

import os

import fitz  # PyMuPDF
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Define your dataset directory
dataset_dir = '/content/LegalData'

# Load and preprocess the query
query = "What are the legal implications of intellectual property rights?"


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string made of each page's ``get_text()`` output, in page
        order, with nothing inserted between pages.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Join once instead of growing a string with ``+=`` on every page.
        return "".join(page.get_text() for page in pdf_document)


def clean_and_tokenize(text):
    """Normalize *text* into a space-separated string of filtered tokens.

    The text is lower-cased and split with ``word_tokenize``; a token is kept
    only if it is alphanumeric (``str.isalnum``) and is not an English
    stopword.

    Args:
        text: Raw text to normalize.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build the stopword set once per call: ``stopwords.words`` rebuilds the
    # whole list on every invocation, and list membership is a linear scan.
    stop_words = set(stopwords.words('english'))
    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]
    return ' '.join(tokens)


# Process and tokenize the documents in your dataset
documents = []
for filename in os.listdir(dataset_dir):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(dataset_dir, filename)
        pdf_text = extract_text_from_pdf(pdf_path)
        clean_text = clean_and_tokenize(pdf_text)
        documents.append(clean_text)

# Fail early with a clear message; otherwise the vectorizer raises a less
# helpful ValueError about an empty vocabulary.
if not documents:
    raise ValueError(f"No PDF documents found in {dataset_dir!r}")

# Vectorize the documents
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Vectorize the query
query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])

# Calculate cosine similarities between the query and documents
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)

# Rank documents by similarity score
document_scores = list(enumerate(cosine_similarities[0]))
sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)

# Print the top N relevant documents. Slicing (rather than indexing up to
# top_n) avoids an IndexError when the dataset holds fewer than top_n PDFs.
top_n = 5
for doc_index, score in sorted_documents[:top_n]:
    print(f"Document {doc_index + 1} (Similarity Score: {score:.4f})")
    print(documents[doc_index][:500])  # Print the first 500 characters of the document
    print("\n")

# TODO: Implement answer extraction and answer generation steps for the top N documents.