Build error
Commit b3722e5
1 Parent(s): 0f6f856
Create app.py
app.py ADDED
@@ -0,0 +1,65 @@
+import os
+import fitz  # Corrected import statement
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Load NLTK resources
+nltk.download('punkt')
+nltk.download('stopwords')
+
+# Define your dataset directory
+dataset_dir = '/content/LegalData'
+
+# Load and preprocess the query
+query = "What are the legal implications of intellectual property rights?"
+
+# Function to extract text from PDFs using PyMuPDF (fitz)
+def extract_text_from_pdf(pdf_path):
+    pdf_text = ""
+    with fitz.open(pdf_path) as pdf_document:
+        for page_num in range(pdf_document.page_count):
+            page = pdf_document[page_num]
+            pdf_text += page.get_text()
+    return pdf_text
+
+# Function to clean and tokenize text
+def clean_and_tokenize(text):
+    tokens = word_tokenize(text.lower())
+    tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
+    return ' '.join(tokens)
+
+# Process and tokenize the documents in your dataset
+documents = []
+for filename in os.listdir(dataset_dir):
+    if filename.endswith('.pdf'):
+        pdf_path = os.path.join(dataset_dir, filename)
+        pdf_text = extract_text_from_pdf(pdf_path)
+        clean_text = clean_and_tokenize(pdf_text)
+        documents.append(clean_text)
+
+# Vectorize the documents
+tfidf_vectorizer = TfidfVectorizer()
+tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
+
+# Vectorize the query
+query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])
+
+# Calculate cosine similarities between the query and documents
+cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
+
+# Rank documents by similarity score
+document_scores = list(enumerate(cosine_similarities[0]))
+sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
+
+# Print the top N relevant documents
+top_n = 5
+for i in range(top_n):
+    doc_index, score = sorted_documents[i]
+    print(f"Document {doc_index + 1} (Similarity Score: {score:.4f})")
+    print(documents[doc_index][:500])  # Print the first 500 characters of the document
+    print("\n")
+
+# Implement answer extraction and answer generation steps for the top N documents.
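The file ends with a TODO for answer extraction and answer generation over the top-ranked documents. Below is a minimal sketch of the extraction step, assuming the TF-IDF objects defined in app.py are available: it ranks individual sentences of a document against the query by cosine similarity. The helper name extract_answer_sentences and the raw_documents list (the uncleaned page text, which app.py does not currently keep) are illustrative assumptions, not part of this commit.

# Sketch only, not part of this commit: sentence-level answer extraction.
# Assumes the raw page text of each PDF is also kept in a hypothetical list
# `raw_documents`, filled in the same loop that builds `documents`.
from nltk.tokenize import sent_tokenize

def extract_answer_sentences(raw_text, query_text, top_k=3):
    # Split the raw document into sentences, clean each one the same way the
    # documents were cleaned, and score them against the query using the
    # already-fitted TF-IDF vectorizer from app.py.
    sentences = sent_tokenize(raw_text)
    if not sentences:
        return []
    cleaned_sentences = [clean_and_tokenize(s) for s in sentences]
    sentence_matrix = tfidf_vectorizer.transform(cleaned_sentences)
    query_vec = tfidf_vectorizer.transform([clean_and_tokenize(query_text)])
    scores = cosine_similarity(query_vec, sentence_matrix)[0]
    # Return the top_k sentences most similar to the query.
    ranked = sorted(zip(sentences, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# Example usage for the best-matching document:
# best_index = sorted_documents[0][0]
# for sentence, score in extract_answer_sentences(raw_documents[best_index], query):
#     print(f"{score:.4f}  {sentence}")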