rishabh5752 committed
Commit b3722e5 · 1 Parent(s): 0f6f856

Create app.py

Files changed (1)
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
+ import os
+ import fitz  # PyMuPDF
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Load NLTK resources
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ # Define your dataset directory
+ dataset_dir = '/content/LegalData'
+
+ # Load and preprocess the query
+ query = "What are the legal implications of intellectual property rights?"
+
+ # Function to extract text from PDFs using PyMuPDF (fitz)
+ def extract_text_from_pdf(pdf_path):
+     pdf_text = ""
+     with fitz.open(pdf_path) as pdf_document:
+         for page_num in range(pdf_document.page_count):
+             page = pdf_document[page_num]
+             pdf_text += page.get_text()
+     return pdf_text
+
+ # Function to clean and tokenize text
+ def clean_and_tokenize(text):
+     tokens = word_tokenize(text.lower())
+     tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
+     return ' '.join(tokens)
+
+ # Process and tokenize the documents in your dataset
+ documents = []
+ for filename in os.listdir(dataset_dir):
+     if filename.endswith('.pdf'):
+         pdf_path = os.path.join(dataset_dir, filename)
+         pdf_text = extract_text_from_pdf(pdf_path)
+         clean_text = clean_and_tokenize(pdf_text)
+         documents.append(clean_text)
+
+ # Vectorize the documents
+ tfidf_vectorizer = TfidfVectorizer()
+ tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
+
+ # Vectorize the query
+ query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])
+
+ # Calculate cosine similarities between the query and documents
+ cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
+
+ # Rank documents by similarity score
+ document_scores = list(enumerate(cosine_similarities[0]))
+ sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
+
+ # Print the top N relevant documents
+ top_n = 5
+ for i in range(min(top_n, len(sorted_documents))):
+     doc_index, score = sorted_documents[i]
+     print(f"Document {doc_index + 1} (Similarity Score: {score:.4f})")
+     print(documents[doc_index][:500])  # Print the first 500 characters of the document
+     print("\n")
+
+ # TODO: Implement answer extraction and answer generation steps for the top N documents.
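The answer-extraction step is left as a TODO in this commit. As a rough sketch only (not part of the commit), one way to approach it could be to rank individual sentences of a top-ranked document against the query with the same TF-IDF and cosine-similarity machinery already used above; the helper name extract_answer_sentences, the top_k parameter, and the idea of passing the raw (uncleaned) PDF text are assumptions for illustration.

# Illustrative sketch only: sentence-level retrieval over one document.
# Assumes the raw PDF text (not the cleaned token string) is passed in,
# so that sent_tokenize can find sentence boundaries.
from nltk.tokenize import sent_tokenize

def extract_answer_sentences(raw_document_text, query_text, top_k=3):
    sentences = sent_tokenize(raw_document_text)
    if not sentences:
        return []
    # Fit a TF-IDF space over the sentences of this one document,
    # then score each sentence against the query.
    vectorizer = TfidfVectorizer()
    sentence_matrix = vectorizer.fit_transform(sentences)
    query_vec = vectorizer.transform([query_text])
    scores = cosine_similarity(query_vec, sentence_matrix)[0]
    ranked = sorted(zip(sentences, scores), key=lambda pair: pair[1], reverse=True)
    return [sentence for sentence, _ in ranked[:top_k]]

Applied to the raw text of the top-ranked PDF, this would surface the sentences most similar to the query; generating a fluent answer from them would still require a separate summarization or generation model.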