rishabh5752 committed on
Commit
f1f60a1
·
1 Parent(s): b3722e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -22
app.py CHANGED
@@ -1,5 +1,6 @@
 
1
  import os
2
- import fitz # Corrected import statement
3
  import nltk
4
  from nltk.tokenize import word_tokenize
5
  from nltk.corpus import stopwords
@@ -14,9 +15,6 @@ nltk.download('stopwords')
14
  dataset_dir = '/content/LegalData'
15
 
16
  # Load and preprocess the query
17
- query = "What are the legal implications of intellectual property rights?"
18
-
19
- # Function to extract text from PDFs using PyMuPDF (fitz)
20
  def extract_text_from_pdf(pdf_path):
21
  pdf_text = ""
22
  with fitz.open(pdf_path) as pdf_document:
@@ -31,7 +29,7 @@ def clean_and_tokenize(text):
31
  tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
32
  return ' '.join(tokens)
33
 
34
- # Process and tokenize the documents in your dataset
35
  documents = []
36
  for filename in os.listdir(dataset_dir):
37
  if filename.endswith('.pdf'):
@@ -44,22 +42,37 @@ for filename in os.listdir(dataset_dir):
44
  tfidf_vectorizer = TfidfVectorizer()
45
  tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
46
 
47
- # Vectorize the query
48
- query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])
49
-
50
- # Calculate cosine similarities between the query and documents
51
- cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
52
-
53
- # Rank documents by similarity score
54
- document_scores = list(enumerate(cosine_similarities[0]))
55
- sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- # Print the top N relevant documents
58
- top_n = 5
59
- for i in range(top_n):
60
- doc_index, score = sorted_documents[i]
61
- print(f"Document {doc_index + 1} (Similarity Score: {score:.4f})")
62
- print(documents[doc_index][:500]) # Print the first 500 characters of the document
63
- print("\n")
 
 
64
 
65
- # Implement answer extraction and answer generation steps for the top N documents.
 
 
1
+ import gradio as gr
2
  import os
3
+ import fitz
4
  import nltk
5
  from nltk.tokenize import word_tokenize
6
  from nltk.corpus import stopwords
 
15
  dataset_dir = '/content/LegalData'
16
 
17
  # Load and preprocess the query
 
 
 
18
  def extract_text_from_pdf(pdf_path):
19
  pdf_text = ""
20
  with fitz.open(pdf_path) as pdf_document:
 
29
  tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
30
  return ' '.join(tokens)
31
 
32
+ # Load and preprocess the documents in your dataset
33
  documents = []
34
  for filename in os.listdir(dataset_dir):
35
  if filename.endswith('.pdf'):
 
42
  tfidf_vectorizer = TfidfVectorizer()
43
  tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
44
 
45
+ # Function to perform relevance matching and return top N documents
46
+ def perform_relevance_matching(query, uploaded_files):
47
+ # Vectorize the query
48
+ query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])
49
+
50
+ # Calculate cosine similarities between the query and documents
51
+ cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
52
+
53
+ # Rank documents by similarity score
54
+ document_scores = list(enumerate(cosine_similarities[0]))
55
+ sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
56
+
57
+ # Extract the top N relevant documents
58
+ top_n = 5
59
+ top_documents = []
60
+ for i in range(min(top_n, len(sorted_documents))):
61
+ doc_index, score = sorted_documents[i]
62
+ document_text = documents[doc_index][:500] # Extract the first 500 characters of the document
63
+ top_documents.append((f"Document {doc_index + 1} (Similarity Score: {score:.4f})", document_text))
64
+
65
+ return top_documents
66
 
67
+ # Create a Gradio interface
68
+ iface = gr.Interface(
69
+ fn=perform_relevance_matching,
70
+ inputs=["text", gr.File()],
71
+ outputs=gr.Table(),
72
+ live=True,
73
+ title="Legal Research Assistant",
74
+ description="Enter your legal query and upload files for relevance matching.",
75
+ )
76
 
77
+ # Launch the Gradio interface
78
+ iface.launch()