rishabh5752 committed on
Commit
f119672
·
1 Parent(s): f1f60a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -23
app.py CHANGED
@@ -11,10 +11,7 @@ from sklearn.metrics.pairwise import cosine_similarity
11
  nltk.download('punkt')
12
  nltk.download('stopwords')
13
 
14
- # Define your dataset directory
15
- dataset_dir = '/content/LegalData'
16
-
17
- # Load and preprocess the query
18
  def extract_text_from_pdf(pdf_path):
19
  pdf_text = ""
20
  with fitz.open(pdf_path) as pdf_document:
@@ -29,26 +26,37 @@ def clean_and_tokenize(text):
29
  tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
30
  return ' '.join(tokens)
31
 
32
- # Load and preprocess the documents in your dataset
33
- documents = []
34
- for filename in os.listdir(dataset_dir):
35
- if filename.endswith('.pdf'):
36
- pdf_path = os.path.join(dataset_dir, filename)
37
- pdf_text = extract_text_from_pdf(pdf_path)
38
- clean_text = clean_and_tokenize(pdf_text)
39
- documents.append(clean_text)
40
-
41
- # Vectorize the documents
42
- tfidf_vectorizer = TfidfVectorizer()
43
- tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
44
 
45
  # Function to perform relevance matching and return top N documents
46
- def perform_relevance_matching(query, uploaded_files):
47
- # Vectorize the query
48
- query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- # Calculate cosine similarities between the query and documents
51
- cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
52
 
53
  # Rank documents by similarity score
54
  document_scores = list(enumerate(cosine_similarities[0]))
@@ -67,11 +75,15 @@ def perform_relevance_matching(query, uploaded_files):
67
  # Create a Gradio interface
68
  iface = gr.Interface(
69
  fn=perform_relevance_matching,
70
- inputs=["text", gr.File()],
 
 
 
 
71
  outputs=gr.Table(),
72
  live=True,
73
  title="Legal Research Assistant",
74
- description="Enter your legal query and upload files for relevance matching.",
75
  )
76
 
77
  # Launch the Gradio interface
 
11
  nltk.download('punkt')
12
  nltk.download('stopwords')
13
 
14
+ # Function to extract text from PDFs using PyMuPDF (fitz)
 
 
 
15
  def extract_text_from_pdf(pdf_path):
16
  pdf_text = ""
17
  with fitz.open(pdf_path) as pdf_document:
 
26
  tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
27
  return ' '.join(tokens)
28
 
29
# Function to preprocess the documents in the specified directory
def preprocess_documents(dataset_dir):
    """Extract and clean the text of every PDF file in *dataset_dir*.

    Parameters
    ----------
    dataset_dir : str
        Path to a directory containing PDF files.

    Returns
    -------
    list[str]
        One cleaned, tokenized text string per PDF, in sorted filename
        order so that document indices are deterministic across runs.
    """
    documents = []
    # sorted() fixes the iteration order; raw os.listdir order is
    # filesystem-dependent, which would make downstream ranking indices
    # non-reproducible between runs.
    for filename in sorted(os.listdir(dataset_dir)):
        # lower() also accepts .PDF / .Pdf extensions.
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(dataset_dir, filename)
            pdf_text = extract_text_from_pdf(pdf_path)
            documents.append(clean_and_tokenize(pdf_text))
    return documents
 
 
39
 
40
  # Function to perform relevance matching and return top N documents
41
+ def perform_relevance_matching(query, *uploaded_files, dataset_dir):
42
+ # Preprocess the documents in the specified dataset directory
43
+ documents = preprocess_documents(dataset_dir)
44
+
45
+ # Combine the user-uploaded files into a single document
46
+ uploaded_documents = []
47
+ for file in uploaded_files:
48
+ uploaded_text = extract_text_from_pdf(file.name)
49
+ uploaded_documents.append(uploaded_text)
50
+
51
+ # Combine the uploaded documents and query
52
+ combined_documents = uploaded_documents + [query]
53
+
54
+ # Vectorize the combined documents
55
+ tfidf_vectorizer = TfidfVectorizer()
56
+ tfidf_matrix = tfidf_vectorizer.fit_transform(documents + combined_documents)
57
 
58
+ # Calculate cosine similarities between the combined documents and the dataset
59
+ cosine_similarities = cosine_similarity(tfidf_matrix[-len(combined_documents):], tfidf_matrix[:-len(combined_documents)])
60
 
61
  # Rank documents by similarity score
62
  document_scores = list(enumerate(cosine_similarities[0]))
 
75
# Create a Gradio interface
# NOTE(review): perform_relevance_matching declares dataset_dir as a
# keyword-only parameter (it follows *uploaded_files); Gradio passes input
# values positionally, so the third input may never reach it — confirm the
# function signature accepts the directory positionally.
iface = gr.Interface(
    fn=perform_relevance_matching,
    inputs=[
        "text",                           # Query input
        # gr.File takes file_count="multiple" to enable multi-file upload;
        # `multiple=True` is not an accepted keyword argument.
        gr.File(file_count="multiple"),
        "text",                           # Dataset directory input
    ],
    outputs=gr.Table(),
    # live=True re-runs the full TF-IDF pipeline on every input change,
    # which can be slow for large document sets.
    live=True,
    title="Legal Research Assistant",
    description="Enter your legal query, upload files, and specify the dataset directory.",
)
88
 
89
  # Launch the Gradio interface