rishabh5752 committed on
Commit
f119672
·
1 Parent(s): f1f60a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -23
app.py CHANGED
@@ -11,10 +11,7 @@ from sklearn.metrics.pairwise import cosine_similarity
11
  nltk.download('punkt')
12
  nltk.download('stopwords')
13
 
14
- # Define your dataset directory
15
- dataset_dir = '/content/LegalData'
16
-
17
- # Load and preprocess the query
18
  def extract_text_from_pdf(pdf_path):
19
  pdf_text = ""
20
  with fitz.open(pdf_path) as pdf_document:
@@ -29,26 +26,37 @@ def clean_and_tokenize(text):
29
  tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
30
  return ' '.join(tokens)
31
 
32
- # Load and preprocess the documents in your dataset
33
- documents = []
34
- for filename in os.listdir(dataset_dir):
35
- if filename.endswith('.pdf'):
36
- pdf_path = os.path.join(dataset_dir, filename)
37
- pdf_text = extract_text_from_pdf(pdf_path)
38
- clean_text = clean_and_tokenize(pdf_text)
39
- documents.append(clean_text)
40
-
41
- # Vectorize the documents
42
- tfidf_vectorizer = TfidfVectorizer()
43
- tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
44
 
45
  # Function to perform relevance matching and return top N documents
46
- def perform_relevance_matching(query, uploaded_files):
47
- # Vectorize the query
48
- query_vector = tfidf_vectorizer.transform([clean_and_tokenize(query)])
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- # Calculate cosine similarities between the query and documents
51
- cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
52
 
53
  # Rank documents by similarity score
54
  document_scores = list(enumerate(cosine_similarities[0]))
@@ -67,11 +75,15 @@ def perform_relevance_matching(query, uploaded_files):
67
  # Create a Gradio interface
68
  iface = gr.Interface(
69
  fn=perform_relevance_matching,
70
- inputs=["text", gr.File()],
 
 
 
 
71
  outputs=gr.Table(),
72
  live=True,
73
  title="Legal Research Assistant",
74
- description="Enter your legal query and upload files for relevance matching.",
75
  )
76
 
77
  # Launch the Gradio interface
 
11
  nltk.download('punkt')
12
  nltk.download('stopwords')
13
 
14
+ # Function to extract text from PDFs using PyMuPDF (fitz)
 
 
 
15
  def extract_text_from_pdf(pdf_path):
16
  pdf_text = ""
17
  with fitz.open(pdf_path) as pdf_document:
 
26
  tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
27
  return ' '.join(tokens)
28
 
29
# Function to preprocess the documents in the specified directory
def preprocess_documents(dataset_dir):
    """Extract and clean the text of every PDF file in *dataset_dir*.

    Parameters
    ----------
    dataset_dir : str
        Path to a directory containing PDF files.

    Returns
    -------
    list[str]
        One cleaned, tokenized text string per PDF, in sorted filename
        order so that document indices are deterministic across runs.
    """
    documents = []
    # sorted() fixes the iteration order; raw os.listdir order is
    # filesystem-dependent, which would make downstream ranking indices
    # non-reproducible between runs.
    for filename in sorted(os.listdir(dataset_dir)):
        # lower() also accepts .PDF / .Pdf extensions.
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(dataset_dir, filename)
            pdf_text = extract_text_from_pdf(pdf_path)
            documents.append(clean_and_tokenize(pdf_text))
    return documents
 
 
39
 
40
  # Function to perform relevance matching and return top N documents
41
+ def perform_relevance_matching(query, *uploaded_files, dataset_dir):
42
+ # Preprocess the documents in the specified dataset directory
43
+ documents = preprocess_documents(dataset_dir)
44
+
45
+ # Combine the user-uploaded files into a single document
46
+ uploaded_documents = []
47
+ for file in uploaded_files:
48
+ uploaded_text = extract_text_from_pdf(file.name)
49
+ uploaded_documents.append(uploaded_text)
50
+
51
+ # Combine the uploaded documents and query
52
+ combined_documents = uploaded_documents + [query]
53
+
54
+ # Vectorize the combined documents
55
+ tfidf_vectorizer = TfidfVectorizer()
56
+ tfidf_matrix = tfidf_vectorizer.fit_transform(documents + combined_documents)
57
 
58
+ # Calculate cosine similarities between the combined documents and the dataset
59
+ cosine_similarities = cosine_similarity(tfidf_matrix[-len(combined_documents):], tfidf_matrix[:-len(combined_documents)])
60
 
61
  # Rank documents by similarity score
62
  document_scores = list(enumerate(cosine_similarities[0]))
 
75
# Create a Gradio interface
# NOTE(review): perform_relevance_matching declares dataset_dir as a
# keyword-only parameter (it follows *uploaded_files); Gradio passes input
# values positionally, so the third input may never reach it — confirm the
# function signature accepts the directory positionally.
iface = gr.Interface(
    fn=perform_relevance_matching,
    inputs=[
        "text",                           # Query input
        # gr.File takes file_count="multiple" to enable multi-file upload;
        # `multiple=True` is not an accepted keyword argument.
        gr.File(file_count="multiple"),
        "text",                           # Dataset directory input
    ],
    outputs=gr.Table(),
    # live=True re-runs the full TF-IDF pipeline on every input change,
    # which can be slow for large document sets.
    live=True,
    title="Legal Research Assistant",
    description="Enter your legal query, upload files, and specify the dataset directory.",
)
88
 
89
  # Launch the Gradio interface