rishabh5752 committed
Commit 268e7b8 · 1 Parent(s): 328226a

Delete app.py

Files changed (1)
  1. app.py +0 -91
app.py DELETED
@@ -1,91 +0,0 @@
- import gradio as gr
- import os
- import PyPDF2  # PDF text extraction
- import nltk
- from nltk.tokenize import word_tokenize
- from nltk.corpus import stopwords
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
-
- # Download the required NLTK resources
- nltk.download('punkt')
- nltk.download('stopwords')
-
- # Extract the text of every page of a PDF using PyPDF2
- def extract_text_from_pdf(pdf_path):
-     pdf_text = ""
-     with open(pdf_path, 'rb') as pdf_file:
-         pdf_reader = PyPDF2.PdfReader(pdf_file)
-         for page in pdf_reader.pages:
-             pdf_text += page.extract_text() or ""
-     return pdf_text
-
- # Lowercase, tokenize, and drop stopwords and punctuation
- def clean_and_tokenize(text):
-     stop_words = set(stopwords.words('english'))  # build the stopword set once
-     tokens = word_tokenize(text.lower())
-     tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
-     return ' '.join(tokens)
-
- # Extract and clean the text of every PDF in the dataset directory
- def preprocess_documents(dataset_dir):
-     documents = []
-     for filename in os.listdir(dataset_dir):
-         if filename.endswith('.pdf'):
-             pdf_path = os.path.join(dataset_dir, filename)
-             pdf_text = extract_text_from_pdf(pdf_path)
-             clean_text = clean_and_tokenize(pdf_text)
-             documents.append(clean_text)
-     return documents
-
- # Rank the dataset documents against the query and uploads, returning the top N
- def perform_relevance_matching(query, uploaded_files, dataset_dir):
-     # Preprocess the documents in the specified dataset directory
-     documents = preprocess_documents(dataset_dir)
-
-     # Extract the text of each user-uploaded file
-     uploaded_documents = []
-     for file in uploaded_files or []:
-         uploaded_text = extract_text_from_pdf(file.name)
-         uploaded_documents.append(uploaded_text)
-
-     # Combine the uploaded documents and the query
-     combined_documents = uploaded_documents + [query]
-
-     # Vectorize the dataset documents and the combined documents together
-     tfidf_vectorizer = TfidfVectorizer()
-     tfidf_matrix = tfidf_vectorizer.fit_transform(documents + combined_documents)
-
-     # Cosine similarity of each combined document against each dataset document,
-     # averaged so the query and every upload contribute to the ranking
-     cosine_similarities = cosine_similarity(tfidf_matrix[-len(combined_documents):], tfidf_matrix[:-len(combined_documents)])
-     scores = cosine_similarities.mean(axis=0)
-
-     # Rank the dataset documents by similarity score
-     sorted_documents = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
-
-     # Keep the top N documents, previewing the first 500 characters of each
-     top_n = 5
-     top_documents = []
-     for doc_index, score in sorted_documents[:top_n]:
-         document_text = documents[doc_index][:500]
-         top_documents.append((f"Document {doc_index + 1} (Similarity Score: {score:.4f})", document_text))
-
-     return top_documents
-
- # Create the Gradio interface
- iface = gr.Interface(
-     fn=perform_relevance_matching,
-     inputs=[
-         gr.Textbox(label="Query"),              # legal query text
-         gr.File(file_count="multiple"),         # one or more uploaded PDFs
-         gr.Textbox(label="Dataset directory"),  # path to the PDF dataset
-     ],
-     outputs=gr.Dataframe(headers=["Document", "Preview"]),
-     title="Legal Research Assistant",
-     description="Enter your legal query, upload files, and specify the dataset directory.",
- )
-
- # Launch the Gradio interface
- iface.launch()
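
For reference, the core of the deleted app is a standard TF-IDF relevance ranking. The sketch below is a minimal, self-contained illustration of that step, with hypothetical toy strings standing in for the cleaned PDF text; it is not code from this repository.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical stand-ins for the cleaned PDF corpus and a user query
documents = [
    "contract breach damages remedy",
    "patent infringement claim construction",
    "criminal procedure search warrant evidence",
]
query = "damages for breach of contract"

# Fit TF-IDF over the corpus plus the query, as app.py did
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(documents + [query])

# Cosine similarity of the query (last row) against each corpus document
scores = cosine_similarity(matrix[-1], matrix[:-1])[0]

# Rank documents by descending similarity
for doc_index, score in sorted(enumerate(scores), key=lambda x: x[1], reverse=True):
    print(f"Document {doc_index + 1}: {score:.4f}")
```

Because the vectorizer is fit on the corpus and the query together, both share one vocabulary, which is what makes the row-against-rows cosine comparison meaningful.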