rishabh5752 committed on
Commit
328226a
·
1 Parent(s): 9d4ed7f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -7
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import os
3
-
4
- import fitz
5
  import nltk
6
  from nltk.tokenize import word_tokenize
7
  from nltk.corpus import stopwords
@@ -12,13 +11,14 @@ from sklearn.metrics.pairwise import cosine_similarity
12
  nltk.download('punkt')
13
  nltk.download('stopwords')
14
 
15
- # Function to extract text from PDFs using PyMuPDF (fitz)
16
  def extract_text_from_pdf(pdf_path):
17
  pdf_text = ""
18
- with fitz.open(pdf_path) as pdf_document:
19
- for page_num in range(pdf_document.page_count):
20
- page = pdf_document[page_num]
21
- pdf_text += page.get_text()
 
22
  return pdf_text
23
 
24
  # Function to clean and tokenize text
 
1
  import gradio as gr
2
  import os
3
+ import PyPDF2 # Import PyPDF2 for PDF text extraction
 
4
  import nltk
5
  from nltk.tokenize import word_tokenize
6
  from nltk.corpus import stopwords
 
11
  nltk.download('punkt')
12
  nltk.download('stopwords')
13
 
14
+ # Function to extract text from PDFs using PyPDF2
15
  def extract_text_from_pdf(pdf_path):
16
  pdf_text = ""
17
+ with open(pdf_path, 'rb') as pdf_file:
18
+ pdf_reader = PyPDF2.PdfFileReader(pdf_file)
19
+ for page_num in range(pdf_reader.getNumPages()):
20
+ page = pdf_reader.getPage(page_num)
21
+ pdf_text += page.extractText()
22
  return pdf_text
23
 
24
  # Function to clean and tokenize text