Spaces:

rishabh5752
/

LegalPaperSorter

Build error

rishabh5752 committed on Sep 14, 2023

Commit

328226a

1 Parent(s): 9d4ed7f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import os
-import fitz
 import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
@@ -12,13 +11,14 @@ from sklearn.metrics.pairwise import cosine_similarity
 nltk.download('punkt')
 nltk.download('stopwords')
-# Function to extract text from PDFs using PyMuPDF (fitz)
 def extract_text_from_pdf(pdf_path):
     pdf_text = ""
-    with fitz.open(pdf_path) as pdf_document:
-        for page_num in range(pdf_document.page_count):
-            page = pdf_document[page_num]
-            pdf_text += page.get_text()
     return pdf_text
 # Function to clean and tokenize text

 import gradio as gr
 import os
+import PyPDF2  # Import PyPDF2 for PDF text extraction
 import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 nltk.download('punkt')
 nltk.download('stopwords')
+# Function to extract text from PDFs using PyPDF2
 def extract_text_from_pdf(pdf_path):
     """Return the text of every page of the PDF at *pdf_path*, concatenated.

     The file is opened in binary mode and read with PyPDF2. Page texts are
     joined in page order with no separator between them.
     """
     # PyPDF2 3.0 removed PdfFileReader / getNumPages / getPage / extractText.
     # Prefer the current names and fall back to the legacy ones so the
     # function keeps working on older PyPDF2 installs as well.
     reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
     page_texts = []
     with open(pdf_path, 'rb') as pdf_file:
         pdf_reader = reader_cls(pdf_file)
         # Pages must be read while the underlying file is still open.
         for page in pdf_reader.pages:
             extract = getattr(page, "extract_text", None) or page.extractText
             # Guard against a page yielding None so the join cannot fail.
             page_texts.append(extract() or "")
     return "".join(page_texts)
 # Function to clean and tokenize text