Spaces:
Build error
Build error
Commit
·
328226a
1
Parent(s):
9d4ed7f
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
-
|
| 4 |
-
import fitz
|
| 5 |
import nltk
|
| 6 |
from nltk.tokenize import word_tokenize
|
| 7 |
from nltk.corpus import stopwords
|
|
@@ -12,13 +11,14 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
| 12 |
nltk.download('punkt')
|
| 13 |
nltk.download('stopwords')
|
| 14 |
|
| 15 |
-
# Function to extract text from PDFs using PyMuPDF
|
| 16 |
def extract_text_from_pdf(pdf_path):
|
| 17 |
pdf_text = ""
|
| 18 |
-
with
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
| 22 |
return pdf_text
|
| 23 |
|
| 24 |
# Function to clean and tokenize text
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
+
import PyPDF2 # Import PyPDF2 for PDF text extraction
|
|
|
|
| 4 |
import nltk
|
| 5 |
from nltk.tokenize import word_tokenize
|
| 6 |
from nltk.corpus import stopwords
|
|
|
|
| 11 |
nltk.download('punkt')
|
| 12 |
nltk.download('stopwords')
|
| 13 |
|
| 14 |
+
# Function to extract text from PDFs using PyPDF2
|
| 15 |
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Uses PyPDF2's PEP 8-style reader API (``PdfReader``, ``.pages``,
    ``extract_text``). The legacy names previously used here
    (``PdfFileReader``, ``getNumPages``, ``getPage``, ``extractText``)
    raise ``DeprecationError`` under PyPDF2 3.x.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        A single string with the text of all pages joined in page order,
        with no separator inserted between pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open: pages are read lazily from
        # the underlying stream. `or ""` keeps the join safe if a page
        # yields no text.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts)
|
| 23 |
|
| 24 |
# Function to clean and tokenize text
|