added files to run
- extract.py +21 -0
- preprocess.py +25 -0
- requirements.txt +5 -0
- retrieve.py +35 -0
extract.py
ADDED
@@ -0,0 +1,21 @@
import pdfplumber


def extract_text_from_pdfs(pdf_files):
    """
    Extracts text from a list of PDF files.

    Args:
        pdf_files (list): List of paths to PDF files.

    Returns:
        list: List of extracted text from each PDF.
    """
    all_texts = []
    for pdf_file in pdf_files:
        with pdfplumber.open(pdf_file) as pdf:
            text = ""
            for page in pdf.pages:
                # extract_text() returns None for pages without a text
                # layer, so fall back to "" to avoid a TypeError.
                text += page.extract_text() or ""
        all_texts.append(text)
    return all_texts
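A minimal usage sketch for this module (the path sample.pdf is a placeholder, not a file from this Space):

    from extract import extract_text_from_pdfs

    # "sample.pdf" is a hypothetical path; pass any list of PDF paths.
    texts = extract_text_from_pdfs(["sample.pdf"])
    print(len(texts), texts[0][:200])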
preprocess.py
ADDED
@@ -0,0 +1,25 @@
import string

import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize


def preprocess_text(texts):
    """
    Preprocesses a list of texts by converting to lowercase, removing punctuation, and tokenizing.

    Args:
        texts (list): List of text strings to preprocess.

    Returns:
        list: List of preprocessed and tokenized texts.
    """
    processed_texts = []
    for text in texts:
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        processed_texts.append(tokens)
    return processed_texts
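A quick sketch of what preprocess_text produces (the input string is made up for illustration):

    from preprocess import preprocess_text

    tokens = preprocess_text(["Hello, World! This is a test."])
    print(tokens)
    # [['hello', 'world', 'this', 'is', 'a', 'test']]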
requirements.txt
ADDED
@@ -0,0 +1,5 @@
streamlit
pdfplumber  # required by extract.py
nltk
scikit-learn
openai
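The dependencies install with the standard pip workflow:

    pip install -r requirements.txt

scikit-learn pulls in numpy, which retrieve.py imports directly; streamlit and openai are not imported by any of the four files in this commit, so they presumably serve app code outside this change.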
retrieve.py
ADDED
@@ -0,0 +1,35 @@
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def create_vectorizer(processed_texts):
    """
    Creates a TF-IDF vectorizer and transforms the texts.

    Args:
        processed_texts (list): List of preprocessed and tokenized texts.

    Returns:
        tuple: TF-IDF vectorizer and transformed text matrix.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([' '.join(text) for text in processed_texts])
    return vectorizer, X


def retrieve(query, X, vectorizer, top_k=5):
    """
    Retrieves the top-k most relevant texts for a given query.

    Args:
        query (str): Query string.
        X (matrix): TF-IDF transformed text matrix.
        vectorizer (TfidfVectorizer): TF-IDF vectorizer.
        top_k (int): Number of top results to retrieve.

    Returns:
        list: Indices of the top-k most relevant texts.
    """
    query_vec = vectorizer.transform([query])
    # Use sparse matrix multiplication; np.dot does not handle scipy
    # sparse inputs reliably. TfidfVectorizer rows are L2-normalized by
    # default, so this dot product is the cosine similarity.
    scores = (X @ query_vec.T).toarray().ravel()
    top_indices = np.argsort(scores)[::-1][:top_k]
    return top_indices
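Putting the modules together, a minimal end-to-end sketch (the PDF paths and query are hypothetical; the Streamlit app that would normally drive this pipeline is not part of this commit):

    from extract import extract_text_from_pdfs
    from preprocess import preprocess_text
    from retrieve import create_vectorizer, retrieve

    # Placeholder inputs for illustration.
    texts = extract_text_from_pdfs(["doc1.pdf", "doc2.pdf"])
    tokens = preprocess_text(texts)
    vectorizer, X = create_vectorizer(tokens)

    indices = retrieve("what is the refund policy?", X, vectorizer, top_k=2)
    for i in indices:
        print(i, texts[i][:100])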