Update app.py
app.py CHANGED
@@ -8,17 +8,17 @@ import gradio as gr
 import os
 from sklearn.neighbors import NearestNeighbors
 
+OPENAI_API_KEY = "sk-..."
+PDF_URL = "https://www.westlondon.nhs.uk/download_file/view/1459/615"
 
 def download_pdf(url, output_path):
     urllib.request.urlretrieve(url, output_path)
 
-
 def preprocess(text):
     text = text.replace('\n', ' ')
     text = re.sub('\s+', ' ', text)
     return text
 
-
 def pdf_to_text(path, start_page=1, end_page=None):
     doc = fitz.open(path)
     total_pages = doc.page_count
@@ -28,7 +28,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
 
     text_list = []
 
-    for i in range(start_page-1, end_page):
+    for i in range(start_page-1, end_page):
         text = doc.load_page(i).get_text("text")
         text = preprocess(text)
         text_list.append(text)
@@ -36,22 +36,23 @@ def pdf_to_text(path, start_page=1, end_page=None):
     doc.close()
     return text_list
 
-
 def text_to_chunks(texts, word_length=150, start_page=1):
     text_toks = [t.split(' ') for t in texts]
+    page_nums = []
     chunks = []
+
     for idx, words in enumerate(text_toks):
         for i in range(0, len(words), word_length):
-            chunk = words[i:i+word_length]
-            if (i+word_length) > len(words) and (len(chunk) < word_length) and (len(text_toks) != (idx+1)):
-                text_toks[idx+1] = chunk + text_toks[idx+1]
+            chunk = words[i:i+word_length]
+            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
+                len(text_toks) != (idx+1)):
+                text_toks[idx+1] = chunk + text_toks[idx+1]
                 continue
            chunk = ' '.join(chunk).strip()
-            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
+            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
             chunks.append(chunk)
     return chunks
 
-
 class SemanticSearch:
 
     def __init__(self):
@@ -78,7 +79,7 @@ class SemanticSearch:
     def get_text_embedding(self, texts, batch=1000):
         embeddings = []
         for i in range(0, len(texts), batch):
-            text_batch = texts[i:(i+batch)]
+            text_batch = texts[i:(i+batch)]
             emb_batch = self.use(text_batch)
             embeddings.append(emb_batch)
         embeddings = np.vstack(embeddings)
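Note that this commit pins the OpenAI API key directly in app.py. A common alternative is to read it from the environment at startup; a minimal sketch, assuming the key has been exported as OPENAI_API_KEY before the Space launches (nothing in this commit sets that up):

import os

# Read the key from the environment rather than committing it to the repo.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before starting the app.")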
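Taken together, the functions touched here form a small pipeline: download_pdf fetches the file, pdf_to_text extracts and normalizes page text via PyMuPDF (fitz), and text_to_chunks packs the words into roughly word_length-word chunks tagged with their page number, rolling any short tail chunk into the next page's tokens. A rough usage sketch, assuming the definitions in this file plus its fitz/re/urllib imports (the output filename is illustrative):

download_pdf(PDF_URL, "downloaded.pdf")

# Leaving end_page=None relies on the function defaulting to the last page.
texts = pdf_to_text("downloaded.pdf", start_page=1)

# Each chunk looks like: [Page no. 3] "...roughly 150 words of page text..."
chunks = text_to_chunks(texts, word_length=150, start_page=1)
print(len(chunks), chunks[0][:80])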
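In SemanticSearch.get_text_embedding, texts are embedded 1000 at a time and stacked with np.vstack, which bounds peak memory; Python slicing simply returns a shorter tail for the final batch, so no padding logic is needed. The same pattern in isolation, with a stand-in embedder in place of whatever model self.use wraps:

import numpy as np

def embed_in_batches(texts, embed_fn, batch=1000):
    # Slicing past the end of the list yields the shorter final batch,
    # so the last iteration needs no special casing.
    parts = [embed_fn(texts[i:i + batch]) for i in range(0, len(texts), batch)]
    return np.vstack(parts)

# Stand-in embedder (illustrative only): one 4-dim vector per string.
fake_embed = lambda b: np.random.rand(len(b), 4)
print(embed_in_batches([f"chunk {n}" for n in range(2500)], fake_embed).shape)  # (2500, 4)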