Tj committed
Commit fe904d3 · 1 Parent(s): 8cc7dcc

Update app.py

Files changed (1)
  1. app.py +11 -10

app.py CHANGED
@@ -8,17 +8,17 @@ import gradio as gr
 import os
 from sklearn.neighbors import NearestNeighbors
 
+OPENAI_API_KEY = "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"
+PDF_URL = "https://www.westlondon.nhs.uk/download_file/view/1459/615"
 
 def download_pdf(url, output_path):
     urllib.request.urlretrieve(url, output_path)
 
-
 def preprocess(text):
     text = text.replace('\n', ' ')
     text = re.sub('\s+', ' ', text)
     return text
 
-
 def pdf_to_text(path, start_page=1, end_page=None):
     doc = fitz.open(path)
     total_pages = doc.page_count
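Review note: this hunk commits an OpenAI API key in plain text. A minimal sketch of the usual alternative, reading the key from the environment at startup; the `OPENAI_API_KEY` environment variable name is a common convention and an assumption here, not something this commit wires up:

import os

# Hypothetical replacement for the hardcoded constant above: pull the key
# from the environment (e.g. `export OPENAI_API_KEY=sk-...` before launch).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None:
    raise RuntimeError("OPENAI_API_KEY is not set")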
@@ -28,7 +28,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
 
     text_list = []
 
-    for i in range(start_page - 1, end_page):
+    for i in range(start_page-1, end_page):
         text = doc.load_page(i).get_text("text")
         text = preprocess(text)
         text_list.append(text)
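Review note: `start_page`/`end_page` are 1-based and inclusive here, while PyMuPDF's `load_page` indexes from 0, so `range(start_page-1, end_page)` is what maps one onto the other. A quick check of that arithmetic:

# Pages 3..5 (1-based, inclusive) map to 0-based indices 2, 3, 4.
start_page, end_page = 3, 5
assert list(range(start_page - 1, end_page)) == [2, 3, 4]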
@@ -36,22 +36,23 @@ def pdf_to_text(path, start_page=1, end_page=None):
     doc.close()
     return text_list
 
-
 def text_to_chunks(texts, word_length=150, start_page=1):
     text_toks = [t.split(' ') for t in texts]
+    page_nums = []
     chunks = []
+
     for idx, words in enumerate(text_toks):
         for i in range(0, len(words), word_length):
-            chunk = words[i:i + word_length]
-            if (i + word_length) > len(words) and (len(chunk) < word_length) and (len(text_toks) != (idx + 1)):
-                text_toks[idx + 1] = chunk + text_toks[idx + 1]
+            chunk = words[i:i+word_length]
+            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
+                len(text_toks) != (idx+1)):
+                text_toks[idx+1] = chunk + text_toks[idx+1]
                 continue
             chunk = ' '.join(chunk).strip()
-            chunk = f'[Page no. {idx + start_page}]' + ' ' + '"' + chunk + '"'
+            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
             chunks.append(chunk)
     return chunks
 
-
 class SemanticSearch:
 
     def __init__(self):
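Review note: to see what the rewrapped chunker emits, here is a standalone run of `text_to_chunks` exactly as it stands after this commit, on made-up two-page input (note that `page_nums` is introduced but never used):

def text_to_chunks(texts, word_length=150, start_page=1):
    text_toks = [t.split(' ') for t in texts]
    page_nums = []  # added by this commit, currently unused
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i+word_length]
            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
                len(text_toks) != (idx+1)):
                # Trailing fragment: prepend it to the next page's words.
                text_toks[idx+1] = chunk + text_toks[idx+1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks

# Two toy "pages" of eight words each, chunked every five words.
pages = ['one two three four five six seven eight',
         'nine ten eleven twelve thirteen fourteen fifteen sixteen']
for c in text_to_chunks(pages, word_length=5):
    print(c)
# [Page no. 1] "one two three four five"
# [Page no. 2] "six seven eight nine ten"   <- page-1 leftovers rolled forward
# [Page no. 2] "eleven twelve thirteen fourteen fifteen"
# [Page no. 2] "sixteen"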
@@ -78,7 +79,7 @@ class SemanticSearch:
     def get_text_embedding(self, texts, batch=1000):
         embeddings = []
         for i in range(0, len(texts), batch):
-            text_batch = texts[i:(i + batch)]
+            text_batch = texts[i:(i+batch)]
             emb_batch = self.use(text_batch)
             embeddings.append(emb_batch)
         embeddings = np.vstack(embeddings)
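Review note: the batching in `get_text_embedding` does not depend on the encoder; `self.use` is assigned elsewhere in app.py and is not part of this diff. A self-contained sketch of the same slice/encode/stack pattern with a stand-in encoder:

import numpy as np

def fake_encoder(texts):
    # Stand-in for self.use: one 4-dim vector per input string.
    return np.zeros((len(texts), 4))

def get_text_embedding(texts, use, batch=1000):
    # Same pattern as the method above: slice, encode, stack row-wise.
    embeddings = []
    for i in range(0, len(texts), batch):
        text_batch = texts[i:(i+batch)]
        embeddings.append(use(text_batch))
    return np.vstack(embeddings)

print(get_text_embedding(['a'] * 2500, fake_encoder, batch=1000).shape)  # (2500, 4)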
 