Update app.py
app.py CHANGED
@@ -8,17 +8,17 @@ import gradio as gr
 import os
 from sklearn.neighbors import NearestNeighbors
 
+OPENAI_API_KEY = "sk-..."
+PDF_URL = "https://www.westlondon.nhs.uk/download_file/view/1459/615"
 
 def download_pdf(url, output_path):
     urllib.request.urlretrieve(url, output_path)
 
-
 def preprocess(text):
     text = text.replace('\n', ' ')
     text = re.sub('\s+', ' ', text)
     return text
 
-
 def pdf_to_text(path, start_page=1, end_page=None):
     doc = fitz.open(path)
     total_pages = doc.page_count
@@ -28,7 +28,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
 
     text_list = []
 
-    for i in range(start_page-1, end_page):
+    for i in range(start_page-1, end_page):
         text = doc.load_page(i).get_text("text")
         text = preprocess(text)
         text_list.append(text)
@@ -36,22 +36,23 @@ def pdf_to_text(path, start_page=1, end_page=None):
     doc.close()
     return text_list
 
-
 def text_to_chunks(texts, word_length=150, start_page=1):
     text_toks = [t.split(' ') for t in texts]
+    page_nums = []
     chunks = []
+
     for idx, words in enumerate(text_toks):
         for i in range(0, len(words), word_length):
-            chunk = words[i:i+word_length]
-            if (i+word_length) > len(words) and (len(chunk) < word_length) and (len(text_toks) != (idx+1)):
-                text_toks[idx+1] = chunk + text_toks[idx+1]
+            chunk = words[i:i+word_length]
+            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
+                len(text_toks) != (idx+1)):
+                text_toks[idx+1] = chunk + text_toks[idx+1]
                 continue
            chunk = ' '.join(chunk).strip()
-            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
+            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
             chunks.append(chunk)
     return chunks
 
-
 class SemanticSearch:
 
     def __init__(self):
@@ -78,7 +79,7 @@ class SemanticSearch:
     def get_text_embedding(self, texts, batch=1000):
         embeddings = []
         for i in range(0, len(texts), batch):
-            text_batch = texts[i:(i+batch)]
+            text_batch = texts[i:(i+batch)]
             emb_batch = self.use(text_batch)
             embeddings.append(emb_batch)
         embeddings = np.vstack(embeddings)
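Note that this commit pins the OpenAI API key directly in app.py. A common alternative is to read it from the environment at startup; a minimal sketch, assuming the key has been exported as OPENAI_API_KEY before the Space launches (nothing in this commit sets that up):

import os

# Read the key from the environment rather than committing it to the repo.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before starting the app.")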
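Taken together, the functions touched here form a small pipeline: download_pdf fetches the file, pdf_to_text extracts and normalizes page text via PyMuPDF (fitz), and text_to_chunks packs the words into roughly word_length-word chunks tagged with their page number, rolling any short tail chunk into the next page's tokens. A rough usage sketch, assuming the definitions in this file plus its fitz/re/urllib imports (the output filename is illustrative):

download_pdf(PDF_URL, "downloaded.pdf")

# Leaving end_page=None relies on the function defaulting to the last page.
texts = pdf_to_text("downloaded.pdf", start_page=1)

# Each chunk looks like: [Page no. 3] "...roughly 150 words of page text..."
chunks = text_to_chunks(texts, word_length=150, start_page=1)
print(len(chunks), chunks[0][:80])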
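In SemanticSearch.get_text_embedding, texts are embedded 1000 at a time and stacked with np.vstack, which bounds peak memory; Python slicing simply returns a shorter tail for the final batch, so no padding logic is needed. The same pattern in isolation, with a stand-in embedder in place of whatever model self.use wraps:

import numpy as np

def embed_in_batches(texts, embed_fn, batch=1000):
    # Slicing past the end of the list yields the shorter final batch,
    # so the last iteration needs no special casing.
    parts = [embed_fn(texts[i:i + batch]) for i in range(0, len(texts), batch)]
    return np.vstack(parts)

# Stand-in embedder (illustrative only): one 4-dim vector per string.
fake_embed = lambda b: np.random.rand(len(b), 4)
print(embed_in_batches([f"chunk {n}" for n in range(2500)], fake_embed).shape)  # (2500, 4)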