Commit 94c41db
Parent(s): a8118cc
removed uncleaned summaries

Files changed:
- all_summaries.csv +2 -2
- related_summaries.py +13 -2
- utils.py +45 -0
all_summaries.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ee0536b4e7e2297521a1f11b6f18e788d77f70129c8651fbbb3e7044782e3675
+size 28540700
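The CSV is tracked with Git LFS, so the diff above only touches the pointer file: per the LFS v1 spec, oid is the SHA-256 of the real file's contents and size is its length in bytes. A minimal sketch for verifying a locally downloaded copy against the new pointer (the local path is an assumption):

import hashlib
import os

path = 'all_summaries.csv'  # hypothetical path to the LFS-resolved file

# oid is the SHA-256 of the file contents; size is its length in bytes.
digest = hashlib.sha256(open(path, 'rb').read()).hexdigest()
assert digest == 'ee0536b4e7e2297521a1f11b6f18e788d77f70129c8651fbbb3e7044782e3675'
assert os.path.getsize(path) == 28540700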
related_summaries.py CHANGED
@@ -2,6 +2,7 @@ import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModel, set_seed
 from tqdm import tqdm
+from utils import clean_text
 
 from model import MimicTransformer
 set_seed(42)
@@ -17,7 +18,7 @@ mimic.eval()
 mimic.cuda()
 tokenizer = mimic.tokenizer
 
-summaries = pd.read_csv('
+summaries = pd.read_csv('all_summaries_backup.csv')['SUMMARIES']
 
 def mean_pooling(model_output, attention_mask):
     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
@@ -34,10 +35,20 @@ def get_model_outputs(text):
 
 return_tensors = torch.zeros(size=(10000, 738))
 
-
+non_defunct_summaries = []
+
+for i, summary in tqdm(enumerate(summaries[:50000])):
+    cleaned = clean_text(summary)
+    if len(non_defunct_summaries) == 10000:
+        break
+    if len(cleaned) > 100:
+        non_defunct_summaries.append(cleaned)
+
+for i, summary in tqdm(enumerate(non_defunct_summaries)):
     res = get_model_outputs(text=summary)
     return_tensors[i, :] = res.detach().cpu()
     # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
     # sentence_embeddings = sentence_embeddings/sentence_embeddings.norm(dim=1)[:,None]
 
+pd.DataFrame(data={'SUMMARIES':non_defunct_summaries}).to_csv('all_summaries.csv', index=False)
 torch.save(return_tensors, f='discharge_embeddings.pt')
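The new pass filters the raw summaries before embedding: it scans the first 50000 rows, cleans each one, keeps only cleaned texts longer than 100 characters, and stops once 10000 survivors are collected. A self-contained sketch of that filter logic with toy data and smaller limits (the stand-in clean_text and the tiny inputs are assumptions for illustration):

def clean_text(s):  # stand-in for utils.clean_text
    return s.strip()

summaries = ['  short  ', 'x' * 150, 'y' * 200, '']
non_defunct_summaries = []
for i, summary in enumerate(summaries):      # the diff scans summaries[:50000]
    cleaned = clean_text(summary)
    if len(non_defunct_summaries) == 2:      # the diff caps at 10000
        break
    if len(cleaned) > 100:                   # drop near-empty (defunct) reports
        non_defunct_summaries.append(cleaned)

assert non_defunct_summaries == ['x' * 150, 'y' * 200]

Note that the cap is checked before appending, so the loop only breaks on the iteration after the 10000th survivor is collected; the result is the same, at the cost of one extra iteration.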
utils.py CHANGED
@@ -3,6 +3,7 @@ import json
 import pandas as pd
 import ssl
 import torch
+import re
 from pprint import pprint
 from captum.attr import visualization
 
@@ -20,6 +21,50 @@ class PyTMinMaxScalerVectorized(object):
         tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
         return tensor
 
+def find_end(text):
+    """Find the end of the report."""
+    ends = [len(text)]
+    patterns = [
+        re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
+        re.compile(r'\n {3,}DR.', re.I),
+        re.compile(r'[ ]{1,}RADLINE ', re.I),
+        re.compile(r'.*electronically signed on', re.I),
+        re.compile(r'M\[0KM\[0KM')
+    ]
+    for pattern in patterns:
+        matchobj = pattern.search(text)
+        if matchobj:
+            ends.append(matchobj.start())
+    return min(ends)
+
+def pattern_repl(matchobj):
+    """
+    Return a replacement string to be used for match object
+    """
+    return ' '.rjust(len(matchobj.group(0)))
+
+def clean_text(text):
+    """
+    Clean text
+    """
+    # Replace [**Patterns**] with spaces.
+    text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
+    # Replace `_` with spaces.
+    text = re.sub(r'_', ' ', text)
+
+    start = 0
+    end = find_end(text)
+    new_text = ''
+    if start > 0:
+        new_text += ' ' * start
+    new_text = text[start:end]
+
+    # make sure the new text has the same length of old text.
+    if len(text) - end > 0:
+        new_text += ' ' * (len(text) - end)
+    return new_text
+
 def get_drg_link(drg_code):
     return f'https://www.aapc.com/codes/icd9-codes/{drg_code}'
 
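A hedged usage sketch for the new cleaning helpers (assumes the utils.py above is importable; the sample note is made up). clean_text blanks de-identification placeholders and underscores, truncates at the first signature marker that find_end locates, and pads with spaces so the output keeps the same length as the input:

from utils import clean_text

note = 'Pt [**Name**] stable.\n    DR. signature follows'
cleaned = clean_text(note)

assert len(cleaned) == len(note)  # character offsets are preserved
assert '[**' not in cleaned      # [**...**] spans replaced by spaces
assert 'DR.' not in cleaned      # text after the signature marker blanked

Preserving length looks deliberate: downstream attribution tooling (captum is already imported here) can keep character offsets into the original note valid after cleaning.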