dexay committed on
Commit
03c1a90
·
1 Parent(s): 44746f4

Update postt.py

Browse files
Files changed (1) hide show
  1. postt.py +22 -1
postt.py CHANGED
@@ -42,4 +42,25 @@ def postcor(blist):
42
  for rw in remrow:
43
  crlist.remove(rw)
44
 
45
- return crlist
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  for rw in remrow:
43
  crlist.remove(rw)
44
 
45
+ return crlist
46
+
47
+
48
def precor(text):
    """Clean raw text before further processing.

    The following steps are applied in order:

    1. Expand a few coordinated cancer phrases so each cancer is named in
       full (e.g. "breast and prostate cancer" becomes
       "breast cancer and prostate cancer").
    2. Delete bracketed note markers, parenthesised figure/table
       references, URLs, e-mail addresses, empty parentheses and
       bracketed/parenthesised numbers or number ranges.
    3. Surround each of the characters ``; . , ? ( [ ) ]`` with single
       spaces.

    Args:
        text: The input string.

    Returns:
        The cleaned string. Whitespace left behind by deletions is not
        collapsed.
    """
    cleaned = text

    # Spell out each cancer name in coordinated phrases.
    phrase_expansions = (
        ("breast and prostate cancer", "breast cancer and prostate cancer"),
        ("prostate and breast cancer", "prostate cancer and breast cancer"),
        ("breast, prostate and ovarian cancer",
         "breast cancer, prostate cancer and ovarian cancer"),
    )
    for short_form, expanded_form in phrase_expansions:
        cleaned = cleaned.replace(short_form, expanded_form)

    # Patterns whose matches are deleted, applied one after another.
    noise_patterns = (
        r"\[\d*\]",  # notes
        r'\(\s?(figure|Figure|table|Table|fig\.|Fig\.|tab\.|Tab\.)(\s?\w)*\s?\)',  # (figure)|(table)
        # Scheme-prefixed URLs must be stripped BEFORE bare "www." hosts:
        # otherwise "http://www.ex.com" loses its host to the "www." pass
        # and a dangling "http://" is left that this pattern cannot match.
        r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+',  # http urls
        r'www\.(?:[-\w.]|(?:%[\da-fA-F]{2}))+',  # www.ex.com
        r'[a-zA-Z0-9]+[_\-.]*[a-zA-Z0-9]*@[a-zA-Z0-9]+\.\w+',  # emails
        r'\(\)',  # empty parentheses
        r'\[\s?[0-9]+(\–|,|\-)\s?[0-9]+\s?\]',  # [1-3], [1,3]
        r'\(\s?[0-9]+(\–|,|\-)\s?[0-9]+\s?\)',  # (1-3), (1,3)
        r'\[\s?[0-9]+(\–|,|\-)?\s?[0-9]*\s?\]',  # [1], [1-]
        r'\(\s?[0-9]+(\–|,|\-)?\s?[0-9]*\s?\)',  # (1), (1-)
    )
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, "", cleaned)

    # Pad punctuation with spaces in a single pass over the string.
    padding = {ord(ch): " " + ch + " " for ch in ";.,?([)]"}
    return cleaned.translate(padding)