Update postt.py
Browse files
postt.py
CHANGED
@@ -42,4 +42,25 @@ def postcor(blist):
|
|
42 |
for rw in remrow:
|
43 |
crlist.remove(rw)
|
44 |
|
45 |
-
return crlist
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
for rw in remrow:
|
43 |
crlist.remove(rw)
|
44 |
|
45 |
+
return crlist
|
46 |
+
|
47 |
+
|
48 |
+
def precor(text):
    """Clean a text string and pad selected punctuation with spaces.

    The steps run in this order:

    1. Three coordinated cancer phrases are expanded so that every item
       carries its own "cancer" (e.g. "breast and prostate cancer" ->
       "breast cancer and prostate cancer").
    2. Bracketed numeric notes, parenthesised figure/table call-outs,
       URL hosts, e-mail addresses, empty parentheses and bracketed or
       parenthesised numbers / numeric ranges are deleted.
    3. Each of the characters ``; . , ? ( [ ) ]`` is surrounded by a
       single space on both sides.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Expand the elliptical coordinations before anything else touches the text.
    expansions = (
        ("breast and prostate cancer",
         "breast cancer and prostate cancer"),
        ("prostate and breast cancer",
         "prostate cancer and breast cancer"),
        ("breast, prostate and ovarian cancer",
         "breast cancer, prostate cancer and ovarian cancer"),
    )
    for phrase, expanded in expansions:
        text = text.replace(phrase, expanded)

    # Separators accepted inside a numeric range such as "[1-3]".  The "β"
    # is kept from the original pattern (it looks like a mis-encoded en
    # dash -- TODO confirm); the real en dash (U+2013) is accepted as well
    # so that "[1–3]" is removed too.
    sep = r"[β\u2013,\-]"

    # Patterns are applied sequentially; the order is significant because a
    # later pattern may match text exposed by an earlier deletion.
    removals = (
        r"\[\d*\]",  # notes
        r"\(\s?(figure|Figure|table|Table|fig\.|Fig\.|tab\.|Tab\.)(\s?\w)*\s?\)",  # (figure)|(table)
        # Scheme-qualified URLs must go before bare "www." hosts: otherwise
        # "http://www.ex.com" loses its host first and a dangling "http://"
        # is left behind that this pattern can no longer match.
        r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+",  # http urls
        r"www\.(?:[-\w.]|(?:%[\da-fA-F]{2}))+",  # www.ex.com
        r"[a-zA-Z0-9]+[_\-.]*[a-zA-Z0-9]*@[a-zA-Z0-9]+\.\w+",  # emails
        r"\(\)",  # empty parentheses
        r"\[\s?[0-9]+" + sep + r"\s?[0-9]+\s?\]",  # [1-3]
        r"\(\s?[0-9]+" + sep + r"\s?[0-9]+\s?\)",  # (1-3)
        r"\[\s?[0-9]+" + sep + r"?\s?[0-9]*\s?\]",  # [1], [1-]
        r"\(\s?[0-9]+" + sep + r"?\s?[0-9]*\s?\)",  # (1), (1-)
    )
    for pattern in removals:
        text = re.sub(pattern, "", text)

    # Pad the punctuation marks in a single pass.  This is equivalent to
    # replacing them one after another because each replacement only adds
    # spaces around the character itself.
    padding = str.maketrans({mark: " " + mark + " " for mark in ";.,?([)]"})
    return text.translate(padding)
|