# Always open text file for write with UTF-8 (#3688)

### What problem does this PR solve?
Always open text files for writing with UTF-8 encoding. Close #932

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)

### Changed files

- `api/utils/file_utils.py` (+1, -1)
- `deepdoc/parser/resume/entities/schools.py` (+6, -3)
- `deepdoc/vision/t_ocr.py` (+1, -1)
- `deepdoc/vision/t_recognizer.py` (+1, -1)
- `rag/benchmark.py` (+2, -2)
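
For context, a minimal sketch of the failure mode this PR addresses (the file name and sample string below are made up for illustration): when `encoding=` is omitted, Python's `open()` falls back to `locale.getpreferredencoding(False)`, so on a Windows host with a cp1252 or GBK locale the writers touched in this PR could raise `UnicodeEncodeError` on CJK text or emit files in a locale-dependent encoding. Pinning `encoding='utf-8'` makes the output identical on every platform.

```python
import locale

# Locale-dependent default that open() uses when encoding= is omitted,
# e.g. 'cp1252' on many Windows installations.
print(locale.getpreferredencoding(False))

text = "清华大学"  # CJK content like the resume/OCR data this repo handles

# Before the PR (implicit, locale-dependent encoding); on a cp1252 locale
# the write raises UnicodeEncodeError:
#   with open("out.txt", "w") as f:
#       f.write(text)

# After the PR: the encoding is pinned, so the write behaves the same everywhere.
with open("out.txt", "w", encoding="utf-8") as f:
    f.write(text)
```
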
#### `api/utils/file_utils.py`

```diff
@@ -146,7 +146,7 @@ def rewrite_yaml_conf(conf_path, config):
 
 
 def rewrite_json_file(filepath, json_data):
-    with open(filepath, "w") as f:
+    with open(filepath, "w", encoding='utf-8') as f:
         json.dump(json_data, f, indent=4, separators=(",", ": "))
         f.close()
 
```
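
An aside on the hunk above, not part of the PR: inside a `with` block the trailing `f.close()` is a no-op, because the context manager already closes the file on exit. A sketch of the same helper without it, behavior otherwise unchanged:

```python
import json

def rewrite_json_file(filepath, json_data):
    # The context manager flushes and closes the file when the block exits,
    # so no explicit close() is needed.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4, separators=(",", ": "))
```
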
#### `deepdoc/parser/resume/entities/schools.py`

```diff
@@ -11,7 +11,10 @@
 # limitations under the License.
 #
 
-import os, json, re, copy
+import os
+import json
+import re
+import copy
 import pandas as pd
 current_file_path = os.path.dirname(os.path.abspath(__file__))
 TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
@@ -23,7 +26,7 @@ GOOD_SCH = set([re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH])
 def loadRank(fnm):
     global TBL
     TBL["rank"] = 1000000
-    with open(fnm, "r",encoding='utf-8') as f:
+    with open(fnm, "r", encoding='utf-8') as f:
         while True:
             l = f.readline()
             if not l:break
@@ -32,7 +35,7 @@ def loadRank(fnm):
                 nm,rk = l[0].strip(),int(l[1])
                 #assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
                 TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
-            except Exception as e:
+            except Exception:
                 pass
 
 
```
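
The explicit encoding matters on the read side here as well: the rank file parsed by `loadRank` carries Chinese school names (matched against `name_cn`), and decoding them with the locale default could raise `UnicodeDecodeError` or mangle the names. A self-contained sketch of that round trip; the file name and rows are invented for illustration:

```python
# Hypothetical rank data in the tab-separated layout loadRank expects.
rows = "清华大学\t1\n北京大学\t2\n"

with open("rank.tsv", "w", encoding="utf-8") as f:
    f.write(rows)

# Reading it back the way loadRank now does, with an explicit encoding,
# yields the same strings on any platform; the locale default would fail
# or mis-decode the names on a non-UTF-8 system.
with open("rank.tsv", "r", encoding="utf-8") as f:
    for line in f:
        name, rank = line.rstrip("\n").split("\t")
        print(name, int(rank))
```
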
#### `deepdoc/vision/t_ocr.py`

```diff
@@ -41,7 +41,7 @@ def main(args):
                 "score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]]
         img = draw_box(images[i], bxs, ["ocr"], 1.)
         img.save(outputs[i], quality=95)
-        with open(outputs[i] + ".txt", "w+") as f:
+        with open(outputs[i] + ".txt", "w+", encoding='utf-8') as f:
             f.write("\n".join([o["text"] for o in bxs]))
 
 
```
#### `deepdoc/vision/t_recognizer.py`

```diff
@@ -50,7 +50,7 @@ def main(args):
         if args.mode.lower() == "tsr":
             #lyt = [t for t in lyt if t["type"] == "table column"]
             html = get_table_html(images[i], lyt, ocr)
-            with open(outputs[i] + ".html", "w+") as f:
+            with open(outputs[i] + ".html", "w+", encoding='utf-8') as f:
                 f.write(html)
         lyt = [{
             "type": t["label"],
```
#### `rag/benchmark.py`

```diff
@@ -237,8 +237,8 @@ class Benchmark:
                 scores = sorted(scores, key=lambda kk: kk[1])
                 for score in scores[:10]:
                     f.write('- text: ' + str(texts[score[0]]) + '\t qrel: ' + str(score[1]) + '\n')
-        json.dump(qrels, open(os.path.join(file_path, dataset + '.qrels.json'), "w+"), indent=2)
-        json.dump(run, open(os.path.join(file_path, dataset + '.run.json'), "w+"), indent=2)
+        json.dump(qrels, open(os.path.join(file_path, dataset + '.qrels.json'), "w+", encoding='utf-8'), indent=2)
+        json.dump(run, open(os.path.join(file_path, dataset + '.run.json'), "w+", encoding='utf-8'), indent=2)
         print(os.path.join(file_path, dataset + '_result.md'), 'Saved!')
 
     def __call__(self, dataset, file_path, miracl_corpus=''):
```
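
A final hedged note on the benchmark hunk: `json.dump(qrels, open(...))` never closes the handle explicitly, leaving the flush to the garbage collector. A sketch of an equivalent write, using the hypothetical helper name `dump_results` and the same paths as the diff, that keeps the UTF-8 fix and closes the files deterministically:

```python
import json
import os

def dump_results(file_path, dataset, qrels, run):
    # Context managers guarantee the JSON is flushed to disk and the
    # handles closed as soon as each block exits.
    with open(os.path.join(file_path, dataset + '.qrels.json'), "w+", encoding='utf-8') as f:
        json.dump(qrels, f, indent=2)
    with open(os.path.join(file_path, dataset + '.run.json'), "w+", encoding='utf-8') as f:
        json.dump(run, f, indent=2)
```
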