yaowenxu commited on
Commit
b50ea7f
·
1 Parent(s): 7afc79c

Update app.py

Browse files

Signed-off-by: Michael_Xu <[email protected]>

.gitattributes CHANGED
@@ -1,36 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- data/learner_examplar_1.1.json filter=lfs diff=lfs merge=lfs -text
 
1
+ # This file is used to manage Git LFS (Large File Storage) for specific files in the repository.
2
+ data/eng/1.0/learner_examplar_1.0.json filter=lfs diff=lfs merge=lfs -text
3
+ data/eng/1.1/learner_examplar_1.1.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile CHANGED
@@ -1,6 +1,8 @@
1
  FROM python:3.10
2
 
3
- # 安装系统依赖
 
 
4
  RUN apt-get update && apt-get install -y \
5
  git \
6
  cmake \
@@ -16,28 +18,19 @@ RUN pip install -U --no-cache-dir \
16
  spacy==3.5.0 \
17
  torch==1.13.1
18
 
19
- # 复制依赖文件
20
  COPY requirements.txt .
21
 
22
- # 安装 Python 依赖
23
  RUN pip install -r requirements.txt
24
 
25
  RUN pip install -U --no-cache-dir \
26
  numpy==1.24.1
27
 
28
- # 下载 spaCy 模型
29
  RUN python -m spacy download en_core_web_sm
30
 
31
- # 安装 ffrecord 库
32
  RUN pip install git+https://github.com/HFAiLab/ffrecord.git
33
 
34
- # 设置工作目录
35
  WORKDIR /app
36
 
37
- # 复制应用文件
38
- COPY . .
39
-
40
- # 复制应用代码
41
  COPY . .
42
 
43
  ENV PYTHONPATH=/app
 
1
  FROM python:3.10
2
 
3
+ LABEL maintainer="CxGrammar Team"
4
+ LABEL org.opencontainers.image.source=https://github.com/cxgrammar/cxglearner
5
+
6
  RUN apt-get update && apt-get install -y \
7
  git \
8
  cmake \
 
18
  spacy==3.5.0 \
19
  torch==1.13.1
20
 
 
21
  COPY requirements.txt .
22
 
 
23
  RUN pip install -r requirements.txt
24
 
25
  RUN pip install -U --no-cache-dir \
26
  numpy==1.24.1
27
 
 
28
  RUN python -m spacy download en_core_web_sm
29
 
 
30
  RUN pip install git+https://github.com/HFAiLab/ffrecord.git
31
 
 
32
  WORKDIR /app
33
 
 
 
 
 
34
  COPY . .
35
 
36
  ENV PYTHONPATH=/app
README.md CHANGED
@@ -9,4 +9,18 @@ license: mit
9
  short_description: The Parser Component of CxGLearner
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  short_description: The Parser Component of CxGLearner
10
  ---
11
 
12
+ # CxGParser
13
+ CxG Induction Tools.
14
+
15
+ ## Citation
16
+ If you use CxGLearner in your research, please cite [CoELM: Construction-Enhanced Language Modeling](https://aclanthology.org/2024.acl-long.542/).
17
+
18
+ ```
19
+ @inproceedings{xu2024coelm,
20
+ title={CoELM: Construction-Enhanced Language Modeling},
21
+ author={Xu, Lvxiaowei and Gong, Zhilin and Dai, Jianhua and Wang, Tianxiang and Cai, Ming and Peng, Jiawei},
22
+ booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
23
+ pages={10061--10081},
24
+ year={2024}
25
+ }
26
+ ```
app.py CHANGED
@@ -1,95 +1,133 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import numpy as np
4
- import tempfile
5
- import random
6
- import os
7
  import json
 
 
8
  from pathlib import Path
9
 
 
 
 
 
10
  from cxglearner.parser import Parser
11
  from cxglearner.config import DefaultConfigs, Config
12
  from cxglearner.utils import init_logger
13
  from cxglearner.utils.utils_cxs import convert_slots_to_str
14
 
15
- temp_dir = tempfile.gettempdir()
16
- log_dir = Path(temp_dir) / "logs"
17
- log_dir.mkdir(exist_ok=True)
18
- cahce_dir = Path(temp_dir) / "cache"
 
 
19
 
 
 
20
  config = Config(DefaultConfigs.eng)
21
- config.experiment.log_path = log_dir / "eng.log"
22
  logger = init_logger(config)
23
- parser = Parser(config=config, version="1.1", logger=logger, cache_dir=cahce_dir)
24
- examples = [["she should be more polite with the customers."]]
25
- MAX_EXAMPLAR = 10
26
 
27
- with open("data/learner_examplar_1.1.json", "r", encoding="utf-8") as fp:
28
- examplars = json.load(fp)
29
 
30
- logger.debug(len(examplars))
 
 
 
 
 
 
 
 
 
31
 
32
  def fill_input_box(example):
33
  return example[0]
34
 
35
 
36
- def parse_text(text):
37
- if not text: return gr.Dataframe(), gr.update(choices=[], value=None), gr.Dataframe()
 
 
 
 
 
 
 
 
 
38
  encoded_elements = parser.encoder.encode(text, raw=True)
39
- tokens, upos, xpos = np.array(encoded_elements["lexical"]), np.array(encoded_elements["upos"]["spaCy"]), np.array(
40
- encoded_elements["xpos"]["spaCy"])
41
  encoded_elements = np.vstack((tokens, upos, xpos))
 
42
  radio_parsed = parser.parse(text)
43
- radio_parsed = ["{} | {} | {}-{}".format(cxs[0],
44
- convert_slots_to_str(parser.cxs_decoder[cxs[0]], parser.encoder, logger), cxs[1] + 1, cxs[2])
45
- for cxs in radio_parsed[0]]
46
  if len(radio_parsed) == 0:
47
  radio_display = gr.Radio(label="Constructions", choices=[])
48
  else:
49
- radio_display = gr.Radio(
50
- label="Constructions", choices=radio_parsed, interactive=True, value=radio_parsed[0]
51
- )
52
  if len(radio_parsed) == 0:
53
- cons_df = pd.DataFrame()
54
  else:
55
  cxs = radio_parsed[0]
56
  index, cxs, ranges = cxs.split("|")
57
  cxs = cxs.strip()
 
 
 
 
 
 
 
 
58
  if cxs in examplars:
59
  exams = random.choices(examplars[cxs], k=min(MAX_EXAMPLAR, len(examplars[cxs])))
60
- cons_df = pd.DataFrame(exams, columns=[cxs])
61
  else:
62
- cons_df = pd.DataFrame()
63
- return encoded_elements, radio_display, cons_df
 
 
64
 
 
 
 
65
 
66
- def refresh_examplar(option: str):
67
- print(option)
68
  index, cxs, ranges = option.split("|")
69
  index = eval(index)
70
  cxs = cxs.strip()
 
 
 
 
 
 
 
 
71
  if cxs in examplars:
72
  exams = random.choices(examplars[cxs], k=min(MAX_EXAMPLAR, len(examplars[cxs])))
73
- return pd.DataFrame(exams, columns=[cxs])
 
74
  return pd.DataFrame()
75
 
76
 
77
- def clear_text():
78
- return "", pd.DataFrame(), gr.Radio(label="Constructions", choices=[]), pd.DataFrame()
79
-
80
-
81
  with gr.Blocks() as demo:
82
  with gr.Column():
83
  gr.Markdown("## CxGLearner Parser")
84
  with gr.Row():
85
  input_text = gr.Textbox(label="Input Text", placeholder="Enter a sentence here...")
86
-
 
 
 
 
 
87
  with gr.Row():
88
- dataset = gr.Dataset(components=[input_text],
89
- samples=examples,
90
- label="Click an example")
91
- clear_buttton = gr.Button("Clear")
92
- parser_button = gr.Button("Parse")
93
 
94
  with gr.Column():
95
  gr.Markdown("### Results of Encoding and Parsing")
@@ -100,9 +138,10 @@ with gr.Blocks() as demo:
100
  gr.Markdown("### Examplars")
101
  cons_display = gr.Dataframe()
102
 
103
- parser_button.click(fn=parse_text, inputs=[input_text], outputs=[enc_display, cxs_display, cons_display])
104
- clear_buttton.click(fn=clear_text, inputs=[], outputs=[input_text, enc_display, cxs_display, cons_display])
105
  dataset.click(fn=fill_input_box, inputs=dataset, outputs=input_text)
106
- cxs_display.select(refresh_examplar, inputs=[cxs_display], outputs=cons_display)
 
 
 
107
 
108
- demo.launch()
 
1
+ import warnings
2
+ warnings.filterwarnings("ignore", category=UserWarning)
3
+
 
 
 
4
  import json
5
+ import random
6
+ import tempfile
7
  from pathlib import Path
8
 
9
+ import numpy as np
10
+ import pandas as pd
11
+ import gradio as gr
12
+
13
  from cxglearner.parser import Parser
14
  from cxglearner.config import DefaultConfigs, Config
15
  from cxglearner.utils import init_logger
16
  from cxglearner.utils.utils_cxs import convert_slots_to_str
17
 
18
+ MAX_EXAMPLAR = 8
19
+
20
+ examples = [
21
+ ["She should be more polite with the customers."],
22
+ ["The advantage of a bad memory is that one enjoys several times the same good things for the first time."],
23
+ ]
24
 
25
+ cache_dir = Path(tempfile.gettempdir()) / "cxg"
26
+ cache_dir.mkdir(exist_ok=True)
27
  config = Config(DefaultConfigs.eng)
28
+ config.experiment.log_path = cache_dir / "cxg.log"
29
  logger = init_logger(config)
 
 
 
30
 
31
+ parser_1_0 = Parser(config=config, version="1.0", logger=logger, cache_dir=cache_dir)
32
+ parser_1_1 = Parser(config=config, version="1.1", logger=logger, cache_dir=cache_dir)
33
 
34
+ examplars_1_0 = json.load(open("data/eng/1.0/learner_examplar_1.0.json", "r", encoding="utf-8"))
35
+ examplars_1_1 = json.load(open("data/eng/1.1/learner_examplar_1.1.json", "r", encoding="utf-8"))
36
+
37
+ metadata = {
38
+ "English": {
39
+ "1.0": [parser_1_0, examplars_1_0],
40
+ "1.1": [parser_1_1, examplars_1_1],
41
+ },
42
+ "Chinese": {},
43
+ }
44
 
45
  def fill_input_box(example):
46
  return example[0]
47
 
48
 
49
+ def clear_text():
50
+ return "", pd.DataFrame(), gr.Radio(label="Constructions", choices=[]), pd.DataFrame()
51
+
52
+
53
+ def parse_text(text, language, version):
54
+ if not text:
55
+ return pd.DataFrame(), gr.Radio(label="Constructions", choices=[]), pd.DataFrame()
56
+
57
+ print(language, version, text)
58
+
59
+ parser = metadata[language][version][0]
60
  encoded_elements = parser.encoder.encode(text, raw=True)
61
+ tokens, upos, xpos = np.array(encoded_elements["lexical"]), np.array(encoded_elements["upos"]["spaCy"]), np.array(encoded_elements["xpos"]["spaCy"])
 
62
  encoded_elements = np.vstack((tokens, upos, xpos))
63
+
64
  radio_parsed = parser.parse(text)
65
+ radio_parsed = ["{} | {} | {}-{}".format(cxs[0],convert_slots_to_str(parser.cxs_decoder[cxs[0]], parser.encoder, logger), cxs[1] + 1, cxs[2]) for cxs in radio_parsed[0]]
66
+
 
67
  if len(radio_parsed) == 0:
68
  radio_display = gr.Radio(label="Constructions", choices=[])
69
  else:
70
+ radio_display = gr.Radio(label="Constructions", choices=radio_parsed, interactive=True, value=radio_parsed[0])
71
+
 
72
  if len(radio_parsed) == 0:
73
+ cons_display = pd.DataFrame()
74
  else:
75
  cxs = radio_parsed[0]
76
  index, cxs, ranges = cxs.split("|")
77
  cxs = cxs.strip()
78
+
79
+ examplars = metadata[language][version][1]
80
+
81
+ columns_name = cxs
82
+
83
+ if version == "1.0":
84
+ cxs = cxs.replace('Ġ', '')
85
+
86
  if cxs in examplars:
87
  exams = random.choices(examplars[cxs], k=min(MAX_EXAMPLAR, len(examplars[cxs])))
88
+ cons_display = pd.DataFrame(exams, columns=[columns_name])
89
  else:
90
+ cons_display = pd.DataFrame()
91
+
92
+ return encoded_elements, radio_display, cons_display
93
+
94
 
95
+ def refresh_examplar(option, language, version):
96
+
97
+ print(language, version, option)
98
 
 
 
99
  index, cxs, ranges = option.split("|")
100
  index = eval(index)
101
  cxs = cxs.strip()
102
+
103
+ examplars = metadata[language][version][1]
104
+
105
+ columns_name = cxs
106
+
107
+ if version == "1.0":
108
+ cxs = cxs.replace('Ġ', '')
109
+
110
  if cxs in examplars:
111
  exams = random.choices(examplars[cxs], k=min(MAX_EXAMPLAR, len(examplars[cxs])))
112
+ return pd.DataFrame(exams, columns=[columns_name])
113
+
114
  return pd.DataFrame()
115
 
116
 
 
 
 
 
117
  with gr.Blocks() as demo:
118
  with gr.Column():
119
  gr.Markdown("## CxGLearner Parser")
120
  with gr.Row():
121
  input_text = gr.Textbox(label="Input Text", placeholder="Enter a sentence here...")
122
+
123
+ with gr.Row():
124
+ dataset = gr.Dataset(components=[input_text], samples=examples, label="Make a Choice")
125
+ with gr.Row():
126
+ language_radio = gr.Radio(["English", "Chinese"], value="English", interactive=False, label="Which language would you like to parse?")
127
+ version_radio = gr.Radio(["1.1", "1.0"], value="1.1", interactive=True, label="Which version would you like to use?")
128
  with gr.Row():
129
+ clear_buttton = gr.Button("Clear")
130
+ parser_button = gr.Button("Parse")
 
 
 
131
 
132
  with gr.Column():
133
  gr.Markdown("### Results of Encoding and Parsing")
 
138
  gr.Markdown("### Examplars")
139
  cons_display = gr.Dataframe()
140
 
 
 
141
  dataset.click(fn=fill_input_box, inputs=dataset, outputs=input_text)
142
+ clear_buttton.click(fn=clear_text, inputs=[], outputs=[input_text, enc_display, cxs_display, cons_display])
143
+ parser_button.click(fn=parse_text, inputs=[input_text, language_radio, version_radio], outputs=[enc_display, cxs_display, cons_display])
144
+ cxs_display.change(refresh_examplar, inputs=[cxs_display, language_radio, version_radio], outputs=cons_display)
145
+
146
 
147
+ demo.launch()
data/eng/1.0/learner_examplar_1.0.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f41818549b054007efc199805241c12ad84dfc8d9da36a2b68d8695d28b1ecbd
3
+ size 22783497
data/{learner_examplar_1.1.json → eng/1.1/learner_examplar_1.1.json} RENAMED
File without changes
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  unidecode
2
  beautifulsoup4
3
- cxglearner==1.3.1
4
  gradio
 
1
  unidecode
2
  beautifulsoup4
3
+ cxglearner==1.3.2
4
  gradio