File size: 9,614 Bytes
0f8d97e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import io
import gradio as gr
import spacy
from spacy import displacy

from bib_tokenizers import create_references_tokenizer


# Trained pipeline used to analyse the (normalized) references text.
nlp = spacy.load("spacy-pipelines/model-best")
# Make spancat return a score for each token:
# with the threshold set to zero each suggested span is returned, and span == token,
# because the suggester is configured to suggest spans with len(span) == 1:
#     [components.spancat.suggester]
#     @misc = "spacy.ngram_suggester.v1"
#     sizes = [1]
nlp.get_pipe("spancat").cfg["threshold"] = 0.0
# Print the effective spancat configuration at startup.
print(nlp.get_pipe("spancat").cfg)


def create_bib_item_start_scorer_for_doc(doc, spanskey="sc"):
    """
    Build a function that looks up the span-group score at a character offset.

    Args:
        doc - a doc whose ``doc.spans[spanskey]`` holds exactly one
            non-overlapping span per token, with the per-span scores stored
            in ``attrs["scores"]``
        spanskey - key of the span group inside ``doc.spans``

    Returns:
        ``scorer(char_offset, fuzzy_in_tokens=(0, 0))`` which returns a tuple
        ``(span, score)``: the span covering ``char_offset`` and the highest
        score among that span and its neighbours inside the fuzzy window.

    Raises:
        AssertionError - if the span group overlaps or does not contain one
            span per token.
    """

    span_group = doc.spans[spanskey]
    assert not span_group.has_overlap
    assert len(span_group) == len(
        doc
    ), "Check suggester config and the spancat threshold to make sure that spangroup contains single token span for each token"

    # Map every character offset of a span to the span's token index.
    # The range deliberately includes ``end_char`` itself so that the character
    # right after a token (typically its trailing whitespace) still resolves to
    # that token; when the next span starts there, it overrides the entry
    # because later spans are written into the dict last.
    spans_idx = {
        offset: span.start
        for span in span_group
        for offset in range(span.start_char, span.end_char + 1)
    }

    def scorer(char_offset, fuzzy_in_tokens=(0, 0)):
        """
        Return ``(span, score)`` for the span covering ``char_offset``.

        ``fuzzy_in_tokens`` is ``(tokens_before, tokens_after)``: how many
        neighbouring tokens on each side take part in the maximum.
        Raises KeyError if ``char_offset`` is not covered by any span.
        """
        i = spans_idx[char_offset]

        span = span_group[i]
        assert i == span.start

        scores = span_group.attrs["scores"]

        # fuzziness might improve fault tolerance if the model made a small mistake,
        # e.g., if a number from the previous line is classified as "citation number",
        #    see example at https://www.deeplearningbook.org/contents/bib.html
        # if fuzzy == (0, 0), it returns the score for the selected span only.
        # The window is clamped to the valid score indices (one score per
        # token), not to the number of characters in the text.
        first = max(i - fuzzy_in_tokens[0], 0)
        last = min(i + fuzzy_in_tokens[1], len(scores) - 1)
        return span, max(scores[j] for j in range(first, last + 1))

    return scorer


# Blank English pipeline whose tokenizer is replaced by the one built by
# create_references_tokenizer(); split_up_references() uses it by default to
# tokenize the original (non-normalized) input text.
nlp_blank = spacy.blank("en")
nlp_blank.tokenizer = create_references_tokenizer()(nlp_blank)


def split_up_references(
    references: str, is_eol_mode=False, nlp=nlp, nlp_blank=nlp_blank
):
    """
    Split a references section into bib items and transfer entity annotations.

    The model is run on a copy of the text in which every '\\n' is replaced by
    ' '; its annotations are then copied, by character offset, onto a doc
    created from the original string.

    Args:
        references - a references section, ideally without a header
        is_eol_mode - if True, a bib item may only start at the beginning of a
            line, and line starts are classified with SpanCat scores; if False,
            the sentence starts predicted on the normalized doc are copied
        nlp - a model that splits up references into separate sentences
        nlp_blank - a blank nlp with the same tokenizer/language

    Returns:
        the doc created by ``nlp_blank`` from the original ``references``
        string, with ``is_sent_start`` marking bib item starts and ``ents``
        holding the transferred entities.
    """

    normalized_references = references.replace("\n", " ")

    # the model is trained on 'normalized' references - the ones without '\n'
    doc = nlp(normalized_references)

    # 'transfer' annotations from doc without '\n' (normalized references) to the target doc created from the original input string
    # the problem here is that docs differ in a number of tokens
    # however, it should be easy to align on characters level because both '\n' and ' ' are whitespace, so spans have the same boundaries

    target_doc = nlp_blank(references)
    # character offset -> index of the target token covering that character
    target_tokens_idx = {
        offset: t.i for t in target_doc for offset in range(t.idx, t.idx + len(t))
    }

    # senter annotations: reset everything, only the first token starts a sentence
    for i, t in enumerate(target_doc):
        t.is_sent_start = i == 0
    if is_eol_mode:
        # use SpanCat scores to set sentence boundaries on the target doc
        token_scorer = create_bib_item_start_scorer_for_doc(doc)
        threshold = 0.2
        # io.StringIO splits on '\n' only, which matches the normalization above
        lines = list(io.StringIO(references))
        lines_len_in_tokens = [len(nlp_blank.tokenizer(line)) for line in lines]
        char_offset = 0
        for line_num, line in enumerate(lines):
            # look a quarter of the previous line back and a quarter of the
            # current line ahead when scoring the start of this line
            fuzzy = (
                0 if line_num == 0 else lines_len_in_tokens[line_num - 1] // 4,
                lines_len_in_tokens[line_num] // 4,
            )
            span, score = token_scorer(char_offset, fuzzy_in_tokens=fuzzy)
            print(span, score)
            if score > threshold:
                target_doc[target_tokens_idx[char_offset]].is_sent_start = True
            char_offset += len(line)
    else:
        # copy SentenceRecognizer annotations from doc without '\n' to the target doc
        for t in doc:
            if t.is_sent_start:
                target_doc[target_tokens_idx[t.idx]].is_sent_start = True

    # copy ner annotations:
    target_ents = []
    for ent in doc.ents:
        target_ent = target_doc.char_span(ent.start_char, ent.end_char, ent.label_)
        if target_ent is None:
            # the character offsets do not fall on target token boundaries,
            # so the entity cannot be represented in the target doc
            continue
        # remove entities crossing sentence boundaries; the boundaries that
        # matter are the ones set on the target doc above (in EOL mode they
        # come from SpanCat scores, not from the normalized doc)
        if any(t.is_sent_start for t in target_ent if t.i != target_ent.start):
            continue
        target_ents.append(target_ent)
    target_doc.ents = target_ents

    return target_doc


def text_analysis(text, is_eol_mode):
    """
    Render the bib items found in ``text`` as HTML.

    Args:
        text - the unparsed bibliography section
        is_eol_mode - passed through to split_up_references

    Returns:
        a string with one displaCy entity rendering per bib item, all wrapped
        in a single scrollable <div>.
    """

    parsed = split_up_references(
        text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
    )

    # each sentence of the parsed doc is one bib item; render them one by one
    fragments = []
    for i, sentence in enumerate(parsed.sents):
        item = sentence.as_doc()
        item.user_data = {"title": f"***** Bib Item {i+1}: *****"}
        fragments.append(displacy.render(item, style="ent"))

    opening = "<div style='max-width:100%; max-height:360px; overflow:auto'>"
    closing = "</div>"
    return opening + "".join(fragments) + closing


# Gradio UI: a textbox for the raw bibliography, a checkbox that toggles the
# one-bibitem-per-line mode, and an HTML pane that shows text_analysis() output.
demo = gr.Blocks()
with demo:

    textbox = gr.components.Textbox(
        label="Unparsed Bibliography Section",
        placeholder="Enter bibliography here...",
        lines=20,
    )
    is_eol_mode = gr.components.Checkbox(
        label="a line does not contain more than one bibitem (Multiline bibitems are supported regardless of this choice)"
    )
    html = gr.components.HTML(label="Parsed Bib Items")
    # Re-run the analysis whenever the text or the mode checkbox changes.
    textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
    is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])

    # Sample bibliographies in several citation styles, wired to the textbox.
    gr.Examples(
        examples=[
            [
                """[1] B. Foxman, R. Barlow, H. D'Arcy, B. Gillespie, and J. D. Sobel, "Urinary tract infection: self-reported incidence and associated costs," Ann Epidemiol, vol. 10, pp. 509-515, 2000. [2] B. Foxman, "Epidemiology of urinary tract infections: incidence, morbidity, and economic costs," Am J Med, vol. 113, pp. 5-13, 2002. [3] L. Nicolle, "Urinary tract infections in the elderly," Clin Geriatr Med, vol. 25, pp. 423-436, 2009."""
            ],
            [
                """Barth, Fredrik, ed.
	1969	Ethnic groups and boundaries: The social organization of culture difference. Oslo: Scandinavian University Press.
Bondokji, Neven
	2016	The Expectation Gap in Humanitarian Operations: Field Perspectives from Jordan. Asian Journal of Peace Building 4(1):1-28.
Bourdieu, Pierre
		The forms of capital In Handbook of Theory and Research for the Sociology of Education. J. Richardson, ed. Pp. 241-258. New York: Greenwood Publishesrs.
Carrion, Doris
	2015	Are Syrian Refguees a Security Threat to the MIddle East Vol. 2016. London Reuters.
CFR
	2016	The Global Humanitarian Regime: Priorities and Prospects for Reform. Council on Foerign Relations, International Institutues and Global Governance Program"""
            ],
            [
                """(2)	Hofmann, M.H. et al. Aberrant splicing caused by single nucleotide polymorphism c.516G>T [Q172H], a marker of CYP2B6*6, is responsible for decreased expression and activity of CYP2B6 in liver. J Pharmacol Exp Ther  325, 284-92 (2008).
(3) Zanger, U.M. & Klein, K. Pharmacogenetics of cytochrome P450 2B6 (CYP2B6): advances on polymorphisms, mechanisms, and clinical relevance. Front Genet  4, 24 (2013).
(4) Holzinger, E.R. et al. Genome-wide association study of plasma efavirenz pharmacokinetics in AIDS Clinical Trials Group protocols implicates several CYP2B6 variants. Pharmacogenet Genomics  22, 858-67 (2012).
"""
            ],
            [
                """[Ein05] Albert Einstein. Zur Elektrodynamik bewegter K ̈orper. (German)
[On the electrodynamics of moving bodies]. Annalen der Physik,
322(10):891–921, 1905. 
[GMS93] Michel Goossens, Frank Mittelbach, and Alexander Samarin. The LATEX Companion. Addison-Wesley, Reading, Massachusetts, 1993. 
[Knu] Donald Knuth. Knuth: Computers and typesetting."""
            ],
            [
                """References
Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953 
Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
Beuther, H., Walsh, A. J., Thorwirth, S., et al. 2007, A&A, 466, 989
Brogan, C. L., Hunter, T. R., Cyganowski, C. J., et al. 2011, ApJ, 739, L16
Brown, A. T., Little, L. T., MacDonald, G. H., Riley, P. W., & Matheson, D. N.
1981, MNRAS, 195, 607
Brown, R. D. & Cragg, D. M. 1991, ApJ, 378, 445
Carrasco-González, C., Sanna, A., Rodríguez-Kamenetzky, A., et al. 2021, ApJ,
914, L1
Cesaroni, R., Walmsley, C. M., & Churchwell, E. 1992, A&A, 256, 618
Cheung, A. C., Rank, D. M., Townes, C. H., Thornton, D. D., & Welch, W. J.
1968, Phys. Rev. Lett., 21, 1701
Churchwell, E., Babler, B. L., Meade, M. R., et al. 2009, PASP, 121, 213
Cohen, R. J. & Brebner, G. C. 1985, MNRAS, 216, 51P
Comito, C., Schilke, P., Endesfelder, U., Jiménez-Serra, I., & Martín-Pintado, J.
2007, A&A, 469, 207
Curiel, S., Ho, P. T. P., Patel, N. A., et al. 2006, ApJ, 638, 878
Danby, G., Flower, D. R., Valiron, P., Schilke, P., & Walmsley, C. M. 1988,
MNRAS, 235, 229
De Buizer, J. M., Liu, M., Tan, J. C., et al. 2017, ApJ, 843, 33
De Buizer, J. M., Radomski, J. T., Telesco, C. M., & Piña, R. K. 2003, ApJ, 598,
1127
Dzib, S., Loinard, L., Rodríguez, L. F., Mioduszewski, A. J., & Torres, R. M.
2011, ApJ, 733, 71
Flower, D. R., Offer, A., & Schilke, P. 1990, MNRAS, 244, 4P
Galván-Madrid, R., Keto, E., Zhang, Q., et al. 2009, ApJ, 706, 1036"""
            ],
        ],
        inputs=textbox,
    )
# Start the app at import time, bound to 0.0.0.0 on port 7080 with share=False.
demo.launch(share=False, server_name="0.0.0.0", server_port=7080)