Spaces:
Sleeping
Sleeping
File size: 5,440 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# Natural Language Toolkit: PanLex Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: David Kamholz <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
as an SQLite database. See the README.txt in the panlex_lite corpus directory
for more information on PanLex Lite.
"""
import os
import sqlite3
from nltk.corpus.reader.api import CorpusReader
class PanLexLiteCorpusReader(CorpusReader):
MEANING_Q = """
SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
FROM dnx
JOIN ex ON (ex.ex = dnx.ex)
JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
JOIN ex ex2 ON (ex2.ex = dnx2.ex)
WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
ORDER BY dnx2.uq DESC
"""
TRANSLATION_Q = """
SELECT s.tt, sum(s.uq) AS trq FROM (
SELECT ex2.tt, max(dnx.uq) AS uq
FROM dnx
JOIN ex ON (ex.ex = dnx.ex)
JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
JOIN ex ex2 ON (ex2.ex = dnx2.ex)
WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
GROUP BY ex2.tt, dnx.ui
) s
GROUP BY s.tt
ORDER BY trq DESC, s.tt
"""
def __init__(self, root):
self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()
self._uid_lv = {}
self._lv_uid = {}
for row in self._c.execute("SELECT uid, lv FROM lv"):
self._uid_lv[row[0]] = row[1]
self._lv_uid[row[1]] = row[0]
def language_varieties(self, lc=None):
"""
Return a list of PanLex language varieties.
:param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
by this code. If unspecified, all varieties are returned.
:return: the specified language varieties as a list of tuples. The first
element is the language variety's seven-character uniform identifier,
and the second element is its default name.
:rtype: list(tuple)
"""
if lc is None:
return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
else:
return self._c.execute(
"SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
).fetchall()
def meanings(self, expr_uid, expr_tt):
"""
Return a list of meanings for an expression.
:param expr_uid: the expression's language variety, as a seven-character
uniform identifier.
:param expr_tt: the expression's text.
:return: a list of Meaning objects.
:rtype: list(Meaning)
"""
expr_lv = self._uid_lv[expr_uid]
mn_info = {}
for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
mn = i[0]
uid = self._lv_uid[i[5]]
if not mn in mn_info:
mn_info[mn] = {
"uq": i[1],
"ap": i[2],
"ui": i[3],
"ex": {expr_uid: [expr_tt]},
}
if not uid in mn_info[mn]["ex"]:
mn_info[mn]["ex"][uid] = []
mn_info[mn]["ex"][uid].append(i[4])
return [Meaning(mn, mn_info[mn]) for mn in mn_info]
def translations(self, from_uid, from_tt, to_uid):
"""
Return a list of translations for an expression into a single language
variety.
:param from_uid: the source expression's language variety, as a
seven-character uniform identifier.
:param from_tt: the source expression's text.
:param to_uid: the target language variety, as a seven-character
uniform identifier.
:return: a list of translation tuples. The first element is the expression
text and the second element is the translation quality.
:rtype: list(tuple)
"""
from_lv = self._uid_lv[from_uid]
to_lv = self._uid_lv[to_uid]
return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
class Meaning(dict):
"""
Represents a single PanLex meaning. A meaning is a translation set derived
from a single source.
"""
def __init__(self, mn, attr):
super().__init__(**attr)
self["mn"] = mn
def id(self):
"""
:return: the meaning's id.
:rtype: int
"""
return self["mn"]
def quality(self):
"""
:return: the meaning's source's quality (0=worst, 9=best).
:rtype: int
"""
return self["uq"]
def source(self):
"""
:return: the meaning's source id.
:rtype: int
"""
return self["ap"]
def source_group(self):
"""
:return: the meaning's source group id.
:rtype: int
"""
return self["ui"]
def expressions(self):
"""
:return: the meaning's expressions as a dictionary whose keys are language
variety uniform identifiers and whose values are lists of expression
texts.
:rtype: dict
"""
return self["ex"]
|