Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,50 +1,91 @@
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
# ---
|
|
|
|
|
4 |
LEXICON = {
|
5 |
"राम", "वन", "गच्छति", "गुरु", "इन्द्र", "तत्", "अपि",
|
6 |
-
"धर्म", "क्षेत्र", "कुरु", "क्षेत्रे"
|
|
|
|
|
7 |
}
|
8 |
|
9 |
-
# ---
|
|
|
|
|
10 |
REVERSE_SANDHI_RULES = [
|
11 |
-
|
12 |
-
("
|
13 |
-
("
|
14 |
-
("
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
]
|
16 |
|
17 |
def generate_candidates(word):
|
18 |
candidates = []
|
19 |
for i in range(1, len(word)):
|
20 |
left, right = word[:i], word[i:]
|
|
|
21 |
# Direct split
|
22 |
if left in LEXICON and right in LEXICON:
|
23 |
candidates.append((left, right))
|
24 |
-
|
|
|
25 |
for ch, expansions in REVERSE_SANDHI_RULES:
|
26 |
if left.endswith(ch):
|
27 |
for exp in expansions:
|
28 |
-
l_base = left[:-
|
29 |
r_base = exp.split("+")[1] + right
|
30 |
if l_base in LEXICON and r_base in LEXICON:
|
31 |
candidates.append((l_base, r_base))
|
|
|
32 |
# Deduplicate
|
33 |
candidates = list(set(candidates))
|
34 |
return candidates or [("No plausible split found", "")]
|
35 |
|
36 |
def sandhi_splitter(word):
|
37 |
-
|
|
|
|
|
|
|
|
|
38 |
formatted = [" + ".join(c) for c in candidates]
|
39 |
return "\n".join(formatted)
|
40 |
|
|
|
41 |
with gr.Blocks() as demo:
|
42 |
-
gr.
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
if __name__ == "__main__":
|
50 |
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
+
# --- Load Sanskrit Lexicon ---
# In a production setup, you might load from a file with thousands of entries.
# Here, we include an expanded illustrative lexicon; extend with MW/GRETIL for full coverage.
# Entries are stored exactly as typed; lookups are plain set membership tests,
# so a candidate must match an entry character-for-character.
LEXICON = {
    "राम", "वन", "गच्छति", "गुरु", "इन्द्र", "तत्", "अपि",
    "धर्म", "क्षेत्र", "कुरु", "क्षेत्रे", "अस्ति", "शिव", "शक्ति",
    "पाणि", "पतिः", "सीता", "लक्ष्मण", "हनुमान", "विष्णु", "देव", "गज",
    # Add more entries or load from a full CSV
}
|
12 |
|
13 |
+
# --- Expanded Reverse Sandhi Rules ---
# Format: (left_end, expansions)
#   left_end   -- the surface character(s) the left fragment ends with.
#   expansions -- "Left+Right" strings representing the split form:
#                 `left_end` is replaced by Left on the left fragment and
#                 Right is prepended to the right fragment.
#
# A Devanagari consonant letter already carries the short vowel "a", so a
# stem-final "a" is written with no extra character. Left is therefore empty
# in the vowel rules below; putting the independent vowel "अ" there would
# build stems such as "गजअ" that can never match a lexicon entry.
REVERSE_SANDHI_RULES = [
    # Vowel Sandhi
    ("ा", ["+अ"]),  # ā -> a + a
    ("े", ["+इ", "+ई"]),  # e -> a+i or a+ī
    ("ो", ["+उ", "+ऊ"]),  # o -> a+u or a+ū
    # Consonant Sandhi: t/d to tt
    # NOTE(review): the second expansion restores "द" on the right-hand word;
    # confirm that this is the intended t/d pairing.
    ("त्त", ["त्+त", "त्+द"]),
    # Visarga restoration
    # NOTE(review): this maps a final visarga to itself with nothing added on
    # the right, so it produces the same pair as a plain split.
    ("ः", ["ः+"]),
    # Anusvara restoration (ṃ before consonants)
    ("ं", ["म्+", "न्+"]),
]
|
28 |
|
29 |
def generate_candidates(word, lexicon=None, rules=None):
    """Return candidate two-way splits of *word*.

    Every cut position is tried as a plain split and then once per
    reverse-sandhi expansion whose surface ending matches the left fragment.
    A pair is kept only when both halves are found in the lexicon.

    Args:
        word: The surface form to analyse.
        lexicon: Container of known words. Defaults to the module-level
            ``LEXICON``.
        rules: Iterable of ``(ending, expansions)`` pairs where each expansion
            is a ``"Left+Right"`` string. Defaults to the module-level
            ``REVERSE_SANDHI_RULES``.

    Returns:
        A list of unique ``(left, right)`` tuples in the order they were
        discovered, or ``[("No plausible split found", "")]`` when nothing
        matches.
    """
    if lexicon is None:
        lexicon = LEXICON
    if rules is None:
        rules = REVERSE_SANDHI_RULES

    # A dict de-duplicates while preserving insertion order, so the result is
    # the same on every run (a set's iteration order for strings is not).
    found = {}

    for i in range(1, len(word)):
        left, right = word[:i], word[i:]

        # Direct split
        proposals = [(left, right)]

        # Rule-based reverse sandhi
        for ending, expansions in rules:
            if not left.endswith(ending):
                continue
            # Slice by explicit length: ``left[:-0]`` would drop the whole
            # fragment if a rule ever had an empty ending.
            stem = left[:len(left) - len(ending)]
            for expansion in expansions:
                left_add, _, right_add = expansion.partition("+")
                proposals.append((stem + left_add, right_add + right))

        for l_base, r_base in proposals:
            if l_base in lexicon and r_base in lexicon:
                found[(l_base, r_base)] = None

    return list(found) or [("No plausible split found", "")]
|
50 |
|
51 |
def sandhi_splitter(word):
    """Format the candidate splits of *word* as display text, one per line.

    Args:
        word: Text to analyse; surrounding whitespace is ignored and ``None``
            is treated as empty input.

    Returns:
        ``"Please enter a word."`` for blank input, otherwise every candidate
        from ``generate_candidates`` rendered as ``"left + right"`` on its
        own line.
    """
    word = (word or "").strip()
    if not word:
        return "Please enter a word."

    candidates = generate_candidates(word)
    # The "no split" result is a pair whose second element is empty; dropping
    # empty parts keeps that message from being shown with a dangling " + ".
    formatted = [" + ".join(part for part in c if part) for c in candidates]
    return "\n".join(formatted)
|
59 |
|
60 |
+
# --- Gradio App ---
# Two-column layout: the splitter (wider column) beside a panel of usage notes.
# NOTE(review): the contact/support address in the strings below reads
# "[email protected]" in this copy of the file, which looks like a placeholder;
# confirm the real address before publishing.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("## Sanskrit Sandhi-Splitter (Prototype, Extended Rules)")
            gr.Markdown(
                "**Instructions:**\n"
                "1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
                "2. Click **Split Sandhi** to see possible splits.\n"
                "3. Candidate splits are based on a small dictionary and reverse sandhi rules.\n\n"
                "**Contact:** For issues, mail **[email protected]**"
            )
            inp = gr.Textbox(label="Compound Word (e.g. धर्मक्षेत्रे)")
            btn = gr.Button("Split Sandhi")
            out = gr.Textbox(label="Candidate Splits", lines=5)

            # Clicking the button sends the input text through sandhi_splitter
            # and writes the returned string into the output box.
            btn.click(fn=sandhi_splitter, inputs=inp, outputs=out)

        with gr.Column(scale=1):
            gr.Markdown(
                "### How to Use This Tool\n"
                "- Input any Sanskrit **compound** word.\n"
                "- Works best with **Devanagari script**.\n"
                "- Multiple possible splits may appear.\n\n"
                "### Notes\n"
                "- Uses **rule-based splitting** + lexicon check.\n"
                "- Limited lexicon in demo – extend with MW/GRETIL for accuracy.\n\n"
                "**Support:** [email protected]"
            )

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()
|