Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
- app.py +11 -20
- lexicon.csv +21 -0
app.py
CHANGED
@@ -1,29 +1,21 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
|
3 |
-
#
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
"पाणि", "पतिः", "सीता", "लक्ष्मण", "हनुमान", "विष्णु", "देव", "गज"
|
10 |
-
# Add more entries or load from a full CSV
|
11 |
-
}
|
12 |
|
13 |
# --- Expanded Reverse Sandhi Rules ---
|
14 |
-
# Format: (left_end, expansions)
|
15 |
-
# expansions: "Left+Right" representing the split form.
|
16 |
REVERSE_SANDHI_RULES = [
|
17 |
-
# Vowel Sandhi
|
18 |
("ा", ["अ+अ"]), # ā -> a + a
|
19 |
("े", ["अ+इ", "अ+ई"]), # e -> a+i or a+ī
|
20 |
("ो", ["अ+उ", "अ+ऊ"]), # o -> a+u or a+ū
|
21 |
-
# Consonant Sandhi: t/d to tt
|
22 |
("त्त", ["त्+त", "त्+द"]),
|
23 |
-
#
|
24 |
-
("
|
25 |
-
# Anusvara restoration (ṃ before consonants)
|
26 |
-
("ं", ["म्+", "न्+"]),
|
27 |
]
|
28 |
|
29 |
def generate_candidates(word):
|
@@ -44,7 +36,6 @@ def generate_candidates(word):
|
|
44 |
if l_base in LEXICON and r_base in LEXICON:
|
45 |
candidates.append((l_base, r_base))
|
46 |
|
47 |
-
# Deduplicate
|
48 |
candidates = list(set(candidates))
|
49 |
return candidates or [("No plausible split found", "")]
|
50 |
|
@@ -61,7 +52,7 @@ def sandhi_splitter(word):
|
|
61 |
with gr.Blocks() as demo:
|
62 |
with gr.Row():
|
63 |
with gr.Column(scale=3):
|
64 |
-
gr.Markdown("## Sanskrit Sandhi-Splitter (
|
65 |
gr.Markdown(
|
66 |
"**Instructions:**\n"
|
67 |
"1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
|
@@ -83,7 +74,7 @@ with gr.Blocks() as demo:
|
|
83 |
"- Multiple possible splits may appear.\n\n"
|
84 |
"### Notes\n"
|
85 |
"- Uses **rule-based splitting** + lexicon check.\n"
|
86 |
-
"-
|
87 |
"**Support:** [email protected]"
|
88 |
)
|
89 |
|
|
|
1 |
import gradio as gr
|
2 |
+
import csv
|
3 |
|
4 |
+
# Load lexicon from CSV
|
5 |
+
LEXICON = set()
|
6 |
+
with open("lexicon.csv", "r", encoding="utf-8") as f:
|
7 |
+
for row in csv.reader(f):
|
8 |
+
if row:
|
9 |
+
LEXICON.add(row[0].strip())
|
|
|
|
|
|
|
10 |
|
11 |
# --- Expanded Reverse Sandhi Rules ---
|
|
|
|
|
12 |
REVERSE_SANDHI_RULES = [
|
|
|
13 |
("ा", ["अ+अ"]), # ā -> a + a
|
14 |
("े", ["अ+इ", "अ+ई"]), # e -> a+i or a+ī
|
15 |
("ो", ["अ+उ", "अ+ऊ"]), # o -> a+u or a+ū
|
|
|
16 |
("त्त", ["त्+त", "त्+द"]),
|
17 |
+
("ः", ["ः+"]), # visarga restoration
|
18 |
+
("ं", ["म्+", "न्+"]), # anusvara restoration
|
|
|
|
|
19 |
]
|
20 |
|
21 |
def generate_candidates(word):
|
|
|
36 |
if l_base in LEXICON and r_base in LEXICON:
|
37 |
candidates.append((l_base, r_base))
|
38 |
|
|
|
39 |
candidates = list(set(candidates))
|
40 |
return candidates or [("No plausible split found", "")]
|
41 |
|
|
|
52 |
with gr.Blocks() as demo:
|
53 |
with gr.Row():
|
54 |
with gr.Column(scale=3):
|
55 |
+
gr.Markdown("## Sanskrit Sandhi-Splitter (Extended Prototype)")
|
56 |
gr.Markdown(
|
57 |
"**Instructions:**\n"
|
58 |
"1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
|
|
|
74 |
"- Multiple possible splits may appear.\n\n"
|
75 |
"### Notes\n"
|
76 |
"- Uses **rule-based splitting** + lexicon check.\n"
|
77 |
+
"- Starter lexicon included (~20 entries); extend with full MW dictionary for accuracy.\n\n"
|
78 |
"**Support:** [email protected]"
|
79 |
)
|
80 |
|
lexicon.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
राम
|
2 |
+
वन
|
3 |
+
गच्छति
|
4 |
+
गुरु
|
5 |
+
इन्द्र
|
6 |
+
तत्
|
7 |
+
अपि
|
8 |
+
धर्म
|
9 |
+
क्षेत्र
|
10 |
+
कुरु
|
11 |
+
अस्ति
|
12 |
+
शिव
|
13 |
+
शक्ति
|
14 |
+
पाणि
|
15 |
+
पतिः
|
16 |
+
सीता
|
17 |
+
लक्ष्मण
|
18 |
+
हनुमान
|
19 |
+
विष्णु
|
20 |
+
देव
|
21 |
+
गज
|