dindizz committed on
Commit
f120f79
·
verified ·
1 Parent(s): ef4e2a3

Upload 2 files

Browse files
Files changed (1) hide show
  1. app.py +19 -51
app.py CHANGED
@@ -1,82 +1,50 @@
1
  import gradio as gr
2
- import csv
3
 
4
- # Load lexicon from CSV
5
- LEXICON = set()
6
- with open("lexicon.csv", "r", encoding="utf-8") as f:
7
- for row in csv.reader(f):
8
- if row:
9
- LEXICON.add(row[0].strip())
10
 
11
- # --- Expanded Reverse Sandhi Rules ---
12
  REVERSE_SANDHI_RULES = [
13
- ("ा", ["अ+अ"]), # ā -> a + a
14
- ("े", ["अ+इ", "अ+ई"]), # e -> a+i or a+ī
15
- ("ो", ["अ+उ", "अ+ऊ"]), # o -> a+u or a+ū
16
- ("त्त", ["त्+त", "त्+द"]),
17
  ("ः", ["ः+"]), # visarga restoration
18
- ("ं", ["म्+", "न्+"]), # anusvara restoration
19
  ]
20
 
21
  def generate_candidates(word):
22
  candidates = []
23
  for i in range(1, len(word)):
24
  left, right = word[:i], word[i:]
25
-
26
  # Direct split
27
  if left in LEXICON and right in LEXICON:
28
  candidates.append((left, right))
29
-
30
- # Rule-based reverse sandhi
31
  for ch, expansions in REVERSE_SANDHI_RULES:
32
  if left.endswith(ch):
33
  for exp in expansions:
34
- l_base = left[:-len(ch)] + exp.split("+")[0]
35
  r_base = exp.split("+")[1] + right
36
  if l_base in LEXICON and r_base in LEXICON:
37
  candidates.append((l_base, r_base))
38
-
39
  candidates = list(set(candidates))
40
  return candidates or [("No plausible split found", "")]
41
 
42
  def sandhi_splitter(word):
43
- word = word.strip()
44
- if not word:
45
- return "Please enter a word."
46
-
47
- candidates = generate_candidates(word)
48
  formatted = [" + ".join(c) for c in candidates]
49
  return "\n".join(formatted)
50
 
51
- # --- Gradio App ---
52
  with gr.Blocks() as demo:
53
- with gr.Row():
54
- with gr.Column(scale=3):
55
- gr.Markdown("## Sanskrit Sandhi-Splitter (Extended Prototype)")
56
- gr.Markdown(
57
- "**Instructions:**\n"
58
- "1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
59
- "2. Click **Split Sandhi** to see possible splits.\n"
60
- "3. Candidate splits are based on a small dictionary and reverse sandhi rules.\n\n"
61
- "**Contact:** For issues, mail **[email protected]**"
62
- )
63
- inp = gr.Textbox(label="Compound Word (e.g. धर्मक्षेत्रे)")
64
- btn = gr.Button("Split Sandhi")
65
- out = gr.Textbox(label="Candidate Splits", lines=5)
66
-
67
- btn.click(fn=sandhi_splitter, inputs=inp, outputs=out)
68
-
69
- with gr.Column(scale=1):
70
- gr.Markdown(
71
- "### How to Use This Tool\n"
72
- "- Input any Sanskrit **compound** word.\n"
73
- "- Works best with **Devanagari script**.\n"
74
- "- Multiple possible splits may appear.\n\n"
75
- "### Notes\n"
76
- "- Uses **rule-based splitting** + lexicon check.\n"
77
- "- Starter lexicon included (~20 entries); extend with full MW dictionary for accuracy.\n\n"
78
- "**Support:** [email protected]"
79
- )
80
 
81
  if __name__ == "__main__":
82
  demo.launch()
 
1
  import gradio as gr
 
2
 
3
# --- Minimal Sanskrit lexicon (extend with real data) ---
# Known word forms in Devanagari. A candidate split is only reported when
# both of its halves are members of this set.
LEXICON = {
    "राम",
    "वन",
    "गच्छति",
    "गुरु",
    "इन्द्र",
    "तत्",
    "अपि",
    "धर्म",
    "क्षेत्र",
    "कुरु",
    "क्षेत्रे",
}
 
8
 
9
# --- Basic Reverse Sandhi Rules ---
# Each rule is (ending, expansions). `ending` is matched against the tail of
# the left-hand fragment of a split; every expansion is written "L+R", where
# L replaces that ending on the left fragment and R is prepended to the
# right fragment.
REVERSE_SANDHI_RULES = [
    # ā -> a + a
    ("ा", ["अ+अ"]),
    # e -> a + i or a + ī
    ("े", ["अ+इ", "अ+ई"]),
    # o -> a + u or a + ū
    ("ो", ["अ+उ", "अ+ऊ"]),
    # visarga restoration
    ("ः", ["ः+"]),
]
16
 
17
  def generate_candidates(word):
18
  candidates = []
19
  for i in range(1, len(word)):
20
  left, right = word[:i], word[i:]
 
21
  # Direct split
22
  if left in LEXICON and right in LEXICON:
23
  candidates.append((left, right))
24
+ # Apply reverse sandhi substitutions
 
25
  for ch, expansions in REVERSE_SANDHI_RULES:
26
  if left.endswith(ch):
27
  for exp in expansions:
28
+ l_base = left[:-1] + exp.split("+")[0]
29
  r_base = exp.split("+")[1] + right
30
  if l_base in LEXICON and r_base in LEXICON:
31
  candidates.append((l_base, r_base))
32
+ # Deduplicate
33
  candidates = list(set(candidates))
34
  return candidates or [("No plausible split found", "")]
35
 
36
  def sandhi_splitter(word):
37
+ candidates = generate_candidates(word.strip())
 
 
 
 
38
  formatted = [" + ".join(c) for c in candidates]
39
  return "\n".join(formatted)
40
 
 
41
  with gr.Blocks() as demo:
42
+ gr.Markdown("## Sanskrit Sandhi-Splitter (Prototype)")
43
+ gr.Markdown("Enter a Sanskrit compound word (Devanagari) to see possible splits.")
44
+ inp = gr.Textbox(label="Compound Word (e.g. धर्मक्षेत्रे)")
45
+ out = gr.Textbox(label="Candidate Splits")
46
+ btn = gr.Button("Split Sandhi")
47
+ btn.click(fn=sandhi_splitter, inputs=inp, outputs=out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  if __name__ == "__main__":
50
  demo.launch()