dindizz committed on
Commit
394de29
·
verified ·
1 Parent(s): f90a91e

Upload 3 files

Browse files
Files changed (2) hide show
  1. app.py +11 -20
  2. lexicon.csv +21 -0
app.py CHANGED
@@ -1,29 +1,21 @@
1
  import gradio as gr
 
2
 
3
- # --- Load Sanskrit Lexicon ---
4
- # In a production setup, you might load from a file with thousands of entries.
5
- # Here, we include an expanded illustrative lexicon; extend with MW/GRETIL for full coverage.
6
- LEXICON = {
7
- "राम", "वन", "गच्छति", "गुरु", "इन्द्र", "तत्", "अपि",
8
- "धर्म", "क्षेत्र", "कुरु", "क्षेत्रे", "अस्ति", "शिव", "शक्ति",
9
- "पाणि", "पतिः", "सीता", "लक्ष्मण", "हनुमान", "विष्णु", "देव", "गज"
10
- # Add more entries or load from a full CSV
11
- }
12
 
13
  # --- Expanded Reverse Sandhi Rules ---
14
- # Format: (left_end, expansions)
15
- # expansions: "Left+Right" representing the split form.
16
  REVERSE_SANDHI_RULES = [
17
- # Vowel Sandhi
18
  ("ा", ["अ+अ"]), # ā -> a + a
19
  ("े", ["अ+इ", "अ+ई"]), # e -> a+i or a+ī
20
  ("ो", ["अ+उ", "अ+ऊ"]), # o -> a+u or a+ū
21
- # Consonant Sandhi: t/d to tt
22
  ("त्त", ["त्+त", "त्+द"]),
23
- # Visarga restoration
24
- ("", ["ः+"]),
25
- # Anusvara restoration (ṃ before consonants)
26
- ("ं", ["म्+", "न्+"]),
27
  ]
28
 
29
  def generate_candidates(word):
@@ -44,7 +36,6 @@ def generate_candidates(word):
44
  if l_base in LEXICON and r_base in LEXICON:
45
  candidates.append((l_base, r_base))
46
 
47
- # Deduplicate
48
  candidates = list(set(candidates))
49
  return candidates or [("No plausible split found", "")]
50
 
@@ -61,7 +52,7 @@ def sandhi_splitter(word):
61
  with gr.Blocks() as demo:
62
  with gr.Row():
63
  with gr.Column(scale=3):
64
- gr.Markdown("## Sanskrit Sandhi-Splitter (Prototype, Extended Rules)")
65
  gr.Markdown(
66
  "**Instructions:**\n"
67
  "1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
@@ -83,7 +74,7 @@ with gr.Blocks() as demo:
83
  "- Multiple possible splits may appear.\n\n"
84
  "### Notes\n"
85
  "- Uses **rule-based splitting** + lexicon check.\n"
86
- "- Limited lexicon in demo extend with MW/GRETIL for accuracy.\n\n"
87
  "**Support:** [email protected]"
88
  )
89
 
 
1
  import gradio as gr
2
+ import csv
3
 
4
+ # Load lexicon from CSV
5
+ LEXICON = set()
6
+ with open("lexicon.csv", "r", encoding="utf-8") as f:
7
+ for row in csv.reader(f):
8
+ if row:
9
+ LEXICON.add(row[0].strip())
 
 
 
10
 
11
  # --- Expanded Reverse Sandhi Rules ---
 
 
12
  REVERSE_SANDHI_RULES = [
 
13
  ("ा", ["अ+अ"]), # ā -> a + a
14
  ("े", ["अ+इ", "अ+ई"]), # e -> a+i or a+ī
15
  ("ो", ["अ+उ", "अ+ऊ"]), # o -> a+u or a+ū
 
16
  ("त्त", ["त्+त", "त्+द"]),
17
+ ("ः", ["ः+"]), # visarga restoration
18
+ ("", ["म्+", "न्+"]), # anusvara restoration
 
 
19
  ]
20
 
21
  def generate_candidates(word):
 
36
  if l_base in LEXICON and r_base in LEXICON:
37
  candidates.append((l_base, r_base))
38
 
 
39
  candidates = list(set(candidates))
40
  return candidates or [("No plausible split found", "")]
41
 
 
52
  with gr.Blocks() as demo:
53
  with gr.Row():
54
  with gr.Column(scale=3):
55
+ gr.Markdown("## Sanskrit Sandhi-Splitter (Extended Prototype)")
56
  gr.Markdown(
57
  "**Instructions:**\n"
58
  "1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
 
74
  "- Multiple possible splits may appear.\n\n"
75
  "### Notes\n"
76
  "- Uses **rule-based splitting** + lexicon check.\n"
77
+ "- Starter lexicon included (~20 entries); extend with full MW dictionary for accuracy.\n\n"
78
  "**Support:** [email protected]"
79
  )
80
 
lexicon.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ राम
2
+ वन
3
+ गच्छति
4
+ गुरु
5
+ इन्द्र
6
+ तत्
7
+ अपि
8
+ धर्म
9
+ क्षेत्र
10
+ कुरु
11
+ अस्ति
12
+ शिव
13
+ शक्ति
14
+ पाणि
15
+ पतिः
16
+ सीता
17
+ लक्ष्मण
18
+ हनुमान
19
+ विष्णु
20
+ देव
21
+ गज