Spaces:

dindizz
/

sandhisplitter

Sleeping

App Files Community

dindizz committed 17 days ago

Commit

394de29

verified ·

1 Parent(s): f90a91e

Upload 3 files

Browse files

Files changed (2) hide show

app.py +11 -20
lexicon.csv +21 -0

app.py CHANGED Viewed

@@ -1,29 +1,21 @@
 import gradio as gr
-# --- Load Sanskrit Lexicon ---
-# In a production setup, you might load from a file with thousands of entries.
-# Here, we include an expanded illustrative lexicon; extend with MW/GRETIL for full coverage.
-LEXICON = {
-    "राम", "वन", "गच्छति", "गुरु", "इन्द्र", "तत्", "अपि",
-    "धर्म", "क्षेत्र", "कुरु", "क्षेत्रे", "अस्ति", "शिव", "शक्ति",
-    "पाणि", "पतिः", "सीता", "लक्ष्मण", "हनुमान", "विष्णु", "देव", "गज"
-    # Add more entries or load from a full CSV
-}
 # --- Expanded Reverse Sandhi Rules ---
-# Format: (left_end, expansions)
-# expansions: "Left+Right" representing the split form.
 REVERSE_SANDHI_RULES = [
-    # Vowel Sandhi
     ("ा", ["अ+अ"]),          # ā -> a + a
     ("े", ["अ+इ", "अ+ई"]),  # e -> a+i or a+ī
     ("ो", ["अ+उ", "अ+ऊ"]),  # o -> a+u or a+ū
-    # Consonant Sandhi: t/d to tt
     ("त्त", ["त्+त", "त्+द"]),
-    # Visarga restoration
-    ("ः", ["ः+"]),
-    # Anusvara restoration (ṃ before consonants)
-    ("ं", ["म्+", "न्+"]),
 ]
 def generate_candidates(word):
@@ -44,7 +36,6 @@ def generate_candidates(word):
                     if l_base in LEXICON and r_base in LEXICON:
                         candidates.append((l_base, r_base))
-    # Deduplicate
     candidates = list(set(candidates))
     return candidates or [("No plausible split found", "")]
@@ -61,7 +52,7 @@ def sandhi_splitter(word):
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=3):
-            gr.Markdown("## Sanskrit Sandhi-Splitter (Prototype, Extended Rules)")
             gr.Markdown(
                 "**Instructions:**\n"
                 "1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
@@ -83,7 +74,7 @@ with gr.Blocks() as demo:
                 "- Multiple possible splits may appear.\n\n"
                 "### Notes\n"
                 "- Uses **rule-based splitting** + lexicon check.\n"
-                "- Limited lexicon in demo – extend with MW/GRETIL for accuracy.\n\n"
                 "**Support:** [email protected]"
             )

 import gradio as gr
+import csv
+# Load lexicon from CSV
+LEXICON = set()
+with open("lexicon.csv", "r", encoding="utf-8") as f:
+    for row in csv.reader(f):
+        if row:
+            LEXICON.add(row[0].strip())
 # --- Expanded Reverse Sandhi Rules ---
 REVERSE_SANDHI_RULES = [
     ("ा", ["अ+अ"]),          # ā -> a + a
     ("े", ["अ+इ", "अ+ई"]),  # e -> a+i or a+ī
     ("ो", ["अ+उ", "अ+ऊ"]),  # o -> a+u or a+ū
     ("त्त", ["त्+त", "त्+द"]),
+    ("ः", ["ः+"]),          # visarga restoration
+    ("ं", ["म्+", "न्+"]),   # anusvara restoration
 ]
 def generate_candidates(word):
                     if l_base in LEXICON and r_base in LEXICON:
                         candidates.append((l_base, r_base))
     candidates = list(set(candidates))
     return candidates or [("No plausible split found", "")]
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=3):
+            gr.Markdown("## Sanskrit Sandhi-Splitter (Extended Prototype)")
             gr.Markdown(
                 "**Instructions:**\n"
                 "1. Enter a **Sanskrit compound word** in Devanagari (e.g. धर्मक्षेत्रे).\n"
                 "- Multiple possible splits may appear.\n\n"
                 "### Notes\n"
                 "- Uses **rule-based splitting** + lexicon check.\n"
+                "- Starter lexicon included (~20 entries); extend with full MW dictionary for accuracy.\n\n"
                 "**Support:** [email protected]"
             )

lexicon.csv ADDED Viewed

	@@ -0,0 +1,21 @@

+राम
+वन
+गच्छति
+गुरु
+इन्द्र
+तत्
+अपि
+धर्म
+क्षेत्र
+कुरु
+अस्ति
+शिव
+शक्ति
+पाणि
+पतिः
+सीता
+लक्ष्मण
+हनुमान
+विष्णु
+देव
+गज