Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,7 +12,27 @@ def encode_text(hindi_text):
|
|
12 |
token_ids = tokenizer.encode(hindi_text)
|
13 |
return token_ids
|
14 |
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
def decode_tokens(token_ids):
|
17 |
"""
|
18 |
Decodes the given token IDs into Hindi text.
|
@@ -36,14 +56,15 @@ with gr.Blocks() as app:
|
|
36 |
gr.Markdown("### Encode Hindi Text to Token IDs")
|
37 |
hindi_text_input = gr.Textbox(label="Enter Hindi Text")
|
38 |
token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
|
|
|
39 |
encode_button = gr.Button("Encode")
|
40 |
|
41 |
# Example for encoding
|
42 |
encode_example = gr.Examples(
|
43 |
examples=["मेरा भारत महान॥", "आपका घर कितनी दूर है?", "स्वतंत्रता दिवस", "द क्विक ब्राउन फॉक्स जम्प्स ओवर ए लेज़ी डॉग।"],
|
44 |
inputs=hindi_text_input,
|
45 |
-
outputs=token_ids_output,
|
46 |
-
fn=
|
47 |
)
|
48 |
|
49 |
with gr.Column():
|
@@ -54,7 +75,11 @@ with gr.Blocks() as app:
|
|
54 |
|
55 |
|
56 |
|
57 |
-
encode_button.click(
|
|
|
|
|
|
|
|
|
58 |
decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)
|
59 |
|
60 |
app.launch()
|
|
|
12 |
token_ids = tokenizer.encode(hindi_text)
|
13 |
return token_ids
|
14 |
|
15 |
+
def encode_text_with_compression(hindi_text):
    """
    Encode Hindi text into token IDs and report a compression ratio.

    The ratio is the number of token IDs divided by the UTF-8 byte length
    of the input (tokens per byte), rendered with two decimal places. An
    empty input has zero bytes, so its ratio is reported as "0.00".

    Returns a tuple ``(token_ids, ratio_text)`` where ``token_ids`` is
    whatever ``tokenizer.encode`` produced and ``ratio_text`` is a string.
    """
    ids = tokenizer.encode(hindi_text)

    # Size of the raw input, measured in UTF-8 bytes rather than characters.
    byte_count = len(hindi_text.encode('utf-8'))

    # Guard the division: zero-length input would otherwise divide by zero.
    ratio = len(ids) / byte_count if byte_count > 0 else 0

    return ids, f"{ratio:.2f}"
|
35 |
+
|
36 |
def decode_tokens(token_ids):
|
37 |
"""
|
38 |
Decodes the given token IDs into Hindi text.
|
|
|
56 |
gr.Markdown("### Encode Hindi Text to Token IDs")
|
57 |
hindi_text_input = gr.Textbox(label="Enter Hindi Text")
|
58 |
token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
|
59 |
+
compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
|
60 |
encode_button = gr.Button("Encode")
|
61 |
|
62 |
# Example for encoding
|
63 |
encode_example = gr.Examples(
|
64 |
examples=["मेरा भारत महान॥", "आपका घर कितनी दूर है?", "स्वतंत्रता दिवस", "द क्विक ब्राउन फॉक्स जम्प्स ओवर ए लेज़ी डॉग।"],
|
65 |
inputs=hindi_text_input,
|
66 |
+
outputs=[token_ids_output, compression_ratio_output],
|
67 |
+
fn=encode_text_with_compression
|
68 |
)
|
69 |
|
70 |
with gr.Column():
|
|
|
75 |
|
76 |
|
77 |
|
78 |
+
encode_button.click(
|
79 |
+
encode_text_with_compression,
|
80 |
+
inputs=hindi_text_input,
|
81 |
+
outputs=[token_ids_output, compression_ratio_output]
|
82 |
+
)
|
83 |
decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)
|
84 |
|
85 |
app.launch()
|