piyushgrover committed on
Commit
30a712b
·
verified ·
1 Parent(s): 758e465

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -4
app.py CHANGED
@@ -12,7 +12,27 @@ def encode_text(hindi_text):
12
  token_ids = tokenizer.encode(hindi_text)
13
  return token_ids
14
 
15
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def decode_tokens(token_ids):
17
  """
18
  Decodes the given token IDs into Hindi text.
@@ -36,14 +56,15 @@ with gr.Blocks() as app:
36
  gr.Markdown("### Encode Hindi Text to Token IDs")
37
  hindi_text_input = gr.Textbox(label="Enter Hindi Text")
38
  token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
 
39
  encode_button = gr.Button("Encode")
40
 
41
  # Example for encoding
42
  encode_example = gr.Examples(
43
  examples=["मेरा भारत महान॥", "आपका घर कितनी दूर है?", "स्वतंत्रता दिवस", "द क्विक ब्राउन फॉक्स जम्प्स ओवर ए लेज़ी डॉग।"],
44
  inputs=hindi_text_input,
45
- outputs=token_ids_output,
46
- fn=encode_text
47
  )
48
 
49
  with gr.Column():
@@ -54,7 +75,11 @@ with gr.Blocks() as app:
54
 
55
 
56
 
57
- encode_button.click(encode_text, inputs=hindi_text_input, outputs=token_ids_output)
 
 
 
 
58
  decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)
59
 
60
  app.launch()
 
12
  token_ids = tokenizer.encode(hindi_text)
13
  return token_ids
14
 
15
+ def encode_text_with_compression(hindi_text):
16
+ """
17
+ Encodes the given Hindi text into token IDs and calculates the compression ratio.
18
+ """
19
+ # Get token IDs
20
+ token_ids = tokenizer.encode(hindi_text)
21
+
22
+ # Calculate the original text size in bytes
23
+ text_byte_length = len(hindi_text.encode('utf-8'))
24
+
25
+ # Calculate the number of token IDs
26
+ token_id_length = len(token_ids)
27
+
28
+ # Compression ratio
29
+ if text_byte_length > 0:
30
+ compression_ratio = token_id_length / text_byte_length
31
+ else:
32
+ compression_ratio = 0 # Handle edge case for empty input
33
+
34
+ return token_ids, f"{compression_ratio:.2f}"
35
+
36
  def decode_tokens(token_ids):
37
  """
38
  Decodes the given token IDs into Hindi text.
 
56
  gr.Markdown("### Encode Hindi Text to Token IDs")
57
  hindi_text_input = gr.Textbox(label="Enter Hindi Text")
58
  token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
59
+ compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
60
  encode_button = gr.Button("Encode")
61
 
62
  # Example for encoding
63
  encode_example = gr.Examples(
64
  examples=["मेरा भारत महान॥", "आपका घर कितनी दूर है?", "स्वतंत्रता दिवस", "द क्विक ब्राउन फॉक्स जम्प्स ओवर ए लेज़ी डॉग।"],
65
  inputs=hindi_text_input,
66
+ outputs=[token_ids_output, compression_ratio_output],
67
+ fn=encode_text_with_compression
68
  )
69
 
70
  with gr.Column():
 
75
 
76
 
77
 
78
+ encode_button.click(
79
+ encode_text_with_compression,
80
+ inputs=hindi_text_input,
81
+ outputs=[token_ids_output, compression_ratio_output]
82
+ )
83
  decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)
84
 
85
  app.launch()