import torch
import torch.nn as nn
import numpy as np
import random
import enum
import traceback
import os
import sys
import json

F_DIR = os.path.dirname(os.path.realpath(__file__))


class XlitError(enum.Enum):
    lang_err = "Unsupported language ID requested ;( Please check available languages."
    string_err = "String passed is incompatible ;("
    internal_err = "Internal crash ;("
    unknown_err = "Unknown Failure"
    loading_err = "Loading failed ;( Check if metadata/paths are correctly configured."


class Encoder(nn.Module):
    """
    Simple RNN based encoder network
    """

    def __init__(
        self,
        input_dim,
        embed_dim,
        hidden_dim,
        rnn_type="gru",
        layers=1,
        bidirectional=False,
        dropout=0,
        device="cpu",
    ):
        super(Encoder, self).__init__()

        self.input_dim = input_dim  # src_vocab_sz
        self.enc_embed_dim = embed_dim
        self.enc_hidden_dim = hidden_dim
        self.enc_rnn_type = rnn_type
        self.enc_layers = layers
        self.enc_directions = 2 if bidirectional else 1
        self.device = device

        self.embedding = nn.Embedding(self.input_dim, self.enc_embed_dim)

        if self.enc_rnn_type == "gru":
            self.enc_rnn = nn.GRU(
                input_size=self.enc_embed_dim,
                hidden_size=self.enc_hidden_dim,
                num_layers=self.enc_layers,
                bidirectional=bidirectional,
            )
        elif self.enc_rnn_type == "lstm":
            self.enc_rnn = nn.LSTM(
                input_size=self.enc_embed_dim,
                hidden_size=self.enc_hidden_dim,
                num_layers=self.enc_layers,
                bidirectional=bidirectional,
            )
        else:
            raise Exception("Unknown RNN type mentioned; use 'gru' or 'lstm'")

    def forward(self, x, x_sz, hidden=None):
        """
        x_sz: (batch_size, 1) - Unpadded sequence lengths used for pack_pad
        Return:
            output: (batch_size, max_length, hidden_dim * num_directions)
            hidden: (n_layers * num_directions, batch_size, hidden_dim) | if LSTM, tuple (h_n, c_n)
        """
        batch_sz = x.shape[0]
        # x: batch_size, max_length, enc_embed_dim
        x = self.embedding(x)

        ## pack the padded data
        # x: max_length, batch_size, enc_embed_dim -> for pack_pad
        x = x.permute(1, 0, 2)
        x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False)  # unpad

        # output: packed_size, enc_hidden_dim*directions --> hidden states from all timesteps
        # hidden: n_layers*num_directions, batch_size, hidden_dim | if LSTM (h_n, c_n)
        output, hidden = self.enc_rnn(x)

        ## pad the sequence to the max length in the batch
        # output: max_length, batch_size, enc_hidden_dim*directions
        output, _ = nn.utils.rnn.pad_packed_sequence(output)

        # output: batch_size, max_length, enc_hidden_dim*directions
        output = output.permute(1, 0, 2)

        return output, hidden
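
# Illustrative shape sketch for the Encoder above (comments only, with hypothetical
# dimensions, so nothing runs at import time):
#   enc = Encoder(input_dim=60, embed_dim=300, hidden_dim=512, bidirectional=True)
#   x = torch.randint(0, 60, (4, 10))               # (batch_size=4, max_length=10) padded ids
#   out, hid = enc(x, torch.tensor([10, 8, 7, 5]))  # unpadded lengths per sample
#   out.shape -> (4, 10, 1024)                      # batch, max_length, hidden_dim * 2 directions
#   hid.shape -> (2, 4, 512)                        # layers * directions, batch, hidden_dim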
class Decoder(nn.Module):
    """
    Used as decoder stage
    """

    def __init__(
        self,
        output_dim,
        embed_dim,
        hidden_dim,
        rnn_type="gru",
        layers=1,
        use_attention=True,
        enc_outstate_dim=None,  # enc_directions * enc_hidden_dim
        dropout=0,
        device="cpu",
    ):
        super(Decoder, self).__init__()

        self.output_dim = output_dim  # tgt_vocab_sz
        self.dec_hidden_dim = hidden_dim
        self.dec_embed_dim = embed_dim
        self.dec_rnn_type = rnn_type
        self.dec_layers = layers
        self.use_attention = use_attention
        self.device = device
        if self.use_attention:
            self.enc_outstate_dim = enc_outstate_dim if enc_outstate_dim else hidden_dim
        else:
            self.enc_outstate_dim = 0

        self.embedding = nn.Embedding(self.output_dim, self.dec_embed_dim)

        if self.dec_rnn_type == "gru":
            self.dec_rnn = nn.GRU(
                input_size=self.dec_embed_dim
                + self.enc_outstate_dim,  # to concat attention_output
                hidden_size=self.dec_hidden_dim,  # previous hidden
                num_layers=self.dec_layers,
                batch_first=True,
            )
        elif self.dec_rnn_type == "lstm":
            self.dec_rnn = nn.LSTM(
                input_size=self.dec_embed_dim
                + self.enc_outstate_dim,  # to concat attention_output
                hidden_size=self.dec_hidden_dim,  # previous hidden
                num_layers=self.dec_layers,
                batch_first=True,
            )
        else:
            raise Exception("Unknown RNN type mentioned; use 'gru' or 'lstm'")

        self.fc = nn.Sequential(
            nn.Linear(self.dec_hidden_dim, self.dec_embed_dim),
            nn.LeakyReLU(),
            # nn.Linear(self.dec_embed_dim, self.dec_embed_dim), nn.LeakyReLU(), # removed to reduce size
            nn.Linear(self.dec_embed_dim, self.output_dim),
        )

        ##----- Attention ----------
        if self.use_attention:
            self.W1 = nn.Linear(self.enc_outstate_dim, self.dec_hidden_dim)
            self.W2 = nn.Linear(self.dec_hidden_dim, self.dec_hidden_dim)
            self.V = nn.Linear(self.dec_hidden_dim, 1)

    def attention(self, x, hidden, enc_output):
        """
        x: (batch_size, 1, dec_embed_dim) -> after Embedding
        enc_output: (batch_size, max_length, enc_hidden_dim * num_directions)
        hidden: (n_layers, batch_size, hidden_size) | if LSTM (h_n, c_n)
        """
        ## perform addition to calculate the score

        # hidden_with_time_axis: batch_size, 1, hidden_dim
        ## hidden_with_time_axis = hidden.permute(1, 0, 2) ## replaced with the 2 lines below
        hidden_with_time_axis = torch.sum(hidden, dim=0)
        hidden_with_time_axis = hidden_with_time_axis.unsqueeze(1)

        # score: batch_size, max_length, hidden_dim
        score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))

        # attention_weights: batch_size, max_length, 1
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = torch.softmax(self.V(score), dim=1)

        # context_vector shape after sum == (batch_size, hidden_dim)
        context_vector = attention_weights * enc_output
        context_vector = torch.sum(context_vector, dim=1)
        # context_vector: batch_size, 1, hidden_dim
        context_vector = context_vector.unsqueeze(1)

        # attend_out: (batch_size, 1, dec_embed_dim + hidden_size)
        attend_out = torch.cat((context_vector, x), -1)

        return attend_out, attention_weights

    def forward(self, x, hidden, enc_output):
        """
        x: (batch_size, 1)
        enc_output: (batch_size, max_length, enc_hidden_dim * num_directions)
        hidden: (n_layers, batch_size, hidden_size) | lstm: (h_n, c_n)
        """
        if (hidden is None) and (self.use_attention is False):
            raise Exception("No use of a decoder with no attention and no hidden state")

        batch_sz = x.shape[0]

        if hidden is None:
            # hidden: n_layers, batch_size, hidden_dim
            hid_for_att = torch.zeros(
                (self.dec_layers, batch_sz, self.dec_hidden_dim)
            ).to(self.device)
        elif self.dec_rnn_type == "lstm":
            hid_for_att = hidden[0]  # h_n
        else:
            hid_for_att = hidden

        # x: (batch_size, 1, dec_embed_dim) -> after embedding
        x = self.embedding(x)

        if self.use_attention:
            # x: (batch_size, 1, dec_embed_dim + hidden_size) -> after attention
            # aw: (batch_size, max_length, 1)
            x, aw = self.attention(x, hid_for_att, enc_output)
        else:
            x, aw = x, 0

        # passing the concatenated vector to the GRU/LSTM
        # output: (batch_size, 1, hidden_size)
        # hidden: (n_layers, batch_size, hidden_size) | if LSTM (h_n, c_n)
        output, hidden = (
            self.dec_rnn(x, hidden) if hidden is not None else self.dec_rnn(x)
        )

        # output :shp: (batch_size * 1, hidden_size)
        output = output.view(-1, output.size(2))

        # output :shp: (batch_size * 1, output_dim)
        output = self.fc(output)

        return output, hidden, aw
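
# Note: the attention implemented above is additive (Bahdanau-style) scoring:
#     score = V( tanh( W1·enc_output + W2·dec_hidden ) )
# softmax over the source-length axis (dim=1) gives the attention weights, and the
# context vector is the attention-weighted sum of encoder outputs, which is
# concatenated with the decoder input embedding before the RNN step.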
class Seq2Seq(nn.Module):
    """
    Used to construct seq2seq architecture with encoder decoder objects
    """

    def __init__(
        self, encoder, decoder, pass_enc2dec_hid=False, dropout=0, device="cpu"
    ):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.pass_enc2dec_hid = pass_enc2dec_hid

        if self.pass_enc2dec_hid:
            assert (
                decoder.dec_hidden_dim == encoder.enc_hidden_dim
            ), "Hidden dimensions of encoder and decoder must be the same, or unset `pass_enc2dec_hid`"
        if decoder.use_attention:
            assert (
                decoder.enc_outstate_dim
                == encoder.enc_directions * encoder.enc_hidden_dim
            ), "Set `enc_outstate_dim` correctly in decoder"
        assert (
            self.pass_enc2dec_hid or decoder.use_attention
        ), "No use of a decoder with no attention and no hidden state from encoder"

    def forward(self, src, tgt, src_sz, teacher_forcing_ratio=0):
        """
        src: (batch_size, sequence_len.padded)
        tgt: (batch_size, sequence_len.padded)
        src_sz: [batch_size, 1] - Unpadded sequence lengths
        """
        batch_size = tgt.shape[0]

        # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction)
        # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim)
        enc_output, enc_hidden = self.encoder(src, src_sz)

        if self.pass_enc2dec_hid:
            # dec_hidden: dec_layers, batch_size, dec_hidden_dim
            dec_hidden = enc_hidden
        else:
            # dec_hidden -> will be initialized to zeros internally
            dec_hidden = None

        # pred_vecs: (batch_size, output_dim, sequence_sz) -> shape required for CELoss
        pred_vecs = torch.zeros(batch_size, self.decoder.output_dim, tgt.size(1)).to(
            self.device
        )

        # dec_input: (batch_size, 1)
        dec_input = tgt[:, 0].unsqueeze(1)  # initialize to start token
        pred_vecs[:, 1, 0] = 1  # mark the start token at position 0 for all batches
        for t in range(1, tgt.size(1)):
            # dec_hidden: dec_layers, batch_size, dec_hidden_dim
            # dec_output: batch_size, output_dim
            # dec_input: (batch_size, 1)
            dec_output, dec_hidden, _ = self.decoder(
                dec_input,
                dec_hidden,
                enc_output,
            )
            pred_vecs[:, :, t] = dec_output

            # prediction: batch_size
            prediction = torch.argmax(dec_output, dim=1)

            # Teacher Forcing
            if random.random() < teacher_forcing_ratio:
                dec_input = tgt[:, t].unsqueeze(1)
            else:
                dec_input = prediction.unsqueeze(1)

        return pred_vecs  # (batch_size, output_dim, sequence_sz)
    def inference(self, src, max_tgt_sz=50, debug=0):
        """
        Single input only, no batch inferencing
        src: (sequence_len)
        debug: if True, also return attention weights
        """
        batch_size = 1
        start_tok = src[0]
        end_tok = src[-1]
        src_sz = torch.tensor([len(src)])
        src_ = src.unsqueeze(0)

        # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction)
        # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim)
        enc_output, enc_hidden = self.encoder(src_, src_sz)

        if self.pass_enc2dec_hid:
            # dec_hidden: dec_layers, batch_size, dec_hidden_dim
            dec_hidden = enc_hidden
        else:
            # dec_hidden -> will be initialized to zeros internally
            dec_hidden = None

        # pred_arr: (max_tgt_sz, 1)
        pred_arr = torch.zeros(max_tgt_sz, 1).to(self.device)
        if debug:
            attend_weight_arr = torch.zeros(max_tgt_sz, len(src)).to(self.device)

        # dec_input: (batch_size, 1)
        dec_input = start_tok.view(1, 1)  # initialize to start token
        pred_arr[0] = start_tok.view(1, 1)  # initialize to start token
        for t in range(max_tgt_sz):
            # dec_hidden: dec_layers, batch_size, dec_hidden_dim
            # dec_output: batch_size, output_dim
            # dec_input: (batch_size, 1)
            dec_output, dec_hidden, aw = self.decoder(
                dec_input,
                dec_hidden,
                enc_output,
            )
            # prediction :shp: (1,)
            prediction = torch.argmax(dec_output, dim=1)
            dec_input = prediction.unsqueeze(1)
            pred_arr[t] = prediction
            if debug:
                attend_weight_arr[t] = aw.squeeze(-1)

            if torch.eq(prediction, end_tok):
                break

        if debug:
            return pred_arr.squeeze(), attend_weight_arr

        # pred_arr :shp: (sequence_len)
        return pred_arr.squeeze().to(dtype=torch.long)
    def active_beam_inference(self, src, beam_width=3, max_tgt_sz=50):
        """Active beam search based decoding
        src: (sequence_len)
        """

        def _avg_score(p_tup):
            """Used for sorting
            TODO: divide by (sequence length ** alpha), with alpha as a hyperparameter
            """
            return p_tup[0]

        batch_size = 1
        start_tok = src[0]
        end_tok = src[-1]
        src_sz = torch.tensor([len(src)])
        src_ = src.unsqueeze(0)

        # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction)
        # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim)
        enc_output, enc_hidden = self.encoder(src_, src_sz)

        if self.pass_enc2dec_hid:
            # dec_hidden: dec_layers, batch_size, dec_hidden_dim
            init_dec_hidden = enc_hidden
        else:
            # dec_hidden -> will be initialized to zeros internally
            init_dec_hidden = None

        # top_pred[][0] = Σ log_softmax (accumulated log-probability)
        # top_pred[][1] = sequence torch.tensor shape: (1)
        # top_pred[][2] = dec_hidden
        top_pred_list = [(0, start_tok.unsqueeze(0), init_dec_hidden)]

        for t in range(max_tgt_sz):
            cur_pred_list = []
            for p_tup in top_pred_list:
                if p_tup[1][-1] == end_tok:
                    cur_pred_list.append(p_tup)
                    continue

                # dec_hidden: dec_layers, 1, hidden_dim
                # dec_output: 1, output_dim
                dec_output, dec_hidden, _ = self.decoder(
                    x=p_tup[1][-1].view(1, 1),  # dec_input: (1,1)
                    hidden=p_tup[2],
                    enc_output=enc_output,
                )

                ## π{prob} = Σ{log(prob)} -> sum log-probs to prevent underflow
                # dec_output: (1, output_dim)
                dec_output = nn.functional.log_softmax(dec_output, dim=1)
                # pred_topk.values & pred_topk.indices: (1, beam_width)
                pred_topk = torch.topk(dec_output, k=beam_width, dim=1)

                for i in range(beam_width):
                    sig_logsmx_ = p_tup[0] + pred_topk.values[0][i]
                    # seq_tensor_ : (seq_len)
                    seq_tensor_ = torch.cat((p_tup[1], pred_topk.indices[0][i].view(1)))

                    cur_pred_list.append((sig_logsmx_, seq_tensor_, dec_hidden))

            cur_pred_list.sort(key=_avg_score, reverse=True)  # maximized order
            top_pred_list = cur_pred_list[:beam_width]

            # check if all topk candidates end with end_tok
            end_flags_ = [1 if t[1][-1] == end_tok else 0 for t in top_pred_list]
            if beam_width == sum(end_flags_):
                break

        pred_tnsr_list = [t[1] for t in top_pred_list]

        return pred_tnsr_list
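
    # Note: `active_beam_inference` returns up to `beam_width` candidate sequences,
    # already ordered best-first by accumulated log-probability (the descending sort
    # above), so callers such as XlitPiston.character_model can treat index 0 as the
    # top hypothesis.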
    def passive_beam_inference(self, src, beam_width=7, max_tgt_sz=50):
        """
        Passive beam search based inference
        src: (sequence_len)
        """

        def _avg_score(p_tup):
            """Used for sorting
            TODO: divide by (sequence length ** alpha), with alpha as a hyperparameter
            """
            return p_tup[0]

        def _beam_search_topk(topk_obj, start_tok, beam_width):
            """Search for the sequences with maximum probability
            topk_obj[x]: .values & .indices shape:(1, beam_width)
            """
            # top_pred_list[x]: tuple(prob, seq_tensor)
            top_pred_list = [
                (0, start_tok.unsqueeze(0)),
            ]

            for obj in topk_obj:
                new_lst_ = list()
                for itm in top_pred_list:
                    for i in range(beam_width):
                        sig_logsmx_ = itm[0] + obj.values[0][i]
                        seq_tensor_ = torch.cat((itm[1], obj.indices[0][i].view(1)))
                        new_lst_.append((sig_logsmx_, seq_tensor_))

                new_lst_.sort(key=_avg_score, reverse=True)
                top_pred_list = new_lst_[:beam_width]
            return top_pred_list

        batch_size = 1
        start_tok = src[0]
        end_tok = src[-1]
        src_sz = torch.tensor([len(src)])
        src_ = src.unsqueeze(0)

        enc_output, enc_hidden = self.encoder(src_, src_sz)

        if self.pass_enc2dec_hid:
            # dec_hidden: dec_layers, batch_size, dec_hidden_dim
            dec_hidden = enc_hidden
        else:
            # dec_hidden -> will be initialized to zeros internally
            dec_hidden = None

        # dec_input: (1, 1)
        dec_input = start_tok.view(1, 1)  # initialize to start token

        topk_obj = []
        for t in range(max_tgt_sz):
            dec_output, dec_hidden, aw = self.decoder(
                dec_input,
                dec_hidden,
                enc_output,
            )

            ## π{prob} = Σ{log(prob)} -> sum log-probs to prevent underflow
            # dec_output: (1, output_dim)
            dec_output = nn.functional.log_softmax(dec_output, dim=1)
            # pred_topk.values & pred_topk.indices: (1, beam_width)
            pred_topk = torch.topk(dec_output, k=beam_width, dim=1)

            topk_obj.append(pred_topk)

            # dec_input: (1, 1)
            dec_input = pred_topk.indices[0][0].view(1, 1)
            if torch.eq(dec_input, end_tok):
                break

        top_pred_list = _beam_search_topk(topk_obj, start_tok, beam_width)
        pred_tnsr_list = [t[1] for t in top_pred_list]

        return pred_tnsr_list
class GlyphStrawboss:
    def __init__(self, glyphs="en"):
        """List of letters of a language in unicode
        glyphs: "en" for the built-in English set, or path to a JSON glyph config
        """
        if glyphs == "en":
            # lowercase alone
            self.glyphs = [chr(alpha) for alpha in range(97, 123)] + ["é", "è", "á"]
        else:
            self.dossier = json.load(open(glyphs, encoding="utf-8"))
            self.numsym_map = self.dossier["numsym_map"]
            self.glyphs = self.dossier["glyphs"]

        self.indoarab_num = [chr(alpha) for alpha in range(48, 58)]

        self.char2idx = {}
        self.idx2char = {}
        self._create_index()

    def _create_index(self):
        self.char2idx["_"] = 0  # pad
        self.char2idx["$"] = 1  # start
        self.char2idx["#"] = 2  # end
        self.char2idx["*"] = 3  # Mask
        self.char2idx["'"] = 4  # apostrophe U+0027
        self.char2idx["%"] = 5  # unused
        self.char2idx["!"] = 6  # unused
        self.char2idx["?"] = 7
        self.char2idx[":"] = 8
        self.char2idx[" "] = 9
        self.char2idx["-"] = 10
        self.char2idx[","] = 11
        self.char2idx["."] = 12
        self.char2idx["("] = 13
        self.char2idx[")"] = 14
        self.char2idx["/"] = 15
        self.char2idx["^"] = 16

        for idx, char in enumerate(self.indoarab_num):
            self.char2idx[char] = idx + 17

        # letter to index mapping
        for idx, char in enumerate(self.glyphs):
            self.char2idx[char] = idx + 27  # offset 27 = 17 special tokens + 10 digits

        # index to letter mapping
        for char, idx in self.char2idx.items():
            self.idx2char[idx] = char

    def size(self):
        return len(self.char2idx)

    def word2xlitvec(self, word):
        """Converts a given string of glyphs (word) to a numpy vector
        Also adds tokens for start and end
        """
        try:
            vec = [self.char2idx["$"]]  # start token
            for i in list(word):
                vec.append(self.char2idx[i])
            vec.append(self.char2idx["#"])  # end token

            vec = np.asarray(vec, dtype=np.int64)
            return vec

        except Exception as error:
            print("Error in word:", word, "| char not in token map:", error)
            sys.exit()

    def xlitvec2word(self, vector):
        """Converts a numpy vector to a string of glyphs (word)"""
        char_list = []
        for i in vector:
            char_list.append(self.idx2char[i])

        word = "".join(char_list).replace("$", "").replace("#", "")  # remove tokens
        word = word.replace("_", "").replace("*", "")  # remove tokens
        return word
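
# Illustrative index mapping for the default English GlyphStrawboss (comments only;
# the values follow directly from _create_index above): "a".."z" occupy indices 27..52, so
#   GlyphStrawboss("en").word2xlitvec("ab")           -> array([ 1, 27, 28,  2])  # $, a, b, #
#   GlyphStrawboss("en").xlitvec2word([1, 27, 28, 2]) -> "ab"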
class XlitPiston:
    """
    For handling prediction & post-processing of transliteration for a single language
    Class dependency: Seq2Seq, GlyphStrawboss
    Global Variables: F_DIR
    """

    def __init__(
        self, weight_path, tglyph_cfg_file, iglyph_cfg_file="en", device="cpu"
    ):
        self.device = device
        self.in_glyph_obj = GlyphStrawboss(iglyph_cfg_file)
        self.tgt_glyph_obj = GlyphStrawboss(glyphs=tglyph_cfg_file)
        self._numsym_set = set(
            json.load(open(tglyph_cfg_file, encoding="utf-8"))["numsym_map"].keys()
        )
        self._inchar_set = set("abcdefghijklmnopqrstuvwxyzéèá")
        self._natscr_set = set().union(
            self.tgt_glyph_obj.glyphs, sum(self.tgt_glyph_obj.numsym_map.values(), [])
        )

        ## Static model config. TODO: support defining this in the JSON config
        input_dim = self.in_glyph_obj.size()
        output_dim = self.tgt_glyph_obj.size()
        enc_emb_dim = 300
        dec_emb_dim = 300
        enc_hidden_dim = 512
        dec_hidden_dim = 512
        rnn_type = "lstm"
        enc2dec_hid = True
        attention = True
        enc_layers = 1
        dec_layers = 2
        m_dropout = 0
        enc_bidirect = True
        enc_outstate_dim = enc_hidden_dim * (2 if enc_bidirect else 1)

        enc = Encoder(
            input_dim=input_dim,
            embed_dim=enc_emb_dim,
            hidden_dim=enc_hidden_dim,
            rnn_type=rnn_type,
            layers=enc_layers,
            dropout=m_dropout,
            device=self.device,
            bidirectional=enc_bidirect,
        )
        dec = Decoder(
            output_dim=output_dim,
            embed_dim=dec_emb_dim,
            hidden_dim=dec_hidden_dim,
            rnn_type=rnn_type,
            layers=dec_layers,
            dropout=m_dropout,
            use_attention=attention,
            enc_outstate_dim=enc_outstate_dim,
            device=self.device,
        )
        self.model = Seq2Seq(enc, dec, pass_enc2dec_hid=enc2dec_hid, device=self.device)
        self.model = self.model.to(self.device)
        weights = torch.load(weight_path, map_location=torch.device(self.device))

        self.model.load_state_dict(weights)
        self.model.eval()

    def character_model(self, word, beam_width=1):
        in_vec = torch.from_numpy(self.in_glyph_obj.word2xlitvec(word)).to(self.device)
        ## change to active or passive beam
        p_out_list = self.model.active_beam_inference(in_vec, beam_width=beam_width)
        result = [
            self.tgt_glyph_obj.xlitvec2word(out.cpu().numpy()) for out in p_out_list
        ]

        # List type
        return result

    def numsym_model(self, seg):
        """tgt_glyph_obj.numsym_map[x] returns a list object"""
        if len(seg) == 1:
            return [seg] + self.tgt_glyph_obj.numsym_map[seg]

        a = [self.tgt_glyph_obj.numsym_map[n][0] for n in seg]
        return [seg] + ["".join(a)]

    def _word_segementer(self, sequence):
        sequence = sequence.lower()
        accepted = set().union(self._numsym_set, self._inchar_set, self._natscr_set)
        # sequence = ''.join([i for i in sequence if i in accepted])

        segment = []
        idx = 0
        seq_ = list(sequence)
        while len(seq_):
            # for Number-Symbol
            temp = ""
            while len(seq_) and seq_[0] in self._numsym_set:
                temp += seq_[0]
                seq_.pop(0)
            if temp != "":
                segment.append(temp)

            # for Target Chars
            temp = ""
            while len(seq_) and seq_[0] in self._natscr_set:
                temp += seq_[0]
                seq_.pop(0)
            if temp != "":
                segment.append(temp)

            # for Input-Roman Chars
            temp = ""
            while len(seq_) and seq_[0] in self._inchar_set:
                temp += seq_[0]
                seq_.pop(0)
            if temp != "":
                segment.append(temp)

            # for unaccepted characters, passed through untouched
            temp = ""
            while len(seq_) and seq_[0] not in accepted:
                temp += seq_[0]
                seq_.pop(0)
            if temp != "":
                segment.append(temp)

        return segment
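
    # Illustrative behaviour of _word_segementer (comments only; this assumes the loaded
    # target glyph config has the digits "0"-"9" as keys of its numsym_map, which is an
    # assumption about the config rather than a guarantee):
    #   _word_segementer("namaste123")  ->  ["namaste", "123"]
    # "namaste" is then routed to character_model and "123" to numsym_model by
    # inferencer below.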
    def inferencer(self, sequence, beam_width=10):
        seg = self._word_segementer(sequence[:120])
        lit_seg = []

        p = 0
        while p < len(seg):
            if seg[p][0] in self._natscr_set:
                lit_seg.append([seg[p]])
                p += 1

            elif seg[p][0] in self._inchar_set:
                lit_seg.append(self.character_model(seg[p], beam_width=beam_width))
                p += 1

            elif seg[p][0] in self._numsym_set:  # num & punc
                lit_seg.append(self.numsym_model(seg[p]))
                p += 1
            else:
                lit_seg.append([seg[p]])
                p += 1

        ## IF there are 2 or fewer segments, return their combinatorial products;
        ## ELSE return only the top-1 of each segment, concatenated
        if len(lit_seg) == 1:
            final_result = lit_seg[0]

        elif len(lit_seg) == 2:
            final_result = [""]
            for seg in lit_seg:
                new_result = []
                for s in seg:
                    for f in final_result:
                        new_result.append(f + s)
                final_result = new_result

        else:
            new_result = []
            for seg in lit_seg:
                new_result.append(seg[0])
            final_result = ["".join(new_result)]

        return final_result
class XlitEngine:
    """
    For managing the top level tasks and applications of transliteration
    Global Variables: F_DIR
    """

    def __init__(self, lang2use="hi", config_path="models/default_lineup.json"):
        lineup = json.load(open(os.path.join(F_DIR, config_path), encoding="utf-8"))
        models_path = os.path.join(F_DIR, "models")
        self.lang_config = {}
        if lang2use in lineup:
            self.lang_config[lang2use] = lineup[lang2use]
        else:
            raise Exception(
                "XlitError: The entered language code was not found. Available: {}".format(
                    lineup.keys()
                )
            )

        self.langs = {}
        self.lang_model = {}
        for la in self.lang_config:
            try:
                print("Loading {}...".format(la))
                self.lang_model[la] = XlitPiston(
                    weight_path=os.path.join(
                        models_path, self.lang_config[la]["weight"]
                    ),
                    tglyph_cfg_file=os.path.join(
                        models_path, self.lang_config[la]["script"]
                    ),
                    iglyph_cfg_file="en",
                )
                self.langs[la] = self.lang_config[la]["name"]
            except Exception as error:
                print("XlitError: Failure in loading {} \n".format(la), error)
                print(XlitError.loading_err.value)

    def translit_word(self, eng_word, lang_code="hi", topk=7, beam_width=10):
        if eng_word == "":
            return []

        if lang_code in self.langs:
            try:
                res_list = self.lang_model[lang_code].inferencer(
                    eng_word, beam_width=beam_width
                )
                return res_list[:topk]

            except Exception as error:
                print("XlitError:", traceback.format_exc())
                print(XlitError.internal_err.value)
                return XlitError.internal_err
        else:
            print("XlitError: Unknown language requested", lang_code)
            print(XlitError.lang_err.value)
            return XlitError.lang_err

    def translit_sentence(self, eng_sentence, lang_code="hi", beam_width=10):
        if eng_sentence == "":
            return []

        if lang_code in self.langs:
            try:
                out_str = ""
                for word in eng_sentence.split():
                    res_ = self.lang_model[lang_code].inferencer(
                        word, beam_width=beam_width
                    )
                    out_str = out_str + res_[0] + " "
                return out_str[:-1]

            except Exception as error:
                print("XlitError:", traceback.format_exc())
                print(XlitError.internal_err.value)
                return XlitError.internal_err

        else:
            print("XlitError: Unknown language requested", lang_code)
            print(XlitError.lang_err.value)
            return XlitError.lang_err


if __name__ == "__main__":
    engine = XlitEngine()
    y = engine.translit_sentence("Hello World !")
    print(y)
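
    # Illustrative word-level call (a sketch, assuming the default "hi" model and its
    # weights were loaded successfully by XlitEngine() above): translit_word returns
    # up to `topk` candidate transliterations, best first.
    candidates = engine.translit_word("namaste", lang_code="hi", topk=5, beam_width=10)
    print(candidates)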