Spaces:
Sleeping
Sleeping
| # Standard library imports | |
| import warnings | |
| from typing import List, Dict, Union | |
class TokenizerUtils:
    """
    Utility class for token-related operations, in particular for identifying
    vocabulary tokens that contain numerals or specific symbols.

    The class holds no state. Its helper is a static method, so it can be
    called either on the class itself or on an instance.

    Methods
    -------
    find_numeral_symbol_tokens(tokenizer)
        Returns a list of token IDs whose tokens include numerals or the
        symbols '%', '$', or '£'.
    """

    def __init__(self):
        """Initialize the TokenizerUtils class. Present for completeness; sets no state."""
        pass

    @staticmethod
    def find_numeral_symbol_tokens(tokenizer) -> List[int]:
        """
        Identify tokens in the tokenizer vocabulary that contain numerals or
        certain symbols.

        Parameters
        ----------
        tokenizer : Any
            Object exposing a ``get_vocab()`` method that returns a mapping of
            token string to token ID (e.g. a Hugging Face tokenizer).

        Returns
        -------
        List[int]
            ``-1`` followed by the IDs of every token containing at least one
            of the characters ``0123456789%$£``, in vocabulary iteration order.

        Notes
        -----
        The leading ``-1`` is always present, even for an empty vocabulary.
        NOTE(review): it is presumably a sentinel interpreted by the consumer
        of this list -- confirm against the caller.

        Examples
        --------
        Illustrative output only; actual IDs depend on the vocabulary.

        >>> TokenizerUtils.find_numeral_symbol_tokens(tokenizer)
        [-1, 123, 456, 789]
        """
        # Characters whose presence anywhere in a token flags that token.
        flagged_chars = "0123456789%$£"

        numeral_symbol_tokens = [-1]
        numeral_symbol_tokens.extend(
            token_id
            for token, token_id in tokenizer.get_vocab().items()
            if any(char in flagged_chars for char in token)
        )
        return numeral_symbol_tokens
class Formatter:
    """
    A utility class for formatting audio-related data, such as
    sentence-speaker mappings (SSM).

    All helpers are static methods, so they can be called on the class or on
    an instance.

    Methods
    -------
    add_indices_to_ssm(ssm: List[Dict], reference_length: int = None) -> List[Dict]:
        Adds an index key to each item in the SSM list and pads/truncates the
        list when it does not match a reference length.
    format_ssm_as_dialogue(
            ssm: List[Dict],
            print_output: bool = False,
            return_dict: bool = False
        ) -> Union[str, Dict[str, List[str]]]:
        Formats sentence-speaker mappings into a readable dialogue format and
        optionally prints it or returns a dictionary grouped by speakers.
    """

    @staticmethod
    def add_indices_to_ssm(ssm: List[Dict], reference_length: Union[int, None] = None) -> List[Dict]:
        """
        Add an ``"index"`` key to each SSM item and optionally force the list
        to a reference length.

        Parameters
        ----------
        ssm : List[Dict]
            The SSM data. Each item is mutated in place to receive its
            position under the ``"index"`` key.
        reference_length : int, optional
            Target length for the returned list, default is None (no length
            adjustment).

        Returns
        -------
        List[Dict]
            The SSM data with index keys. When ``reference_length`` is given:
            a longer list is truncated (a new list is returned), a shorter
            list is padded in place with placeholder entries whose speaker is
            ``"Unknown"`` and whose text is ``"[Placeholder]"``.

        Warns
        -----
        UserWarning
            If ``reference_length`` is given and differs from ``len(ssm)``.
        """
        if reference_length is not None and len(ssm) != reference_length:
            warnings.warn(
                f"Mismatch: SSM Length = {len(ssm)}, Reference Length = {reference_length}. "
                f"Adjusting to match lengths...",
                UserWarning,
            )

        # Index every existing item before any padding/truncation.
        for position, item in enumerate(ssm):
            item["index"] = position

        if reference_length is None:
            return ssm

        if len(ssm) > reference_length:
            # Slicing yields a new list; the caller's list keeps its length.
            return ssm[:reference_length]

        # Pad (in place) with placeholders that continue the index sequence.
        for missing_index in range(len(ssm), reference_length):
            ssm.append({
                "index": missing_index,
                "speaker": "Unknown",
                "start_time": None,
                "end_time": None,
                "text": "[Placeholder]"
            })
        return ssm

    @staticmethod
    def format_ssm_as_dialogue(
            ssm: List[Dict],
            print_output: bool = False,
            return_dict: bool = False
    ) -> Union[str, Dict[str, List[str]]]:
        """
        Format the sentence-speaker mapping (ssm) as a dialogue and optionally
        print the result or return it as a dictionary grouped by speakers.

        Sentences are grouped per speaker: speakers appear in order of first
        appearance and each speaker's sentences keep their relative order, so
        the output is NOT interleaved chronologically.

        Parameters
        ----------
        ssm : List[Dict]
            List of sentences; each item must provide ``'speaker'`` and
            ``'text'`` keys. Text is stripped of surrounding whitespace.
        print_output : bool, optional
            Whether to print the formatted dialogue, default is False.
        return_dict : bool, optional
            Whether to return the response as a dictionary grouped by
            speakers, default is False.

        Returns
        -------
        Union[str, Dict[str, List[str]]]
            If `return_dict` is True, a dictionary with speakers as keys and
            lists of their sentences as values. Otherwise the formatted
            dialogue string, one ``"speaker: text"`` entry per sentence,
            separated by blank lines.
        """
        dialogue_dict: Dict[str, List[str]] = {}
        for sentence in ssm:
            # setdefault creates the speaker's list on first sight.
            dialogue_dict.setdefault(sentence['speaker'], []).append(sentence['text'].strip())

        if print_output:
            print("Formatted Dialogue:")
            for speaker, texts in dialogue_dict.items():
                for text in texts:
                    print(f"{speaker}: {text}")
                # Blank line after each speaker's group.
                print()

        if return_dict:
            return dialogue_dict

        return "\n\n".join(
            f"{speaker}: {text}"
            for speaker, texts in dialogue_dict.items()
            for text in texts
        )
if __name__ == "__main__":
    # Small smoke-test / usage demo for the two utility classes above.

    # noinspection PyMissingOrEmptyDocstring
    class DummyTokenizer:
        # Static so that the instance call `tokenizer.get_vocab()` made by
        # TokenizerUtils does not pass an unexpected `self` argument.
        @staticmethod
        def get_vocab():
            return {
                "hello": 1,
                "world": 2,
                "100%": 3,
                "$value": 4,
                "item_123": 5,
                "£price": 6
            }

    dummy_tokenizer = DummyTokenizer()
    numeral_tokens = TokenizerUtils.find_numeral_symbol_tokens(dummy_tokenizer)
    print(f"Numeral and symbol tokens: {numeral_tokens}")

    speaker_sentence_mapping = [
        {"speaker": "Speaker 1", "text": "Hello, how are you?"},
        {"speaker": "Speaker 2", "text": "I'm fine, thank you! And you?"},
        {"speaker": "Speaker 1", "text": "I'm doing great, thanks for asking."}
    ]
    # print_output=True prints the dialogue once inside the formatter; the
    # returned string is then printed again below to show the return value.
    formatted_dialogue_str = Formatter.format_ssm_as_dialogue(speaker_sentence_mapping, print_output=True)
    print(f"Formatted Dialogue:\n{formatted_dialogue_str}")