import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import torch

def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens
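
# --- Illustrative setup (not part of the original script) ---
# preprocess_text relies on the NLTK "punkt", "stopwords", and "wordnet" resources;
# a minimal one-time download sketch, assuming network access:
def _download_nltk_resources():
    import nltk
    for resource in ("punkt", "stopwords", "wordnet"):
        nltk.download(resource)
# Example (hypothetical input): preprocess_text("The cats are sitting on the mats.")
# should then return lemmatized content words such as ["cat", "sitting", "mat"].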

def get_special_chars():
    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI

    main_special_characters = string.punctuation + string.digits + string.whitespace
    other_special_characters = (
        " ’“”–ー一▬…✦�£•€«»°·═"
        "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰ ‑≤≥‖"
        "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
        "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
        "」﴾》"
    )
    emoji_chars = list(emoji.UNICODE_EMOJI["en"].keys())  # named to avoid shadowing the emoji module
    special_characters_default = set(main_special_characters + other_special_characters)
    special_characters_default.update(emoji_chars)
    return special_characters_default


special_characters_default = get_special_chars()
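
# --- Illustrative usage (not part of the original script) ---
# The resulting set mixes ASCII punctuation, digits, whitespace, the extra symbols
# above, and emoji characters; e.g. both membership checks below should hold,
# assuming emoji==1.6.1 as noted in get_special_chars:
# assert "!" in special_characters_default
# assert "😀" in special_characters_default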

# -------------------- Features --------------------
def syllable_count(word, d):
    # One syllable count per CMU-dict pronunciation: vowel phonemes end in a stress digit.
    return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]


def estimated_slightly_difficult_words_ratio(text, d):
    words = word_tokenize(text.lower())
    total_words = len(words)
    # Considering words with 3 or more syllables as difficult
    # (using the maximum syllable count across the word's pronunciations)
    difficult_count = sum(
        1 for word in words if max(syllable_count(word, d), default=0) >= 3
    )
    return difficult_count / total_words if total_words > 0 else 0
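
# --- Illustrative helper (not part of the original script) ---
# The pronunciation dictionary `d` is never built above; a minimal sketch, assuming
# the NLTK CMU Pronouncing Dictionary is installed (nltk.download("cmudict")):
def _example_difficult_words_ratio(text):
    from nltk.corpus import cmudict  # hypothetical choice for `d`
    d = cmudict.dict()  # maps a word to a list of phoneme transcriptions
    return estimated_slightly_difficult_words_ratio(text, d)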

# -------------------- Features --------------------
def entity_density(text, nlp):
    doc = nlp(text)
    # Guard against empty documents, as the other features do
    return len(doc.ents) / len(doc) if len(doc) else 0

# -------------------- Features --------------------
def determiners_frequency(text, nlp):
    doc = nlp(text)
    determiners = sum(1 for token in doc if token.pos_ == "DET")
    total_words = len(doc)
    return determiners / total_words if total_words else 0
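
# --- Illustrative helper (not part of the original script) ---
# entity_density and determiners_frequency expect a spaCy pipeline as `nlp`; a minimal
# sketch, assuming the small English model is installed
# (python -m spacy download en_core_web_sm):
def _example_spacy_features(text):
    import spacy
    nlp = spacy.load("en_core_web_sm")
    return {
        "entity_density": entity_density(text, nlp),
        "determiners_frequency": determiners_frequency(text, nlp),
    }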

# -------------------- Features --------------------
def punctuation_diversity(text):
    punctuation_counts = Counter(
        char for char in text if char in special_characters_default
    )
    diversity_score = (
        len(punctuation_counts) / len(special_characters_default)
        if special_characters_default
        else 0
    )
    return diversity_score

# -------------------- Features --------------------
def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    unique_words = set(tokens)
    return len(unique_words) / len(tokens) if tokens else 0

# -------------------- Features --------------------
def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
    # Same preprocessing as the other lexical features
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    freq_dist = FreqDist(tokens)
    hapaxes = freq_dist.hapaxes()  # words that occur exactly once
    return len(hapaxes) / len(tokens) if tokens else 0

# -------------------- Features --------------------
def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)

    def mtld_calc(direction):
        token_length, factor_count = 0, 0
        types = set()
        for token in tokens if direction == "forward" else reversed(tokens):
            types.add(token)
            token_length += 1
            if len(types) / token_length < threshold:
                factor_count += 1
                types = set()
                token_length = 0
        factor_count += 1  # For the last segment, even if it didn't reach the threshold
        return len(tokens) / factor_count if factor_count != 0 else 0

    return (mtld_calc("forward") + mtld_calc("backward")) / 2
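
# --- Illustrative usage (not part of the original script) ---
# MTLD counts how many segments ("factors") it takes for the running type-token
# ratio to drop below the 0.72 threshold, then averages the forward and backward
# passes; larger scores indicate richer vocabulary. E.g. (hypothetical input):
# score = mtld("Some reasonably long passage of text ...")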

# -------------------- Features --------------------
def calculate_max_depth(sent):
    return max(len(list(token.ancestors)) for token in sent)


def calculate_syntactic_tree_depth(text, nlp):
    doc = nlp(text)
    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
    average_depth = (
        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
    )
    return average_depth
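
# --- Illustrative usage (not part of the original script) ---
# Depth here is the ancestor count of the deepest token in each sentence's
# dependency parse, averaged over sentences. Reusing the spaCy sketch above
# (assumes en_core_web_sm is installed):
# nlp = spacy.load("en_core_web_sm")
# depth = calculate_syntactic_tree_depth("The cat sat on the mat.", nlp)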

# -------------------- Features --------------------
def calculate_perplexity(text, model, tokenizer, device, stride=512):
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it
            # internally shifts the labels to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl.item()
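
# --- Illustrative helper (not part of the original script) ---
# calculate_perplexity is written for a GPT-2-style causal LM (it reads
# model.config.n_positions). A minimal sketch with Hugging Face transformers,
# assuming the "gpt2" checkpoint as the scoring model:
def _example_perplexity(text):
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    model.eval()
    return calculate_perplexity(text, model, tokenizer, device)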