Spaces:
Runtime error
Runtime error
| import yaml | |
| import subprocess | |
| import nltk | |
| from nltk import word_tokenize | |
| from nltk.corpus import cmudict, stopwords | |
| import spacy | |
| import torch | |
| from transformers import GPT2LMHeadModel, GPT2TokenizerFast | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| from matplotlib.patches import Circle, RegularPolygon | |
| from matplotlib.path import Path | |
| from matplotlib.projections import register_projection | |
| from matplotlib.projections.polar import PolarAxes | |
| from matplotlib.spines import Spine | |
| from matplotlib.transforms import Affine2D | |
| from writing_analysis import ( | |
| estimated_slightly_difficult_words_ratio, | |
| entity_density, | |
| determiners_frequency, | |
| punctuation_diversity, | |
| type_token_ratio, | |
| calculate_perplexity, | |
| calculate_syntactic_tree_depth, | |
| hapax_legomena_ratio, | |
| mtld, | |
| ) | |
# ---------------------------------------------------------------------------
# One-time resource setup. Everything below runs at import time and needs
# network access the first time the resources are fetched.
# ---------------------------------------------------------------------------
import sys  # local to this setup block: used to locate the running interpreter

# Fetch the NLTK resources this module relies on ("cmudict" backs the
# pronouncing dictionary loaded just below).
for _resource in ("cmudict", "punkt", "stopwords", "wordnet"):
    nltk.download(_resource)

# CMU pronouncing dictionary; passed to
# estimated_slightly_difficult_words_ratio() in depth_analysis().
d = cmudict.dict()

# Install the spaCy model with the interpreter that is running this module
# rather than whatever "python3" resolves to on PATH, so the package lands in
# the environment `spacy.load` actually searches.
command = [
    sys.executable or "python3",
    "-m",
    "spacy",
    "download",
    "en_core_web_sm",
]
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy raises OSError when the model package is not installed: download
    # it once, fail loudly if the download itself fails, then retry the load.
    subprocess.run(command, check=True)
    nlp = spacy.load("en_core_web_sm")

# Explicit encoding so the config parses the same way on every platform.
with open("config.yaml", "r", encoding="utf-8") as file:
    params = yaml.safe_load(file)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Language model + tokenizer handed to calculate_perplexity(); the checkpoint
# id comes from the READABILITY_MODEL_ID entry of config.yaml.
readability_model_id = params["READABILITY_MODEL_ID"]
gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)
def normalize(value, min_value, max_value):
    """Rescale ``value`` linearly onto a 0-100 scale and clamp the result.

    ``min_value`` maps to 0 and ``max_value`` maps to 100; anything that
    would fall outside that interval is clipped to the nearest bound.

    Args:
        value: The number to rescale.
        min_value: Value that corresponds to 0 on the output scale.
        max_value: Value that corresponds to 100 on the output scale.

    Returns:
        The rescaled number, limited to the range 0..100.

    Raises:
        ZeroDivisionError: If ``min_value`` equals ``max_value``.
    """
    offset = value - min_value
    span = max_value - min_value
    percentage = (offset * 100) / span
    # Cap at the top first, then at the bottom.
    upper_bounded = min(100, percentage)
    return max(0, upper_bounded)
def _radar_factory(num_vars, frame="circle"):
    """Register the ``"radar"`` Matplotlib projection and return its angles.

    Structured after the radar-chart example in the Matplotlib gallery. Every
    call registers a fresh ``RadarAxes`` class under the name ``"radar"``, so
    the most recent ``num_vars``/``frame`` pair is the one used by subsequent
    ``projection="radar"`` axes.

    Args:
        num_vars: Number of spokes (one per plotted variable).
        frame: Outline of the axes, either ``"circle"`` or ``"polygon"``.

    Returns:
        A NumPy array of ``num_vars`` evenly spaced angles in radians,
        starting at 0 and excluding ``2 * pi``.

    Raises:
        ValueError: If ``frame`` is neither ``"circle"`` nor ``"polygon"``.
    """
    if frame not in ("circle", "polygon"):
        raise ValueError(f"Unknown value for 'frame': {frame!r}")

    theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

    class RadarTransform(PolarAxes.PolarTransform):
        def transform_path_non_affine(self, path):
            # Paths with more than one interpolation step are re-interpolated
            # with ``num_vars`` steps; in the Matplotlib gallery example this
            # is what turns circular gridlines into polygons.
            if path._interpolation_steps > 1:
                path = path.interpolated(num_vars)
            return Path(self.transform(path.vertices), path.codes)

    class RadarAxes(PolarAxes):
        name = "radar"
        PolarTransform = RadarTransform

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Put the first spoke at the top of the chart.
            self.set_theta_zero_location("N")

        def fill(self, *args, closed=True, **kwargs):
            """Fill like ``PolarAxes.fill`` but closed by default."""
            return super().fill(*args, closed=closed, **kwargs)

        def plot(self, *args, **kwargs):
            """Plot like ``PolarAxes.plot`` and close every resulting line.

            Returns the list of lines, matching the parent's contract.
            """
            lines = super().plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)
            return lines

        def _close_line(self, line):
            """Append the first point to ``line`` if it is not closed yet."""
            x, y = line.get_data()
            # Guard against empty data before peeking at the end points.
            if len(x) > 0 and x[0] != x[-1]:
                x = np.append(x, x[0])
                y = np.append(y, y[0])
                line.set_data(x, y)

        def set_varlabels(self, labels):
            """Label each spoke with the corresponding entry of ``labels``."""
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            # The patch is centred at (0.5, 0.5) with radius 0.5, i.e. it
            # spans the unit square of axes coordinates.
            if frame == "circle":
                return Circle((0.5, 0.5), 0.5)
            return RegularPolygon(
                (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
            )

        def _gen_axes_spines(self):
            if frame == "circle":
                return super()._gen_axes_spines()
            # spine_type must be "left"/"right"/"top"/"bottom"/"circle".
            spine = Spine(
                axes=self,
                spine_type="circle",
                path=Path.unit_regular_polygon(num_vars),
            )
            # unit_regular_polygon is centred at (0, 0) with radius 1; scale
            # and shift it to be centred at (0.5, 0.5) with radius 0.5 in
            # axes coordinates.
            spine.set_transform(
                Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes
            )
            return {"polar": spine}

    register_projection(RadarAxes)
    return theta


def _compute_depth_features(input_text):
    """Compute the nine writing metrics for ``input_text`` on a 0-100 scale.

    Each raw metric is rescaled with ``normalize`` using the (min, max) pair
    stored under the metric's name in ``usual_ranges``.

    Args:
        input_text: The text to analyse; forwarded unchanged to every metric
            function.

    Returns:
        A dict mapping chart label to normalised score, in the order the
        labels should appear around the radar chart.
    """
    # (min, max) bounds used for normalisation, keyed by metric function name.
    # NOTE(review): these look empirically derived (several lower bounds are
    # negative); their provenance is not visible in this file.
    usual_ranges = {
        "estimated_slightly_difficult_words_ratio": (
            0.2273693623058005,
            0.557383692351033,
        ),
        "entity_density": (-0.07940776754145815, 0.23491038179986615),
        "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
        "punctuation_diversity": (-0.21875, 0.53125),
        "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
        "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
        "calculate_syntactic_tree_depth": (
            1.8380681818181812,
            10.997159090909092,
        ),
        "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
        "mtld": (-84.03125000000001, 248.81875000000002),
    }

    # Raw metric values, keyed the same way as ``usual_ranges``.
    raw_metrics = {
        "estimated_slightly_difficult_words_ratio": (
            estimated_slightly_difficult_words_ratio(input_text, d)
        ),
        "entity_density": entity_density(input_text, nlp),
        "determiners_frequency": determiners_frequency(input_text, nlp),
        "punctuation_diversity": punctuation_diversity(input_text),
        "calculate_syntactic_tree_depth": calculate_syntactic_tree_depth(
            input_text, nlp
        ),
        "calculate_perplexity": calculate_perplexity(
            input_text, gpt2_model, gpt2_tokenizer, device
        ),
        "type_token_ratio": type_token_ratio(input_text),
        "hapax_legomena_ratio": hapax_legomena_ratio(input_text),
        "mtld": mtld(input_text),
    }

    # Chart label -> metric key, listed in display order.
    chart_labels = (
        ("Lexical Diversity", "type_token_ratio"),
        ("Vocabulary Level", "estimated_slightly_difficult_words_ratio"),
        ("Unique Words", "hapax_legomena_ratio"),
        ("Determiner Use", "determiners_frequency"),
        ("Punctuation Variety", "punctuation_diversity"),
        ("Sentence Depth", "calculate_syntactic_tree_depth"),
        ("Vocabulary Stability", "mtld"),
        ("Entity Ratio", "entity_density"),
        ("Perplexity", "calculate_perplexity"),
    )

    # normalize between 0 and 100
    return {
        label: normalize(raw_metrics[key], *usual_ranges[key])
        for label, key in chart_labels
    }


def _plot_depth_radar(features):
    """Draw ``features`` (label -> 0-100 score) as a polygon radar chart.

    Args:
        features: Mapping of chart label to score; iteration order determines
            the order of the spokes.

    Returns:
        The Matplotlib figure holding the chart.
    """
    # Materialise the dict views: Matplotlib/NumPy expect real sequences and
    # do not reliably accept ``dict_keys`` / ``dict_values`` objects.
    labels = list(features)
    values = list(features.values())

    theta = _radar_factory(len(labels), frame="polygon")

    # NOTE(review): pyplot keeps a reference to every figure it creates until
    # it is closed; a caller that invokes this repeatedly should close the
    # figure (plt.close(fig)) once it has been rendered.
    fig, ax = plt.subplots(
        subplot_kw={"projection": "radar"}, figsize=(7.5, 5)
    )
    ax.plot(theta, values)
    ax.fill(theta, values, alpha=0.4)
    ax.set_varlabels(labels)

    # Radial gridlines every 20 points, labelled as percentages.
    rgrids = np.linspace(0, 100, num=6)
    ax.set_rgrids(
        rgrids,
        labels=[f"{round(r)}%" for r in rgrids],
        fontsize=8,
        color="black",
    )
    ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)

    # Print each score next to its vertex, nudged slightly off the line.
    for angle, value in zip(theta, values):
        ax.text(
            angle + 0.1,
            value + 5,
            f"{value:.0f}",
            horizontalalignment="left",
            verticalalignment="bottom",
            fontsize=8,
        )
    return fig


def depth_analysis(input_text):
    """Analyse ``input_text`` and return a radar chart of its writing metrics.

    Nine metrics are computed, each rescaled to 0-100 against a fixed
    (min, max) range, and plotted on a nine-spoke polygon radar chart.

    Args:
        input_text: The text to analyse.

    Returns:
        The Matplotlib figure containing the radar chart.
    """
    features = _compute_depth_features(input_text)
    return _plot_depth_radar(features)