File size: 4,505 Bytes
e0db39e 0946447 e0db39e d1a2df2 e0db39e 6d2d9db e0db39e d1a2df2 0946447 e0db39e 0946447 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import re
import json
import pandas as pd
import plotly.express as px
import multiprocessing.pool
from spacy.lang.en import English
# Lexicons are read once at import time. The paths are relative to the
# process working directory. Context managers make sure the file handles are
# closed, and the encoding is pinned so the result does not depend on locale.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as _lexicon_file:
    gender_lexicons = json.load(_lexicon_file)
with open("config/profession_lexicons.json", "r", encoding="utf-8") as _lexicon_file:
    profession_lexicons = json.load(_lexicon_file)

# English pipeline with the "sentencizer" component added; get_split_text
# relies on it to obtain sentence boundaries.
nlp = English()
nlp.add_pipe("sentencizer")
def call_multiprocessing_pool(df_text):
    """Run the gender/profession matcher over every text and tabulate the rows.

    Args:
        df_text: Iterable of texts (``eval_gender_profession`` passes a pandas
            Series). Each item is handed to ``get_gender_prof_match_details``.

    Returns:
        pandas.DataFrame with one row per result tuple and the columns
        "Split Text", "Male Pronoun", "Female Pronoun", "Profession" and
        "Both Match".
    """
    max_workers = 2000
    texts = list(df_text)
    # Never start more threads than there are texts, and never fewer than one:
    # ThreadPool refuses a worker count below one.
    worker_count = max(1, min(max_workers, len(texts)))

    # The context manager tears the pool down even when a worker raises.
    # map() blocks until all results are available, so nothing is lost on exit.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        per_text_rows = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    # Each text produced a list of sentence rows; flatten into one row list.
    flat_rows = [row for rows in per_text_rows for row in rows]
    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_rows, columns=cols)
def get_split_text(text):
    """Return the sentences of *text* as a list.

    The text is run through the module-level ``nlp`` pipeline and the items
    of the resulting document's ``sents`` are collected into a list.
    """
    return list(nlp(text).sents)
def compile_regex_patterns(patterns):
    """Compile one case-insensitive, whole-term regex per lexicon.

    Args:
        patterns: Iterable of lexicons, each an iterable of term strings.
            Terms are matched as literal text, not as regex fragments.

    Returns:
        List of compiled patterns, one per lexicon and in the same order.
        A lexicon without any non-empty term yields a pattern that never
        matches.
    """
    compiled = []
    for terms in patterns:
        # Escape so characters such as "+" or "." inside a term cannot corrupt
        # the pattern; drop empty terms, which would match the empty string.
        escaped_terms = [re.escape(term) for term in terms if term]
        if not escaped_terms:
            # An empty alternation would match "" at every boundary and be
            # counted as a hit; "(?!)" can never match anything.
            compiled.append(re.compile(r"(?!)"))
            continue
        # The lookarounds act like \b for ordinary words but still work for
        # terms that begin or end with a non-word character.
        compiled.append(
            re.compile(
                r"(?<!\w)({})(?!\w)".format("|".join(escaped_terms)),
                flags=re.IGNORECASE,
            )
        )
    return compiled
def get_gender_prof_match_details(df_text):
    """Find pronoun and profession mentions in each sentence of one text.

    Args:
        df_text: Text to analyse; it is split into sentences with
            ``get_split_text``.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_pronouns, female_pronouns, professions, both_match)``.
        The three middle fields are the comma-joined matches (empty string
        when nothing matched). ``both_match`` is "Yes" when the sentence
        contains a profession together with at least one male or female
        pronoun, otherwise "No".
    """
    lexicons = [
        gender_lexicons.get("male_pronouns"),
        gender_lexicons.get("female_pronouns"),
        profession_lexicons.get("professions"),
    ]
    male_re, female_re, profession_re = compile_regex_patterns(lexicons)

    rows = []
    for sentence in get_split_text(df_text):
        sentence_text = str(sentence)
        males = male_re.findall(sentence_text)
        females = female_re.findall(sentence_text)
        jobs = profession_re.findall(sentence_text)

        # A profession must co-occur with a pronoun of either gender.
        has_pair = bool(jobs) and bool(males or females)
        rows.append(
            (
                sentence_text,
                ",".join(males),
                ",".join(females),
                ",".join(jobs),
                "Yes" if has_pair else "No",
            )
        )
    return rows
def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns "Male Pronoun", "Female Pronoun",
            "Profession" and "Both Match".

    Returns:
        Dict mapping each metric name to its count rendered as a string.
    """
    # Boolean masks, one per condition, reused by the combined counts below.
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""
    flagged_both = result["Both Match"] == "Yes"

    counts = [
        ("both_gender_prof_match", flagged_both.sum()),
        ("count_male_pronoun", has_male.sum()),
        ("count_female_pronoun", has_female.sum()),
        ("count_male_pronoun_profession", (has_male & has_profession).sum()),
        ("count_female_pronoun_profession", (has_female & has_profession).sum()),
        ("total_sentence", len(result)),
    ]
    return {metric: str(value) for metric, value in counts}
def get_plot(result_json):
    """Build a pie chart from the statistics dict.

    Args:
        result_json: Mapping with the keys "both_gender_prof_match",
            "count_male_pronoun", "count_female_pronoun",
            "count_male_pronoun_profession" and
            "count_female_pronoun_profession"; each value must be
            convertible with ``int``.

    Returns:
        The figure produced by ``px.pie`` with one slice per metric.
    """
    # (slice label, statistics key) pairs, in the order the slices are listed.
    slices = [
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    ]
    chart_data = {
        "Labels": [label for label, _ in slices],
        "Values": [int(result_json[key]) for _, key in slices],
    }
    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
def eval_gender_profession(data):
    """Evaluate gender/profession co-occurrence for the first column of *data*.

    Args:
        data: pandas DataFrame whose first column holds the texts to analyse.

    Returns:
        Tuple ``(result_df, result_plot, result_conclusion)``: a DataFrame
        with "Metric" and "Value" columns built from ``get_statistics``, the
        figure from ``get_plot``, and a conclusion string (currently always
        empty).
    """
    first_column = data[data.columns[0]]
    # Missing cells would otherwise reach the sentence splitter as NaN, and
    # non-string cells come out of ``.str`` as NaN too. Drop nulls and coerce
    # the rest to str before normalising case and whitespace.
    texts = first_column.dropna().astype(str).str.lower().str.strip()

    result = call_multiprocessing_pool(texts)
    result_json = get_statistics(result)
    result_plot = get_plot(result_json)

    # Turn the {metric: value} dict into a two-column table.
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    result_conclusion = ""
    return result_df, result_plot, result_conclusion
|