|
import re |
|
import json |
|
|
|
# Gender term lexicons, loaded once at import time. The path is relative to
# the current working directory. The keys "male_lexicons" and
# "female_lexicons" are read from this mapping by eval_gender_divide.
# A context manager is used so the file handle is closed right after parsing.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as _lexicon_file:
    gender_lexicons = json.load(_lexicon_file)
|
|
|
|
|
def count_male_terms(text, male_terms):
    """Count whole-term occurrences of any male term in ``text``.

    Args:
        text: Value to search; it is converted with ``str()`` before matching.
        male_terms: Iterable of literal terms. ``None`` or an empty
            collection yields a count of 0.

    Returns:
        int: Number of non-overlapping matches. Matching is case-sensitive.
    """
    # Drop empty entries: an empty alternative would match at every position.
    terms = [term for term in (male_terms or []) if term]
    if not terms:
        return 0

    # Escape each term so characters such as "." are matched literally
    # (otherwise "mr." would also match "mrs").
    alternatives = "|".join(re.escape(term) for term in terms)

    # Lookarounds act like \b for ordinary words but also work when a term
    # starts or ends with punctuation (e.g. "mr." followed by a space).
    pattern = r"(?<!\w)(?:{})(?!\w)".format(alternatives)
    return len(re.findall(pattern, str(text)))
|
|
|
|
|
def count_female_terms(text, female_terms):
    """Count whole-term occurrences of any female term in ``text``.

    Args:
        text: Value to search; it is converted with ``str()`` before matching.
        female_terms: Iterable of literal terms. ``None`` or an empty
            collection yields a count of 0.

    Returns:
        int: Number of non-overlapping matches. Matching is case-sensitive.
    """
    # Drop empty entries: an empty alternative would match at every position.
    terms = [term for term in (female_terms or []) if term]
    if not terms:
        return 0

    # Escape each term so characters such as "." are matched literally
    # (otherwise "ms." would also match "mss").
    alternatives = "|".join(re.escape(term) for term in terms)

    # Lookarounds act like \b for ordinary words but also work when a term
    # starts or ends with punctuation (e.g. "ms." followed by a space).
    pattern = r"(?<!\w)(?:{})(?!\w)".format(alternatives)
    return len(re.findall(pattern, str(text)))
|
|
|
|
|
def get_gender_tag(count_m_term, count_f_term):
    """Classify a pair of term counts into a gender category label.

    Args:
        count_m_term: Number of male terms found.
        count_f_term: Number of female terms found.

    Returns:
        str: "No Gender" when both counts are zero, "Equal Gender" when they
        are equal, otherwise a "Positive" (50% up to but excluding 75%) or
        "Strongly Positive" (75% and above) label for the side with the
        larger count. An empty string is returned if no rule applies.
    """
    if count_m_term == 0 and count_f_term == 0:
        return "No Gender"
    if count_m_term == count_f_term:
        return "Equal Gender"

    # Pick the dominant side together with its two possible labels.
    if count_m_term > count_f_term:
        dominant = count_m_term
        positive, strongly = (
            "Male Positive Gender",
            "Male Strongly Positive Gender",
        )
    elif count_m_term < count_f_term:
        dominant = count_f_term
        positive, strongly = (
            "Female Positive Gender",
            "Female Strongly Positive Gender",
        )
    else:
        # Counts that cannot be ordered fall through without a label.
        return ""

    # Share of the dominant side among all gendered terms, as a percentage.
    share = (dominant / (count_m_term + count_f_term)) * 100
    if 50 <= share < 75:
        return positive
    if share >= 75:
        return strongly
    return ""
|
|
|
|
|
def get_pg_spg(sample_df):
    """Summarise how many rows fall into each gender category.

    Args:
        sample_df: DataFrame with a "gender_cat" column holding the category
            labels produced by ``get_gender_tag``.

    Returns:
        dict: Maps "gender", "no gender", "equal gender", "female pg",
        "male pg", "female spg" and "male spg" to their row counts, each
        formatted as a string. "gender" counts every non-missing row whose
        label is not "No Gender".
    """
    # Missing labels are excluded from every count.
    tags = sample_df["gender_cat"].dropna()

    def _count(label):
        # Number of rows carrying exactly this label.
        return int((tags == label).sum())

    no_gender = _count("No Gender")

    return {
        "gender": str(len(tags) - no_gender),
        "no gender": str(no_gender),
        "equal gender": str(_count("Equal Gender")),
        "female pg": str(_count("Female Positive Gender")),
        "male pg": str(_count("Male Positive Gender")),
        # Label must match get_gender_tag's spelling exactly; a misspelt
        # label silently yields a count of zero.
        "female spg": str(_count("Female Strongly Positive Gender")),
        "male spg": str(_count("Male Strongly Positive Gender")),
    }
|
|
|
|
|
def eval_gender_divide(data):
    """Tag each row of ``data`` by gender-term balance and summarise the tags.

    The first column of ``data`` is treated as the text column. It is
    lower-cased and stripped in place, and three columns are added to
    ``data``: "count_male_term", "count_female_term" and "gender_cat".

    Args:
        data: pandas DataFrame whose first column supports the ``.str``
            accessor (assumed to hold the sentences to analyse; confirm
            against callers).

    Returns:
        dict: The category counts produced by ``get_pg_spg``.
    """
    male_terms = gender_lexicons.get("male_lexicons")
    female_terms = gender_lexicons.get("female_lexicons")

    # Resolve the text column once, before any columns are appended.
    text_col = data.columns[0]
    data[text_col] = data[text_col].str.lower().str.strip()

    # Both counts are taken from the text column only, so the male and
    # female counts are computed over exactly the same input.
    data["count_male_term"] = data[text_col].apply(
        lambda text: count_male_terms(text, male_terms)
    )
    data["count_female_term"] = data[text_col].apply(
        lambda text: count_female_terms(text, female_terms)
    )

    data["gender_cat"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_term"], data["count_female_term"]
        )
    ]

    return get_pg_spg(data)
|
|