# Labels returned when classifying a sentence by the gendered words it contains.
MALE = 'male'
FEMALE = 'female'
UNKNOWN = 'unknown'  # the sentence is about neither men nor women
BOTH = 'both'  # the sentence contains both male and female words

# Lowercase tokens treated as markers of a male subject.
MALE_WORDS = {
    'guy', 'spokesman', 'chairman', "men's", 'men', 'him', "he's", 'his',
    'boy', 'boyfriend', 'boyfriends', 'boys', 'brother', 'brothers', 'dad',
    'dads', 'dude', 'father', 'fathers', 'fiance', 'gentleman', 'gentlemen',
    'god', 'grandfather', 'grandpa', 'grandson', 'groom', 'he', 'himself',
    'husband', 'husbands', 'king', 'male', 'man', 'mr', 'nephew', 'nephews',
    'priest', 'prince', 'son', 'sons', 'uncle', 'uncles', 'waiter', 'widower',
    'widowers',
}

# Lowercase tokens treated as markers of a female subject.
# NOTE(review): 'queens' is listed but 'queen' is not -- confirm whether
# the singular form was left out on purpose.
FEMALE_WORDS = {
    'heroine', 'spokeswoman', 'chairwoman', "women's", 'actress', 'women',
    "she's", 'her', 'aunt', 'aunts', 'bride', 'daughter', 'daughters', 'female',
    'fiancee', 'girl', 'girlfriend', 'girlfriends', 'girls', 'goddess',
    'granddaughter', 'grandma', 'grandmother', 'herself', 'ladies', 'lady',
    'mom', 'moms', 'mother', 'mothers', 'mrs', 'ms', 'niece', 'nieces',
    'priestess', 'princess', 'queens', 'she', 'sister', 'sisters', 'waitress',
    'widow', 'widows', 'wife', 'wives', 'woman',
}
def genderize(words):
    """Classify a collection of tokens by the gendered words it contains.

    Matching is exact set membership against MALE_WORDS and FEMALE_WORDS.

    Args:
        words: Iterable of hashable tokens (for example, the words of one
            sentence).

    Returns:
        MALE if only male words occur, FEMALE if only female words occur,
        BOTH if words from both sets occur, and UNKNOWN otherwise.
    """
    has_male = bool(MALE_WORDS.intersection(words))
    has_female = bool(FEMALE_WORDS.intersection(words))

    if has_male and has_female:
        return BOTH
    if has_male:
        return MALE
    if has_female:
        return FEMALE
    return UNKNOWN
from collections import Counter
def count_gender(sentences):
    """Tally sentences and tokens per gender label.

    Args:
        sentences: Iterable of tokenized sentences; each sentence must
            support len() and is classified with genderize().

    Returns:
        A (sents, words) pair of Counters keyed by gender label. `sents`
        holds the number of sentences given each label; `words` holds the
        total token count of those sentences, i.e. every token of a
        sentence is attributed to that sentence's label.
    """
    sentence_totals, word_totals = Counter(), Counter()

    for tokens in sentences:
        # Decide whether this sentence is about men, women, both or neither.
        label = genderize(tokens)
        sentence_totals.update({label: 1})
        # All tokens of the sentence are credited to the sentence's label.
        word_totals.update({label: len(tokens)})

    return sentence_totals, word_totals
import nltk
def parse_gender(text):
    """Print, per gender label, the share of words and the sentence count.

    The text is split into sentences and lowercased tokens with NLTK,
    each sentence is classified via count_gender(), and one line is
    printed per label in the form "<pct>% <label> (<n> sentences)",
    where <pct> is that label's share of all tokens with three decimals.

    Args:
        text: The raw text to analyse.

    Returns:
        None. The report is written to standard output.
    """
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        # Guard against a zero token total so the report never raises
        # ZeroDivisionError.
        pcent = (count / total) * 100 if total else 0.0
        nsents = sents[gender]

        # "{:0.3f}" is the float format spec. The previous "{0.3f}" was
        # parsed as attribute "3f" of argument 0 and, mixed with "{}",
        # made str.format raise ValueError.
        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
if __name__ == '__main__':
    # Script entry point: read the sample article and print its gender report.
    with open('ballet.txt', 'r', encoding='utf8') as source:
        contents = source.read()
        parse_gender(contents)
# Example output:
#   39.269% unknown (48 sentences)
#   52.994% female (38 sentences)
#   4.393% both (2 sentences)
#   3.344% male (3 sentences)
# 本文来源于《基于Python的智能文本分析》(Benjamin Bengfort 等著,陈光译)