w3hello.com logo
Home PHP C# C++ Android Java Javascript Python IOS SQL HTML videos Categories
NLTK Brown Corpus Tags

Use a defaultdict(Counter) to keep track of each word and the POS tags it appears with. Then pick the word whose Counter is longest, i.e. the word with the most distinct tags:

from collections import defaultdict, Counter
from nltk.corpus import brown

# Build a mapping from each word to a Counter of its POS tags:
# the key is a word and the value counts how many times each tag
# was paired with that word in brown.tagged_words().
word_tags = defaultdict(Counter)
for word, pos in brown.tagged_words():
    word_tags[word][pos] += 1

# Look up the POS counter of individual words.
# NOTE: print is always called with one pre-formatted argument so the
# snippet produces the same text on Python 2 and also runs on Python 3.
print('Red %s' % (word_tags['Red'],))
print('Marlowe %s' % (word_tags['Marlowe'],))
print('')

# Word with the greatest number of distinct tags. max() finds it in a
# single pass instead of sorting every word; when several words tie it
# returns the first one in iteration order, exactly as the stable
# sorted(..., reverse=True)[0] would.
word_with_most_distinct_pos = max(word_tags, key=lambda w: len(word_tags[w]))

print(word_with_most_distinct_pos)
print(word_tags[word_with_most_distinct_pos])
print(len(word_tags[word_with_most_distinct_pos]))

[out]:

Red Counter({u'JJ-TL': 49, u'NP': 21, u'JJ': 3, u'NN-TL': 1,
u'JJ-TL-HL': 1})
Marlowe Counter({u'NP': 4})

that
Counter({u'CS': 6419, u'DT': 1975, u'WPS': 1638, u'WPO': 135, u'QL': 54,
u'DT-NC': 6, u'WPS-NC': 3, u'CS-NC': 2, u'WPS-HL': 2, u'NIL': 1, u'CS-HL':
1, u'WPO-NC': 1})
12

To get the words that have a given number of distinct POS tags:

def _words_with_distinct_pos(tag_counts, n):
    """Return the words in *tag_counts* that have exactly *n* distinct tags.

    *tag_counts* maps each word to a Counter of its POS tags, so the length
    of a word's Counter is its number of distinct tags. A list is returned
    (rather than a lazy filter object) so the result can be iterated more
    than once on any Python version.
    """
    return [word for word in tag_counts if len(tag_counts[word]) == n]


# Words with 8 distinct POS
word_with_eight_pos = _words_with_distinct_pos(word_tags, 8)

# print gets one pre-formatted argument so the output is the same text on
# Python 2 and the code also runs on Python 3.
for i in word_with_eight_pos:
    print('%s %s' % (i, word_tags[i]))
print('')

# Words with 9 distinct POS
word_with_nine_pos = _words_with_distinct_pos(word_tags, 9)

for i in word_with_nine_pos:
    print('%s %s' % (i, word_tags[i]))

[out]:

a Counter({u'AT': 21824, u'AT-HL': 40, u'AT-NC': 7, u'FW-IN': 4,
u'NIL': 3, u'FW-IN-TL': 1, u'AT-TL': 1, u'NN': 1})

: Counter({u':': 1558, u':-HL': 138, u'.': 46, u':-TL': 22, u'IN': 20,
u'.-HL': 8, u'NIL': 1, u',': 1, u'NP': 1})




© Copyright 2018 w3hello.com Publishing Limited. All rights reserved.