Issue 1696199: Add collections.Counter().

This commit is contained in:
Raymond Hettinger 2009-01-12 22:58:41 +00:00
parent 99234c5c74
commit f94d7fa5fb
4 changed files with 430 additions and 2 deletions

View File

@ -152,6 +152,134 @@ Notes on using :class:`Set` and :class:`MutableSet` as a mixin:
(For more about ABCs, see the :mod:`abc` module and :pep:`3119`.)
.. _counter-objects:
:class:`Counter` objects
------------------------
A counter tool is provided to support convenient and rapid tallies.
For example::
# Tally repeated words in a list
>>> words = ['red', 'blue', 'red', 'green', 'blue', blue']
>>> cnt = Counter()
>>> for word in words:
... cnt[word] += 1
>>> cnt
Counter(items=[('blue', 3), ('red', 2), ('green', 1)])
# Find the ten most common words in Hamlet
>>> import re
>>> words = re.findall('\w+', open('hamlet.txt').read().lower())
>>> Counter(hamlet_words).most_common(10)
[('the', 1143), ('and', 966), ('to', 762), ('of', 669), ('i', 631),
('you', 554), ('a', 546), ('my', 514), ('hamlet', 471), ('in', 451)]
.. class:: Counter([iterable[, items]])
A :class:`Counter` is a :class:`dict` subclass for counting hashable items.
Elements are stored as dictionary keys and their counts are stored as
dictionary values. Counts are allowed to be any integer value including
zero or negative counts. The :class:`Counter` class is similar to bags
or multisets in other languages.
Elements are counted from the *iterable* if given. Also, the counts
can be initialized from an *items* list of *(element, count)* pairs.
If provided, *items* must be a keyword argument::
>>> c = Counter() # a new, empty counter
>>> c = Counter('gallahad') # a new counter from an iterable
>>> c = Counter(items=[('a', 4), ('b', 2)]) # a new counter from an items list
The returned object has a dictionary style interface except that it returns
a zero count for missing items (instead of raising a :exc:`KeyError` like a
dictionary would)::
>>> c = Counter(['if', 'your', 'peril', 'be'])
>>> c['questions'] # count of a missing element is zero
0
Assigning a count of zero or reducing the count to zero leaves the
element in the dictionary. Use ``del`` to remove the entry entirely:
>>> c = Counter(['arthur', 'gwain'])
>>> c['arthur'] = 0 # set the count of "arthur" to zero
>>> 'arthur' in c # but "arthur" is still in the counter
True
>>> del c['arthur'] # del will completely remove the entry
>>> 'arthur' in c
False
.. versionadded:: 2.7
Counter objects support two methods beyond those available for all
dictionaries:
.. method:: elements()
Return an iterator over elements repeating each as many times as its count.
Elements are returned in arbitrary order. If an element's count has been
set to zero or a negative number, :meth:`elements` will ignore it.
>>> c = Counter(items=[('a', 4), ('b', 2), ('d', 0), ('e', -2)])
>>> list(c.elements())
['a', 'a', 'a', 'a', 'b', 'b']
.. method:: most_common([n])
Return a list of the *n* most common elements and their counts from
the most common to the least. If *n* is not specified or is ``None``,
return a list of all element counts in decreasing order of frequency.
Elements with equal counts are ordered arbitrarily::
>>> Counter('abracadabra').most_common(3)
[('a', 5), ('r', 2), ('b', 2)]
The usual dictionary methods are available for :class:`Counter` objects.
All of those work the same as they do for dictionaries except for two
which work differently for counters.
.. method:: fromkeys(iterable)
There is no equivalent class method for :class:`Counter` objects.
Raises a :exc:`NotImplementedError` when called.
.. method:: update(mapping)
Like :meth:`dict.update` but adds-in counts instead of replacing them.
Used for combining two independent counts. Accepts a *mapping* object
which can be another counter or can be a :class:`dict` that maps
elements to element counts::
>>> c = Counter('which') # count letters in a word
>>> d = Counter('witch') # count letters in another word
>>> c.update(d) # add counts from d to those in c
>>> c['h'] # count of 'h' is now three
3
>>> c.update(Counter('watch')) # add in letters from another word
>>> c['h'] # count of 'h' is now four
4
.. seealso::
`Multisets <http://en.wikipedia.org/wiki/Multiset>`_
in the Wikipedia
`Bag <http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html>`_
a Smalltalk class
`C++ multisets <http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm>`_
a tutorial with standalone examples
`Bag class <http://code.activestate.com/recipes/259174/>`_
an early Python recipe
Use cases for multisets and mathematical operations on multisets.
Knuth, Donald. The Art of Computer Programming Volume II,
Section 4.6.3, Exercise 19.
.. _deque-objects:

View File

@ -9,6 +9,11 @@ from _collections import deque, defaultdict
from operator import itemgetter as _itemgetter
from keyword import iskeyword as _iskeyword
import sys as _sys
import heapq as _heapq
import itertools as _itertools
########################################################################
### namedtuple #######################################################
def namedtuple(typename, field_names, verbose=False):
"""Returns a new subclass of tuple with named fields.
@ -108,7 +113,160 @@ def namedtuple(typename, field_names, verbose=False):
return result
########################################################################
### Counter ##########################################################
class Counter(dict):
'''Dict subclass for counting hashable items. Sometimes called a bag
or multiset. Elements are stored as dictionary keys and their counts
are stored as dictionary values.
>>> c = Counter('abracadabra') # count elements from a string
>>> c.most_common(3) # three most common elements
[('a', 5), ('r', 2), ('b', 2)]
>>> sorted(c) # list all unique elements
['a', 'b', 'c', 'd', 'r']
>>> ''.join(sorted(c.elements())) # list elements with repetitions
'aaaaabbcdrr'
>>> sum(c.values()) # total of all counts
11
>>> c['a'] # count of letter 'a'
5
>>> for elem in 'shazam': # update counts from an iterable
... c[elem] += 1 # by adding 1 to each element's count
>>> c['a'] # now there are seven 'a'
7
>>> del c['r'] # remove all 'r'
>>> c['r'] # now there are zero 'r'
0
>>> d = Counter('simsalabim') # make another counter
>>> c.update(d) # add in the second counter
>>> c['a'] # now there are nine 'a'
9
>>> c.clear() # empty the counter
>>> c
Counter()
Note: If a count is set to zero or reduced to zero, it will remain
in the counter until the entry is deleted or the counter is cleared:
>>> c = Counter('aaabbc')
>>> c['b'] -= 2 # reduce the count of 'b' by two
>>> c.most_common() # 'b' is still in, but its count is zero
[('a', 3), ('c', 1), ('b', 0)]
'''
# References:
# http://en.wikipedia.org/wiki/Multiset
# http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html
# http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm
# http://code.activestate.com/recipes/259174/
# Knuth, TAOCP Vol. II section 4.6.3
def __init__(self, iterable=None, items=None):
'''Create a new, empty Counter object. And if given, count elements
from an input iterable. Or, initialize the count from an items list
of (element, count) pairs.
>>> c = Counter('hocus pocus') # count elements in an iterable
>>> c = Counter(items=[('a', 4), ('b', 2)]) # take counts from an items list
'''
if iterable is not None:
for elem in iterable:
self[elem] += 1
if items is not None:
for elem, count in items:
self[elem] += count
def __missing__(self, key):
'The count of elements not in the Counter is zero.'
# Needed so that self[missing_item] does not raise KeyError
return 0
def most_common(self, n=None):
'''List the n most common elements and their counts from the most
common to the least. If n is None, then list all element counts.
>>> Counter('abracadabra').most_common(3)
[('a', 5), ('r', 2), ('b', 2)]
'''
# Emulate Bag.sortedByCount from Smalltalk
if n is None:
return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
def elements(self):
'''Iterator over elements repeating each as many times as its count.
>>> c = Counter('ABCABC')
>>> sorted(c.elements())
['A', 'A', 'B', 'B', 'C', 'C']
# Knuth's example of prime factors of 1836: 2**2 * 3**3 * 17**1
>>> import operator
>>> prime_factors = Counter(items=[(2,2), (3,3), (17,1)])
>>> sorted(prime_factors.elements()) # list individual factors
[2, 2, 3, 3, 3, 17]
>>> reduce(operator.mul, prime_factors.elements(), 1) # multiply them
1836
Note, if an element's count has been set to zero or a negative number,
elements() will ignore it.
'''
# Emulate Bag.do from Smalltalk and Multiset.begin from C++.
return _itertools.chain.from_iterable(
_itertools.starmap(_itertools.repeat,
self.iteritems()))
# Override dict methods where necessary
@classmethod
def fromkeys(cls, iterable, v=None):
# There is no equivalent method for counters because setting v=1
# means that no element can have a count greater than one.
raise NotImplementedError(
'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
def update(self, mapping):
'''Like dict.update() but add counts instead of replacing them.
Source can be another dictionary or a Counter.instance().
>>> c = Counter('which')
>>> d = Counter('witch')
>>> c.update(d) # Add counts from d to those in c
>>> c['h'] # Count of 'h' is now three
3
'''
# The regular dict.update() operation makes no sense here because the
# replace behavior results in the some of original untouched counts
# being mixed-in with all of the other counts for a mismash that
# doesn't have a straight-forward interpretation in most counting
# contexts. Instead, we look to Knuth for suggested operations on
# multisets and implement the union-add operation discussed in
# TAOCP Volume II section 4.6.3 exercise 19. The Wikipedia entry for
# multisets calls that operation a sum or join.
for elem, count in mapping.iteritems():
self[elem] += count
def copy(self):
'Like dict.copy() but returns a Counter instance instead of a dict.'
c = Counter()
c.update(self)
return c
def __repr__(self):
if not self:
return '%s()' % self.__class__.__name__
return '%s(items=%r)' % (self.__class__.__name__, self.most_common())
@ -143,6 +301,49 @@ if __name__ == '__main__':
Point3D = namedtuple('Point3D', Point._fields + ('z',))
print Point3D.__doc__
# Check that counters are copyable, deepcopyable, picklable, and have a
# repr/eval round-trip
import copy
words = Counter('which witch had which witches wrist watch'.split())
update_test = Counter()
update_test.update(words)
for i, dup in enumerate([
words.copy(),
copy.copy(words),
copy.deepcopy(words),
loads(dumps(words, 0)),
loads(dumps(words, 1)),
loads(dumps(words, 2)),
loads(dumps(words, -1)),
eval(repr(words)),
update_test,
]):
msg = (i, dup, words)
assert dup is not words, msg
assert dup == words, msg
assert len(dup) == len(words), msg
assert type(dup) == type(words), msg
# Verify that counters are unhashable
try:
hash(words)
except TypeError:
pass
else:
print 'Failed hashing test'
# Verify that Counter.fromkeys() is disabled
try:
Counter.fromkeys('razmataz')
except NotImplementedError:
pass
else:
print 'Failed fromkeys() test'
# Check ABCs
assert issubclass(Counter, Mapping)
assert isinstance(Counter('asdfasdf'), Mapping)
import doctest
TestResults = namedtuple('TestResults', 'failed attempted')
print TestResults(*doctest.testmod())

View File

@ -1,6 +1,6 @@
import unittest, doctest
from test import test_support
from collections import namedtuple
from collections import namedtuple, Counter, Mapping
import pickle, cPickle, copy
from collections import Hashable, Iterable, Iterator
from collections import Sized, Container, Callable
@ -346,11 +346,107 @@ class TestCollectionABCs(unittest.TestCase):
self.failUnless(issubclass(sample, MutableSequence))
self.failIf(issubclass(basestring, MutableSequence))
class TestCounter(unittest.TestCase):
def test_basics(self):
c = Counter('abcaba')
self.assert_(isinstance(c, dict))
self.assert_(isinstance(c, Mapping))
self.assert_(issubclass(Counter, dict))
self.assert_(issubclass(Counter, Mapping))
self.assertEqual(len(c), 3)
self.assertEqual(sum(c.values()), 6)
self.assertEqual(sorted(c.values()), [1, 2, 3])
self.assertEqual(sorted(c.keys()), ['a', 'b', 'c'])
self.assertEqual(sorted(c), ['a', 'b', 'c'])
self.assertEqual(sorted(c.items()),
[('a', 3), ('b', 2), ('c', 1)])
self.assertEqual(c['b'], 2)
self.assertEqual(c['z'], 0)
self.assertEqual(c.has_key('c'), True)
self.assertEqual(c.has_key('z'), False)
self.assertEqual(c.__contains__('c'), True)
self.assertEqual(c.__contains__('z'), False)
self.assertEqual(c.get('b', 10), 2)
self.assertEqual(c.get('z', 10), 10)
self.assertEqual(c, dict(a=3, b=2, c=1))
self.assertEqual(repr(c),
"Counter(items=[('a', 3), ('b', 2), ('c', 1)])")
self.assertEqual(c.most_common(), [('a', 3), ('b', 2), ('c', 1)])
for i in range(5):
self.assertEqual(c.most_common(i),
[('a', 3), ('b', 2), ('c', 1)][:i])
self.assertEqual(''.join(sorted(c.elements())), 'aaabbc')
c['a'] += 1 # increment an existing value
c['b'] -= 2 # sub existing value to zero
del c['c'] # remove an entry
c['d'] -= 2 # sub from a missing value
c['e'] = -5 # directly assign a missing value
c['f'] += 4 # add to a missing value
self.assertEqual(c, dict(a=4, b=0, d=-2, e=-5, f=4))
self.assertEqual(''.join(sorted(c.elements())), 'aaaaffff')
self.assertEqual(c.pop('f'), 4)
self.assertEqual('f' in c, False)
for i in range(3):
elem, cnt = c.popitem()
self.assertEqual(elem in c, False)
c.clear()
self.assertEqual(c, {})
self.assertEqual(repr(c), 'Counter()')
self.assertRaises(NotImplementedError, Counter.fromkeys, 'abc')
self.assertRaises(TypeError, hash, c)
c.update(dict(a=5, b=3, c=1))
c.update(Counter(items=[('a', 50), ('b', 30)]))
c.__init__(items=[('a', 500), ('b', 300)])
c.__init__('cdc')
self.assertEqual(c, dict(a=555, b=333, c=3, d=1))
self.assertEqual(c.setdefault('d', 5), 1)
self.assertEqual(c['d'], 1)
self.assertEqual(c.setdefault('e', 5), 5)
self.assertEqual(c['e'], 5)
def test_copying(self):
# Check that counters are copyable, deepcopyable, picklable, and
#have a repr/eval round-trip
words = Counter('which witch had which witches wrist watch'.split())
update_test = Counter()
update_test.update(words)
for i, dup in enumerate([
words.copy(),
copy.copy(words),
copy.deepcopy(words),
pickle.loads(pickle.dumps(words, 0)),
pickle.loads(pickle.dumps(words, 1)),
pickle.loads(pickle.dumps(words, 2)),
pickle.loads(pickle.dumps(words, -1)),
cPickle.loads(cPickle.dumps(words, 0)),
cPickle.loads(cPickle.dumps(words, 1)),
cPickle.loads(cPickle.dumps(words, 2)),
cPickle.loads(cPickle.dumps(words, -1)),
eval(repr(words)),
update_test,
]):
msg = (i, dup, words)
self.assert_(dup is not words)
self.assertEquals(dup, words)
self.assertEquals(len(dup), len(words))
self.assertEquals(type(dup), type(words))
def test_conversions(self):
# Convert to: set, list, dict
s = 'she sells sea shells by the sea shore'
self.assertEqual(sorted(Counter(s).elements()), sorted(s))
self.assertEqual(sorted(Counter(s)), sorted(set(s)))
self.assertEqual(dict(Counter(s)), dict(Counter(s).items()))
self.assertEqual(set(Counter(s)), set(s))
import doctest, collections
def test_main(verbose=None):
NamedTupleDocs = doctest.DocTestSuite(module=collections)
test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs, TestCollectionABCs]
test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs,
TestCollectionABCs, TestCounter]
test_support.run_unittest(*test_classes)
test_support.run_doctest(collections, verbose)

View File

@ -128,6 +128,9 @@ Core and Builtins
Library
-------
- Issue #1696199: Add collections.Counter() for rapid and convenient
counting.
- Issue #3860: GzipFile and BZ2File now support the context manager protocol.
- Issue #4272: Add an optional argument to the GzipFile constructor to override