mirror of https://github.com/python/cpython
Issue 1696199: Add collections.Counter().
This commit is contained in:
parent
99234c5c74
commit
f94d7fa5fb
|
@ -152,6 +152,134 @@ Notes on using :class:`Set` and :class:`MutableSet` as a mixin:
|
|||
(For more about ABCs, see the :mod:`abc` module and :pep:`3119`.)
|
||||
|
||||
|
||||
.. _counter-objects:
|
||||
|
||||
:class:`Counter` objects
|
||||
------------------------
|
||||
|
||||
A counter tool is provided to support convenient and rapid tallies.
|
||||
For example::
|
||||
|
||||
# Tally repeated words in a list
|
||||
>>> words = ['red', 'blue', 'red', 'green', 'blue', blue']
|
||||
>>> cnt = Counter()
|
||||
>>> for word in words:
|
||||
... cnt[word] += 1
|
||||
>>> cnt
|
||||
Counter(items=[('blue', 3), ('red', 2), ('green', 1)])
|
||||
|
||||
# Find the ten most common words in Hamlet
|
||||
>>> import re
|
||||
>>> words = re.findall('\w+', open('hamlet.txt').read().lower())
|
||||
>>> Counter(hamlet_words).most_common(10)
|
||||
[('the', 1143), ('and', 966), ('to', 762), ('of', 669), ('i', 631),
|
||||
('you', 554), ('a', 546), ('my', 514), ('hamlet', 471), ('in', 451)]
|
||||
|
||||
.. class:: Counter([iterable[, items]])
|
||||
|
||||
A :class:`Counter` is a :class:`dict` subclass for counting hashable items.
|
||||
Elements are stored as dictionary keys and their counts are stored as
|
||||
dictionary values. Counts are allowed to be any integer value including
|
||||
zero or negative counts. The :class:`Counter` class is similar to bags
|
||||
or multisets in other languages.
|
||||
|
||||
Elements are counted from the *iterable* if given. Also, the counts
|
||||
can be initialized from an *items* list of *(element, count)* pairs.
|
||||
If provided, *items* must be a keyword argument::
|
||||
|
||||
>>> c = Counter() # a new, empty counter
|
||||
>>> c = Counter('gallahad') # a new counter from an iterable
|
||||
>>> c = Counter(items=[('a', 4), ('b', 2)]) # a new counter from an items list
|
||||
|
||||
The returned object has a dictionary style interface except that it returns
|
||||
a zero count for missing items (instead of raising a :exc:`KeyError` like a
|
||||
dictionary would)::
|
||||
|
||||
>>> c = Counter(['if', 'your', 'peril', 'be'])
|
||||
>>> c['questions'] # count of a missing element is zero
|
||||
0
|
||||
|
||||
Assigning a count of zero or reducing the count to zero leaves the
|
||||
element in the dictionary. Use ``del`` to remove the entry entirely:
|
||||
|
||||
>>> c = Counter(['arthur', 'gwain'])
|
||||
>>> c['arthur'] = 0 # set the count of "arthur" to zero
|
||||
>>> 'arthur' in c # but "arthur" is still in the counter
|
||||
True
|
||||
>>> del c['arthur'] # del will completely remove the entry
|
||||
>>> 'arthur' in c
|
||||
False
|
||||
|
||||
.. versionadded:: 2.7
|
||||
|
||||
|
||||
Counter objects support two methods beyond those available for all
|
||||
dictionaries:
|
||||
|
||||
.. method:: elements()
|
||||
|
||||
Return an iterator over elements repeating each as many times as its count.
|
||||
Elements are returned in arbitrary order. If an element's count has been
|
||||
set to zero or a negative number, :meth:`elements` will ignore it.
|
||||
|
||||
>>> c = Counter(items=[('a', 4), ('b', 2), ('d', 0), ('e', -2)])
|
||||
>>> list(c.elements())
|
||||
['a', 'a', 'a', 'a', 'b', 'b']
|
||||
|
||||
.. method:: most_common([n])
|
||||
|
||||
Return a list of the *n* most common elements and their counts from
|
||||
the most common to the least. If *n* is not specified or is ``None``,
|
||||
return a list of all element counts in decreasing order of frequency.
|
||||
Elements with equal counts are ordered arbitrarily::
|
||||
|
||||
>>> Counter('abracadabra').most_common(3)
|
||||
[('a', 5), ('r', 2), ('b', 2)]
|
||||
|
||||
The usual dictionary methods are available for :class:`Counter` objects.
|
||||
All of those work the same as they do for dictionaries except for two
|
||||
which work differently for counters.
|
||||
|
||||
.. method:: fromkeys(iterable)
|
||||
|
||||
There is no equivalent class method for :class:`Counter` objects.
|
||||
Raises a :exc:`NotImplementedError` when called.
|
||||
|
||||
.. method:: update(mapping)
|
||||
|
||||
Like :meth:`dict.update` but adds-in counts instead of replacing them.
|
||||
Used for combining two independent counts. Accepts a *mapping* object
|
||||
which can be another counter or can be a :class:`dict` that maps
|
||||
elements to element counts::
|
||||
|
||||
>>> c = Counter('which') # count letters in a word
|
||||
>>> d = Counter('witch') # count letters in another word
|
||||
>>> c.update(d) # add counts from d to those in c
|
||||
>>> c['h'] # count of 'h' is now three
|
||||
3
|
||||
>>> c.update(Counter('watch')) # add in letters from another word
|
||||
>>> c['h'] # count of 'h' is now four
|
||||
4
|
||||
|
||||
|
||||
.. seealso::
|
||||
|
||||
`Multisets <http://en.wikipedia.org/wiki/Multiset>`_
|
||||
in the Wikipedia
|
||||
|
||||
`Bag <http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html>`_
|
||||
a Smalltalk class
|
||||
|
||||
`C++ multisets <http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm>`_
|
||||
a tutorial with standalone examples
|
||||
|
||||
`Bag class <http://code.activestate.com/recipes/259174/>`_
|
||||
an early Python recipe
|
||||
|
||||
Use cases for multisets and mathematical operations on multisets.
|
||||
Knuth, Donald. The Art of Computer Programming Volume II,
|
||||
Section 4.6.3, Exercise 19.
|
||||
|
||||
|
||||
.. _deque-objects:
|
||||
|
||||
|
|
|
@ -9,6 +9,11 @@ from _collections import deque, defaultdict
|
|||
from operator import itemgetter as _itemgetter
|
||||
from keyword import iskeyword as _iskeyword
|
||||
import sys as _sys
|
||||
import heapq as _heapq
|
||||
import itertools as _itertools
|
||||
|
||||
########################################################################
|
||||
### namedtuple #######################################################
|
||||
|
||||
def namedtuple(typename, field_names, verbose=False):
|
||||
"""Returns a new subclass of tuple with named fields.
|
||||
|
@ -108,7 +113,160 @@ def namedtuple(typename, field_names, verbose=False):
|
|||
return result
|
||||
|
||||
|
||||
########################################################################
|
||||
### Counter ##########################################################
|
||||
|
||||
class Counter(dict):
|
||||
'''Dict subclass for counting hashable items. Sometimes called a bag
|
||||
or multiset. Elements are stored as dictionary keys and their counts
|
||||
are stored as dictionary values.
|
||||
|
||||
>>> c = Counter('abracadabra') # count elements from a string
|
||||
|
||||
>>> c.most_common(3) # three most common elements
|
||||
[('a', 5), ('r', 2), ('b', 2)]
|
||||
>>> sorted(c) # list all unique elements
|
||||
['a', 'b', 'c', 'd', 'r']
|
||||
>>> ''.join(sorted(c.elements())) # list elements with repetitions
|
||||
'aaaaabbcdrr'
|
||||
>>> sum(c.values()) # total of all counts
|
||||
11
|
||||
|
||||
>>> c['a'] # count of letter 'a'
|
||||
5
|
||||
>>> for elem in 'shazam': # update counts from an iterable
|
||||
... c[elem] += 1 # by adding 1 to each element's count
|
||||
>>> c['a'] # now there are seven 'a'
|
||||
7
|
||||
>>> del c['r'] # remove all 'r'
|
||||
>>> c['r'] # now there are zero 'r'
|
||||
0
|
||||
|
||||
>>> d = Counter('simsalabim') # make another counter
|
||||
>>> c.update(d) # add in the second counter
|
||||
>>> c['a'] # now there are nine 'a'
|
||||
9
|
||||
|
||||
>>> c.clear() # empty the counter
|
||||
>>> c
|
||||
Counter()
|
||||
|
||||
Note: If a count is set to zero or reduced to zero, it will remain
|
||||
in the counter until the entry is deleted or the counter is cleared:
|
||||
|
||||
>>> c = Counter('aaabbc')
|
||||
>>> c['b'] -= 2 # reduce the count of 'b' by two
|
||||
>>> c.most_common() # 'b' is still in, but its count is zero
|
||||
[('a', 3), ('c', 1), ('b', 0)]
|
||||
|
||||
'''
|
||||
# References:
|
||||
# http://en.wikipedia.org/wiki/Multiset
|
||||
# http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html
|
||||
# http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm
|
||||
# http://code.activestate.com/recipes/259174/
|
||||
# Knuth, TAOCP Vol. II section 4.6.3
|
||||
|
||||
def __init__(self, iterable=None, items=None):
|
||||
'''Create a new, empty Counter object. And if given, count elements
|
||||
from an input iterable. Or, initialize the count from an items list
|
||||
of (element, count) pairs.
|
||||
|
||||
>>> c = Counter('hocus pocus') # count elements in an iterable
|
||||
>>> c = Counter(items=[('a', 4), ('b', 2)]) # take counts from an items list
|
||||
|
||||
'''
|
||||
if iterable is not None:
|
||||
for elem in iterable:
|
||||
self[elem] += 1
|
||||
if items is not None:
|
||||
for elem, count in items:
|
||||
self[elem] += count
|
||||
|
||||
def __missing__(self, key):
|
||||
'The count of elements not in the Counter is zero.'
|
||||
# Needed so that self[missing_item] does not raise KeyError
|
||||
return 0
|
||||
|
||||
def most_common(self, n=None):
|
||||
'''List the n most common elements and their counts from the most
|
||||
common to the least. If n is None, then list all element counts.
|
||||
|
||||
>>> Counter('abracadabra').most_common(3)
|
||||
[('a', 5), ('r', 2), ('b', 2)]
|
||||
|
||||
'''
|
||||
# Emulate Bag.sortedByCount from Smalltalk
|
||||
if n is None:
|
||||
return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
|
||||
return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
|
||||
|
||||
def elements(self):
|
||||
'''Iterator over elements repeating each as many times as its count.
|
||||
|
||||
>>> c = Counter('ABCABC')
|
||||
>>> sorted(c.elements())
|
||||
['A', 'A', 'B', 'B', 'C', 'C']
|
||||
|
||||
# Knuth's example of prime factors of 1836: 2**2 * 3**3 * 17**1
|
||||
>>> import operator
|
||||
>>> prime_factors = Counter(items=[(2,2), (3,3), (17,1)])
|
||||
>>> sorted(prime_factors.elements()) # list individual factors
|
||||
[2, 2, 3, 3, 3, 17]
|
||||
>>> reduce(operator.mul, prime_factors.elements(), 1) # multiply them
|
||||
1836
|
||||
|
||||
Note, if an element's count has been set to zero or a negative number,
|
||||
elements() will ignore it.
|
||||
|
||||
'''
|
||||
# Emulate Bag.do from Smalltalk and Multiset.begin from C++.
|
||||
return _itertools.chain.from_iterable(
|
||||
_itertools.starmap(_itertools.repeat,
|
||||
self.iteritems()))
|
||||
|
||||
# Override dict methods where necessary
|
||||
|
||||
@classmethod
|
||||
def fromkeys(cls, iterable, v=None):
|
||||
# There is no equivalent method for counters because setting v=1
|
||||
# means that no element can have a count greater than one.
|
||||
raise NotImplementedError(
|
||||
'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
|
||||
|
||||
def update(self, mapping):
|
||||
'''Like dict.update() but add counts instead of replacing them.
|
||||
|
||||
Source can be another dictionary or a Counter.instance().
|
||||
|
||||
>>> c = Counter('which')
|
||||
>>> d = Counter('witch')
|
||||
>>> c.update(d) # Add counts from d to those in c
|
||||
>>> c['h'] # Count of 'h' is now three
|
||||
3
|
||||
|
||||
'''
|
||||
# The regular dict.update() operation makes no sense here because the
|
||||
# replace behavior results in the some of original untouched counts
|
||||
# being mixed-in with all of the other counts for a mismash that
|
||||
# doesn't have a straight-forward interpretation in most counting
|
||||
# contexts. Instead, we look to Knuth for suggested operations on
|
||||
# multisets and implement the union-add operation discussed in
|
||||
# TAOCP Volume II section 4.6.3 exercise 19. The Wikipedia entry for
|
||||
# multisets calls that operation a sum or join.
|
||||
for elem, count in mapping.iteritems():
|
||||
self[elem] += count
|
||||
|
||||
def copy(self):
|
||||
'Like dict.copy() but returns a Counter instance instead of a dict.'
|
||||
c = Counter()
|
||||
c.update(self)
|
||||
return c
|
||||
|
||||
def __repr__(self):
|
||||
if not self:
|
||||
return '%s()' % self.__class__.__name__
|
||||
return '%s(items=%r)' % (self.__class__.__name__, self.most_common())
|
||||
|
||||
|
||||
|
||||
|
@ -143,6 +301,49 @@ if __name__ == '__main__':
|
|||
Point3D = namedtuple('Point3D', Point._fields + ('z',))
|
||||
print Point3D.__doc__
|
||||
|
||||
# Check that counters are copyable, deepcopyable, picklable, and have a
|
||||
# repr/eval round-trip
|
||||
import copy
|
||||
words = Counter('which witch had which witches wrist watch'.split())
|
||||
update_test = Counter()
|
||||
update_test.update(words)
|
||||
for i, dup in enumerate([
|
||||
words.copy(),
|
||||
copy.copy(words),
|
||||
copy.deepcopy(words),
|
||||
loads(dumps(words, 0)),
|
||||
loads(dumps(words, 1)),
|
||||
loads(dumps(words, 2)),
|
||||
loads(dumps(words, -1)),
|
||||
eval(repr(words)),
|
||||
update_test,
|
||||
]):
|
||||
msg = (i, dup, words)
|
||||
assert dup is not words, msg
|
||||
assert dup == words, msg
|
||||
assert len(dup) == len(words), msg
|
||||
assert type(dup) == type(words), msg
|
||||
|
||||
# Verify that counters are unhashable
|
||||
try:
|
||||
hash(words)
|
||||
except TypeError:
|
||||
pass
|
||||
else:
|
||||
print 'Failed hashing test'
|
||||
|
||||
# Verify that Counter.fromkeys() is disabled
|
||||
try:
|
||||
Counter.fromkeys('razmataz')
|
||||
except NotImplementedError:
|
||||
pass
|
||||
else:
|
||||
print 'Failed fromkeys() test'
|
||||
|
||||
# Check ABCs
|
||||
assert issubclass(Counter, Mapping)
|
||||
assert isinstance(Counter('asdfasdf'), Mapping)
|
||||
|
||||
import doctest
|
||||
TestResults = namedtuple('TestResults', 'failed attempted')
|
||||
print TestResults(*doctest.testmod())
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import unittest, doctest
|
||||
from test import test_support
|
||||
from collections import namedtuple
|
||||
from collections import namedtuple, Counter, Mapping
|
||||
import pickle, cPickle, copy
|
||||
from collections import Hashable, Iterable, Iterator
|
||||
from collections import Sized, Container, Callable
|
||||
|
@ -346,11 +346,107 @@ class TestCollectionABCs(unittest.TestCase):
|
|||
self.failUnless(issubclass(sample, MutableSequence))
|
||||
self.failIf(issubclass(basestring, MutableSequence))
|
||||
|
||||
class TestCounter(unittest.TestCase):
|
||||
|
||||
def test_basics(self):
|
||||
c = Counter('abcaba')
|
||||
self.assert_(isinstance(c, dict))
|
||||
self.assert_(isinstance(c, Mapping))
|
||||
self.assert_(issubclass(Counter, dict))
|
||||
self.assert_(issubclass(Counter, Mapping))
|
||||
self.assertEqual(len(c), 3)
|
||||
self.assertEqual(sum(c.values()), 6)
|
||||
self.assertEqual(sorted(c.values()), [1, 2, 3])
|
||||
self.assertEqual(sorted(c.keys()), ['a', 'b', 'c'])
|
||||
self.assertEqual(sorted(c), ['a', 'b', 'c'])
|
||||
self.assertEqual(sorted(c.items()),
|
||||
[('a', 3), ('b', 2), ('c', 1)])
|
||||
self.assertEqual(c['b'], 2)
|
||||
self.assertEqual(c['z'], 0)
|
||||
self.assertEqual(c.has_key('c'), True)
|
||||
self.assertEqual(c.has_key('z'), False)
|
||||
self.assertEqual(c.__contains__('c'), True)
|
||||
self.assertEqual(c.__contains__('z'), False)
|
||||
self.assertEqual(c.get('b', 10), 2)
|
||||
self.assertEqual(c.get('z', 10), 10)
|
||||
self.assertEqual(c, dict(a=3, b=2, c=1))
|
||||
self.assertEqual(repr(c),
|
||||
"Counter(items=[('a', 3), ('b', 2), ('c', 1)])")
|
||||
self.assertEqual(c.most_common(), [('a', 3), ('b', 2), ('c', 1)])
|
||||
for i in range(5):
|
||||
self.assertEqual(c.most_common(i),
|
||||
[('a', 3), ('b', 2), ('c', 1)][:i])
|
||||
self.assertEqual(''.join(sorted(c.elements())), 'aaabbc')
|
||||
c['a'] += 1 # increment an existing value
|
||||
c['b'] -= 2 # sub existing value to zero
|
||||
del c['c'] # remove an entry
|
||||
c['d'] -= 2 # sub from a missing value
|
||||
c['e'] = -5 # directly assign a missing value
|
||||
c['f'] += 4 # add to a missing value
|
||||
self.assertEqual(c, dict(a=4, b=0, d=-2, e=-5, f=4))
|
||||
self.assertEqual(''.join(sorted(c.elements())), 'aaaaffff')
|
||||
self.assertEqual(c.pop('f'), 4)
|
||||
self.assertEqual('f' in c, False)
|
||||
for i in range(3):
|
||||
elem, cnt = c.popitem()
|
||||
self.assertEqual(elem in c, False)
|
||||
c.clear()
|
||||
self.assertEqual(c, {})
|
||||
self.assertEqual(repr(c), 'Counter()')
|
||||
self.assertRaises(NotImplementedError, Counter.fromkeys, 'abc')
|
||||
self.assertRaises(TypeError, hash, c)
|
||||
c.update(dict(a=5, b=3, c=1))
|
||||
c.update(Counter(items=[('a', 50), ('b', 30)]))
|
||||
c.__init__(items=[('a', 500), ('b', 300)])
|
||||
c.__init__('cdc')
|
||||
self.assertEqual(c, dict(a=555, b=333, c=3, d=1))
|
||||
self.assertEqual(c.setdefault('d', 5), 1)
|
||||
self.assertEqual(c['d'], 1)
|
||||
self.assertEqual(c.setdefault('e', 5), 5)
|
||||
self.assertEqual(c['e'], 5)
|
||||
|
||||
def test_copying(self):
|
||||
# Check that counters are copyable, deepcopyable, picklable, and
|
||||
#have a repr/eval round-trip
|
||||
words = Counter('which witch had which witches wrist watch'.split())
|
||||
update_test = Counter()
|
||||
update_test.update(words)
|
||||
for i, dup in enumerate([
|
||||
words.copy(),
|
||||
copy.copy(words),
|
||||
copy.deepcopy(words),
|
||||
pickle.loads(pickle.dumps(words, 0)),
|
||||
pickle.loads(pickle.dumps(words, 1)),
|
||||
pickle.loads(pickle.dumps(words, 2)),
|
||||
pickle.loads(pickle.dumps(words, -1)),
|
||||
cPickle.loads(cPickle.dumps(words, 0)),
|
||||
cPickle.loads(cPickle.dumps(words, 1)),
|
||||
cPickle.loads(cPickle.dumps(words, 2)),
|
||||
cPickle.loads(cPickle.dumps(words, -1)),
|
||||
eval(repr(words)),
|
||||
update_test,
|
||||
]):
|
||||
msg = (i, dup, words)
|
||||
self.assert_(dup is not words)
|
||||
self.assertEquals(dup, words)
|
||||
self.assertEquals(len(dup), len(words))
|
||||
self.assertEquals(type(dup), type(words))
|
||||
|
||||
def test_conversions(self):
|
||||
# Convert to: set, list, dict
|
||||
s = 'she sells sea shells by the sea shore'
|
||||
self.assertEqual(sorted(Counter(s).elements()), sorted(s))
|
||||
self.assertEqual(sorted(Counter(s)), sorted(set(s)))
|
||||
self.assertEqual(dict(Counter(s)), dict(Counter(s).items()))
|
||||
self.assertEqual(set(Counter(s)), set(s))
|
||||
|
||||
|
||||
import doctest, collections
|
||||
|
||||
def test_main(verbose=None):
|
||||
NamedTupleDocs = doctest.DocTestSuite(module=collections)
|
||||
test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs, TestCollectionABCs]
|
||||
test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs,
|
||||
TestCollectionABCs, TestCounter]
|
||||
test_support.run_unittest(*test_classes)
|
||||
test_support.run_doctest(collections, verbose)
|
||||
|
||||
|
|
|
@ -128,6 +128,9 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #1696199: Add collections.Counter() for rapid and convenient
|
||||
counting.
|
||||
|
||||
- Issue #3860: GzipFile and BZ2File now support the context manager protocol.
|
||||
|
||||
- Issue #4272: Add an optional argument to the GzipFile constructor to override
|
||||
|
|
Loading…
Reference in New Issue