544 lines
19 KiB
TeX
544 lines
19 KiB
TeX
\section{\module{itertools} ---
|
|
Functions creating iterators for efficient looping}
|
|
|
|
\declaremodule{standard}{itertools}
|
|
\modulesynopsis{Functions creating iterators for efficient looping.}
|
|
\moduleauthor{Raymond Hettinger}{python@rcn.com}
|
|
\sectionauthor{Raymond Hettinger}{python@rcn.com}
|
|
\versionadded{2.3}
|
|
|
|
|
|
This module implements a number of iterator building blocks inspired
|
|
by constructs from the Haskell and SML programming languages. Each
|
|
has been recast in a form suitable for Python.
|
|
|
|
The module standardizes a core set of fast, memory efficient tools
|
|
that are useful by themselves or in combination. Standardization helps
|
|
avoid the readability and reliability problems which arise when many
|
|
different individuals create their own slightly varying implementations,
|
|
each with their own quirks and naming conventions.
|
|
|
|
The tools are designed to combine readily with one another. This makes
|
|
it easy to construct more specialized tools succinctly and efficiently
|
|
in pure Python.
|
|
|
|
For instance, SML provides a tabulation tool: \code{tabulate(f)}
|
|
which produces a sequence \code{f(0), f(1), ...}. This toolbox
|
|
provides \function{imap()} and \function{count()} which can be combined
|
|
to form \code{imap(f, count())} and produce an equivalent result.
|
|
|
|
Likewise, the functional tools are designed to work well with the
|
|
high-speed functions provided by the \refmodule{operator} module.
|
|
|
|
The module author welcomes suggestions for other basic building blocks
|
|
to be added to future versions of the module.
|
|
|
|
Whether cast in pure python form or compiled code, tools that use iterators
|
|
are more memory efficient (and faster) than their list based counterparts.
|
|
Adopting the principles of just-in-time manufacturing, they create
|
|
data when and where needed instead of consuming memory with the
|
|
computer equivalent of ``inventory''.
|
|
|
|
The performance advantage of iterators becomes more acute as the number
|
|
of elements increases -- at some point, lists grow large enough to
|
|
severely impact memory cache performance and start running slowly.
|
|
|
|
\begin{seealso}
|
|
\seetext{The Standard ML Basis Library,
|
|
\citetitle[http://www.standardml.org/Basis/]
|
|
{The Standard ML Basis Library}.}
|
|
|
|
\seetext{Haskell, A Purely Functional Language,
|
|
\citetitle[http://www.haskell.org/definition/]
|
|
{Definition of Haskell and the Standard Libraries}.}
|
|
\end{seealso}
|
|
|
|
|
|
\subsection{Itertool functions \label{itertools-functions}}
|
|
|
|
The following module functions all construct and return iterators.
|
|
Some provide streams of infinite length, so they should only be accessed
|
|
by functions or loops that truncate the stream.
|
|
|
|
\begin{funcdesc}{chain}{*iterables}
|
|
Make an iterator that returns elements from the first iterable until
|
|
it is exhausted, then proceeds to the next iterable, until all of the
|
|
iterables are exhausted. Used for treating consecutive sequences as
|
|
a single sequence. Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def chain(*iterables):
|
|
for it in iterables:
|
|
for element in it:
|
|
yield element
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{count}{\optional{n}}
|
|
Make an iterator that returns consecutive integers starting with \var{n}.
|
|
If not specified \var{n} defaults to zero.
|
|
Does not currently support python long integers. Often used as an
|
|
argument to \function{imap()} to generate consecutive data points.
|
|
Also, used with \function{izip()} to add sequence numbers. Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def count(n=0):
|
|
while True:
|
|
yield n
|
|
n += 1
|
|
\end{verbatim}
|
|
|
|
Note, \function{count()} does not check for overflow and will return
|
|
negative numbers after exceeding \code{sys.maxint}. This behavior
|
|
may change in the future.
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{cycle}{iterable}
|
|
Make an iterator returning elements from the iterable and saving a
|
|
copy of each. When the iterable is exhausted, return elements from
|
|
the saved copy. Repeats indefinitely. Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def cycle(iterable):
|
|
saved = []
|
|
for element in iterable:
|
|
yield element
|
|
saved.append(element)
|
|
while saved:
|
|
for element in saved:
|
|
yield element
|
|
\end{verbatim}
|
|
|
|
Note, this member of the toolkit may require significant
|
|
auxiliary storage (depending on the length of the iterable).
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{dropwhile}{predicate, iterable}
|
|
Make an iterator that drops elements from the iterable as long as
|
|
the predicate is true; afterwards, returns every element. Note,
|
|
the iterator does not produce \emph{any} output until the predicate
|
|
is true, so it may have a lengthy start-up time. Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def dropwhile(predicate, iterable):
|
|
iterable = iter(iterable)
|
|
for x in iterable:
|
|
if not predicate(x):
|
|
yield x
|
|
break
|
|
for x in iterable:
|
|
yield x
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{groupby}{iterable\optional{, key}}
|
|
Make an iterator that returns consecutive keys and groups from the
|
|
\var{iterable}. The \var{key} is a function computing a key value for each
|
|
element. If not specified or is \code{None}, \var{key} defaults to an
|
|
identity function and returns the element unchanged. Generally, the
|
|
iterable needs to already be sorted on the same key function.
|
|
|
|
The returned group is itself an iterator that shares the underlying
|
|
iterable with \function{groupby()}. Because the source is shared, when
|
|
the \function{groupby} object is advanced, the previous group is no
|
|
longer visible. So, if that data is needed later, it should be stored
|
|
as a list:
|
|
|
|
\begin{verbatim}
|
|
groups = []
|
|
uniquekeys = []
|
|
for k, g in groupby(data, keyfunc):
|
|
groups.append(list(g)) # Store group iterator as a list
|
|
uniquekeys.append(k)
|
|
\end{verbatim}
|
|
|
|
\function{groupby()} is equivalent to:
|
|
|
|
\begin{verbatim}
|
|
class groupby(object):
|
|
def __init__(self, iterable, key=None):
|
|
if key is None:
|
|
key = lambda x: x
|
|
self.keyfunc = key
|
|
self.it = iter(iterable)
|
|
self.tgtkey = self.currkey = self.currvalue = xrange(0)
|
|
def __iter__(self):
|
|
return self
|
|
def next(self):
|
|
while self.currkey == self.tgtkey:
|
|
self.currvalue = self.it.next() # Exit on StopIteration
|
|
self.currkey = self.keyfunc(self.currvalue)
|
|
self.tgtkey = self.currkey
|
|
return (self.currkey, self._grouper(self.tgtkey))
|
|
def _grouper(self, tgtkey):
|
|
while self.currkey == tgtkey:
|
|
yield self.currvalue
|
|
self.currvalue = self.it.next() # Exit on StopIteration
|
|
self.currkey = self.keyfunc(self.currvalue)
|
|
\end{verbatim}
|
|
\versionadded{2.4}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{ifilter}{predicate, iterable}
|
|
Make an iterator that filters elements from iterable returning only
|
|
those for which the predicate is \code{True}.
|
|
If \var{predicate} is \code{None}, return the items that are true.
|
|
Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def ifilter(predicate, iterable):
|
|
if predicate is None:
|
|
predicate = bool
|
|
for x in iterable:
|
|
if predicate(x):
|
|
yield x
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{ifilterfalse}{predicate, iterable}
|
|
Make an iterator that filters elements from iterable returning only
|
|
those for which the predicate is \code{False}.
|
|
If \var{predicate} is \code{None}, return the items that are false.
|
|
Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def ifilterfalse(predicate, iterable):
|
|
if predicate is None:
|
|
predicate = bool
|
|
for x in iterable:
|
|
if not predicate(x):
|
|
yield x
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{imap}{function, *iterables}
|
|
Make an iterator that computes the function using arguments from
|
|
each of the iterables. If \var{function} is set to \code{None}, then
|
|
\function{imap()} returns the arguments as a tuple. Like
|
|
\function{map()} but stops when the shortest iterable is exhausted
|
|
instead of filling in \code{None} for shorter iterables. The reason
|
|
for the difference is that infinite iterator arguments are typically
|
|
an error for \function{map()} (because the output is fully evaluated)
|
|
but represent a common and useful way of supplying arguments to
|
|
\function{imap()}.
|
|
Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def imap(function, *iterables):
|
|
iterables = map(iter, iterables)
|
|
while True:
|
|
args = [i.next() for i in iterables]
|
|
if function is None:
|
|
yield tuple(args)
|
|
else:
|
|
yield function(*args)
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{islice}{iterable, \optional{start,} stop \optional{, step}}
|
|
Make an iterator that returns selected elements from the iterable.
|
|
If \var{start} is non-zero, then elements from the iterable are skipped
|
|
until start is reached. Afterward, elements are returned consecutively
|
|
unless \var{step} is set higher than one which results in items being
|
|
skipped. If \var{stop} is \code{None}, then iteration continues until
|
|
the iterator is exhausted, if at all; otherwise, it stops at the specified
|
|
position. Unlike regular slicing,
|
|
\function{islice()} does not support negative values for \var{start},
|
|
\var{stop}, or \var{step}. Can be used to extract related fields
|
|
from data where the internal structure has been flattened (for
|
|
example, a multi-line report may list a name field on every
|
|
third line). Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def islice(iterable, *args):
|
|
s = slice(*args)
|
|
it = iter(xrange(s.start or 0, s.stop or sys.maxint, s.step or 1))
|
|
nexti = it.next()
|
|
for i, element in enumerate(iterable):
|
|
if i == nexti:
|
|
yield element
|
|
nexti = it.next()
|
|
\end{verbatim}
|
|
|
|
If \var{start} is \code{None}, then iteration starts at zero.
|
|
If \var{step} is \code{None}, then the step defaults to one.
|
|
\versionchanged[accept \code{None} values for default \var{start} and
|
|
\var{step}]{2.5}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{izip}{*iterables}
|
|
Make an iterator that aggregates elements from each of the iterables.
|
|
Like \function{zip()} except that it returns an iterator instead of
|
|
a list. Used for lock-step iteration over several iterables at a
|
|
time. Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def izip(*iterables):
|
|
iterables = map(iter, iterables)
|
|
while iterables:
|
|
result = [it.next() for it in iterables]
|
|
yield tuple(result)
|
|
\end{verbatim}
|
|
|
|
\versionchanged[When no iterables are specified, returns a zero length
|
|
iterator instead of raising a \exception{TypeError}
|
|
exception]{2.4}
|
|
|
|
Note, the left-to-right evaluation order of the iterables is guaranteed.
|
|
This makes possible an idiom for clustering a data series into n-length
|
|
groups using \samp{izip(*[iter(s)]*n)}. For data that doesn't fit
|
|
n-length groups exactly, the last tuple can be pre-padded with fill
|
|
values using \samp{izip(*[chain(s, [None]*(n-1))]*n)}.
|
|
|
|
Note, when \function{izip()} is used with unequal length inputs, subsequent
|
|
iteration over the longer iterables cannot reliably be continued after
|
|
\function{izip()} terminates. Potentially, up to one entry will be missing
|
|
from each of the left-over iterables. This occurs because a value is fetched
|
|
from each iterator in-turn, but the process ends when one of the iterators
|
|
terminates. This leaves the last fetched values in limbo (they cannot be
|
|
returned in a final, incomplete tuple and they are cannot be pushed back
|
|
into the iterator for retrieval with \code{it.next()}). In general,
|
|
\function{izip()} should only be used with unequal length inputs when you
|
|
don't care about trailing, unmatched values from the longer iterables.
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{repeat}{object\optional{, times}}
|
|
Make an iterator that returns \var{object} over and over again.
|
|
Runs indefinitely unless the \var{times} argument is specified.
|
|
Used as argument to \function{imap()} for invariant parameters
|
|
to the called function. Also used with \function{izip()} to create
|
|
an invariant part of a tuple record. Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def repeat(object, times=None):
|
|
if times is None:
|
|
while True:
|
|
yield object
|
|
else:
|
|
for i in xrange(times):
|
|
yield object
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{starmap}{function, iterable}
|
|
Make an iterator that computes the function using arguments tuples
|
|
obtained from the iterable. Used instead of \function{imap()} when
|
|
argument parameters are already grouped in tuples from a single iterable
|
|
(the data has been ``pre-zipped''). The difference between
|
|
\function{imap()} and \function{starmap()} parallels the distinction
|
|
between \code{function(a,b)} and \code{function(*c)}.
|
|
Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def starmap(function, iterable):
|
|
iterable = iter(iterable)
|
|
while True:
|
|
yield function(*iterable.next())
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{takewhile}{predicate, iterable}
|
|
Make an iterator that returns elements from the iterable as long as
|
|
the predicate is true. Equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def takewhile(predicate, iterable):
|
|
for x in iterable:
|
|
if predicate(x):
|
|
yield x
|
|
else:
|
|
break
|
|
\end{verbatim}
|
|
\end{funcdesc}
|
|
|
|
\begin{funcdesc}{tee}{iterable\optional{, n=2}}
|
|
Return \var{n} independent iterators from a single iterable.
|
|
The case where \code{n==2} is equivalent to:
|
|
|
|
\begin{verbatim}
|
|
def tee(iterable):
|
|
def gen(next, data={}, cnt=[0]):
|
|
for i in count():
|
|
if i == cnt[0]:
|
|
item = data[i] = next()
|
|
cnt[0] += 1
|
|
else:
|
|
item = data.pop(i)
|
|
yield item
|
|
it = iter(iterable)
|
|
return (gen(it.next), gen(it.next))
|
|
\end{verbatim}
|
|
|
|
Note, once \function{tee()} has made a split, the original \var{iterable}
|
|
should not be used anywhere else; otherwise, the \var{iterable} could get
|
|
advanced without the tee objects being informed.
|
|
|
|
Note, this member of the toolkit may require significant auxiliary
|
|
storage (depending on how much temporary data needs to be stored).
|
|
In general, if one iterator is going to use most or all of the data before
|
|
the other iterator, it is faster to use \function{list()} instead of
|
|
\function{tee()}.
|
|
\versionadded{2.4}
|
|
\end{funcdesc}
|
|
|
|
|
|
\subsection{Examples \label{itertools-example}}
|
|
|
|
The following examples show common uses for each tool and
|
|
demonstrate ways they can be combined.
|
|
|
|
\begin{verbatim}
|
|
|
|
>>> amounts = [120.15, 764.05, 823.14]
|
|
>>> for checknum, amount in izip(count(1200), amounts):
|
|
... print 'Check %d is for $%.2f' % (checknum, amount)
|
|
...
|
|
Check 1200 is for $120.15
|
|
Check 1201 is for $764.05
|
|
Check 1202 is for $823.14
|
|
|
|
>>> import operator
|
|
>>> for cube in imap(operator.pow, xrange(1,5), repeat(3)):
|
|
... print cube
|
|
...
|
|
1
|
|
8
|
|
27
|
|
64
|
|
|
|
>>> reportlines = ['EuroPython', 'Roster', '', 'alex', '', 'laura',
|
|
'', 'martin', '', 'walter', '', 'mark']
|
|
>>> for name in islice(reportlines, 3, None, 2):
|
|
... print name.title()
|
|
...
|
|
Alex
|
|
Laura
|
|
Martin
|
|
Walter
|
|
Mark
|
|
|
|
# Show a dictionary sorted and grouped by value
|
|
>>> from operator import itemgetter
|
|
>>> d = dict(a=1, b=2, c=1, d=2, e=1, f=2, g=3)
|
|
>>> di = sorted(d.iteritems(), key=itemgetter(1))
|
|
>>> for k, g in groupby(di, key=itemgetter(1)):
|
|
... print k, map(itemgetter(0), g)
|
|
...
|
|
1 ['a', 'c', 'e']
|
|
2 ['b', 'd', 'f']
|
|
3 ['g']
|
|
|
|
# Find runs of consecutive numbers using groupby. The key to the solution
|
|
# is differencing with a range so that consecutive numbers all appear in
|
|
# same group.
|
|
>>> data = [ 1, 4,5,6, 10, 15,16,17,18, 22, 25,26,27,28]
|
|
>>> for k, g in groupby(enumerate(data), lambda (i,x):i-x):
|
|
... print map(operator.itemgetter(1), g)
|
|
...
|
|
[1]
|
|
[4, 5, 6]
|
|
[10]
|
|
[15, 16, 17, 18]
|
|
[22]
|
|
[25, 26, 27, 28]
|
|
|
|
\end{verbatim}
|
|
|
|
|
|
\subsection{Recipes \label{itertools-recipes}}
|
|
|
|
This section shows recipes for creating an extended toolset using the
|
|
existing itertools as building blocks.
|
|
|
|
The extended tools offer the same high performance as the underlying
|
|
toolset. The superior memory performance is kept by processing elements one
|
|
at a time rather than bringing the whole iterable into memory all at once.
|
|
Code volume is kept small by linking the tools together in a functional style
|
|
which helps eliminate temporary variables. High speed is retained by
|
|
preferring ``vectorized'' building blocks over the use of for-loops and
|
|
generators which incur interpreter overhead.
|
|
|
|
|
|
\begin{verbatim}
|
|
def take(n, seq):
|
|
return list(islice(seq, n))
|
|
|
|
def enumerate(iterable):
|
|
return izip(count(), iterable)
|
|
|
|
def tabulate(function):
|
|
"Return function(0), function(1), ..."
|
|
return imap(function, count())
|
|
|
|
def iteritems(mapping):
|
|
return izip(mapping.iterkeys(), mapping.itervalues())
|
|
|
|
def nth(iterable, n):
|
|
"Returns the nth item"
|
|
return list(islice(iterable, n, n+1))
|
|
|
|
def all(seq, pred=None):
|
|
"Returns True if pred(x) is true for every element in the iterable"
|
|
for elem in ifilterfalse(pred, seq):
|
|
return False
|
|
return True
|
|
|
|
def any(seq, pred=None):
|
|
"Returns True if pred(x) is true for at least one element in the iterable"
|
|
for elem in ifilter(pred, seq):
|
|
return True
|
|
return False
|
|
|
|
def no(seq, pred=None):
|
|
"Returns True if pred(x) is false for every element in the iterable"
|
|
for elem in ifilter(pred, seq):
|
|
return False
|
|
return True
|
|
|
|
def quantify(seq, pred=None):
|
|
"Count how many times the predicate is true in the sequence"
|
|
return sum(imap(pred, seq))
|
|
|
|
def padnone(seq):
|
|
"""Returns the sequence elements and then returns None indefinitely.
|
|
|
|
Useful for emulating the behavior of the built-in map() function.
|
|
"""
|
|
return chain(seq, repeat(None))
|
|
|
|
def ncycles(seq, n):
|
|
"Returns the sequence elements n times"
|
|
return chain(*repeat(seq, n))
|
|
|
|
def dotproduct(vec1, vec2):
|
|
return sum(imap(operator.mul, vec1, vec2))
|
|
|
|
def flatten(listOfLists):
|
|
return list(chain(*listOfLists))
|
|
|
|
def repeatfunc(func, times=None, *args):
|
|
"""Repeat calls to func with specified arguments.
|
|
|
|
Example: repeatfunc(random.random)
|
|
"""
|
|
if times is None:
|
|
return starmap(func, repeat(args))
|
|
else:
|
|
return starmap(func, repeat(args, times))
|
|
|
|
def pairwise(iterable):
|
|
"s -> (s0,s1), (s1,s2), (s2, s3), ..."
|
|
a, b = tee(iterable)
|
|
try:
|
|
b.next()
|
|
except StopIteration:
|
|
pass
|
|
return izip(a, b)
|
|
|
|
def grouper(n, iterable, padvalue=None):
|
|
"grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
|
|
return izip(*[chain(iterable, repeat(padvalue, n-1))]*n)
|
|
|
|
|
|
\end{verbatim}
|