Joining multiple iteratorars by a key_问答_开发者

Given: n iterators, and a function to get a key for an item for each of them

Assuming:

The iterators provide the items sorted by the key
The keys from any iterator are unique

I want to iterate through them joined by the keys. Eg, given the following 2 lists:

[('a', {type:'x', mtime:Datetime()}), ('b', {type='y', mtime:Datetime()})]
[('b', Datetime()), ('c', Datetime())]

Using the first item in each tuple as the key, I want to ge开发者_运维知识库t:

(('a', {type:'x', mtime:Datetime()}), None)
(('b', {type:'y', mtime:Datetime()}), ('b', Datetime()),)
(None, ('c', Datetime()),)

So I hacked up this method:

def iter_join(*iterables_and_key_funcs):
    iterables_len = len(iterables_and_key_funcs)

    keys_funcs = tuple(key_func for iterable, key_func in iterables_and_key_funcs)
    iters = tuple(iterable.__iter__() for iterable, key_func in iterables_and_key_funcs)

    current_values = [None] * iterables_len
    current_keys= [None] * iterables_len
    iters_stoped = [False] * iterables_len

    def get_nexts(iters_needing_fetch):
        for i, fetch in enumerate(iters_needing_fetch):
            if fetch and not iters_stoped[i]:
                try:
                    current_values[i] = iters[i].next()
                    current_keys[i] = keys_funcs[i](current_values[i])
                except StopIteration:
                    iters_stoped[i] = True
                    current_values[i] = None
                    current_keys[i] = None

    get_nexts([True] * iterables_len)

    while not all(iters_stoped):
        min_key = min(key
                      for key, iter_stoped in zip(current_keys, iters_stoped)
                      if not iter_stoped)

        keys_equal_to_min = tuple(key == min_key for key in current_keys)
        yield tuple(value if key_eq_min else None
                    for key_eq_min, value in zip(keys_equal_to_min, current_values))

        get_nexts(keys_equal_to_min)

and test it:

key_is_value = lambda v: v
a = (  2, 3, 4,  )
b = (1,          )
c = (          5,)
d = (1,   3,   5,)
l = list(iter_join(
        (a, key_is_value),
        (b, key_is_value),
        (c, key_is_value),
        (d, key_is_value),
    ))
import pprint; pprint.pprint(l)

which outputs:

[(None, 1, None, 1),
 (2, None, None, None),
 (3, None, None, 3),
 (4, None, None, None),
 (None, None, 5, 5)]

Is there an existing method to do this? I checkout itertools, but could not find anything.

Are there any ways to improve my method? Make it simpler, faster, etc..

Update: Solution used

I decided to simplify the contract for this function by requiring the iterators to yield tuple(key, value) or tuple(key, *values). Using agf's answer as a starting point, I came up with this :

def join_items(*iterables):

    iters = tuple(iter(iterable) for iterable in iterables)
    current_items = [next(itr, None) for itr in iters]

    while True:
        try:
            key = min(item[0] for item in current_items if item != None)
        except ValueError:
            break

        yield tuple(item if item != None and item[0]==key else None
                    for item in current_items)

        for i, (item, itr) in enumerate(zip(current_items, iters)):
            if item != None and item[0] == key:
                current_items[i] = next(itr, None)


a = (      (2,), (3,), (4,),      )
b = ((1,),                        )
c = (                        (5,),)
d = ((1,),       (3,),       (5,),)
e = (                             )

import pprint; pprint.pprint(list(join_items(a, b, c, d, e)))

[(None, (1,), None, (1,), None),
 ((2,), None, None, None, None),
 ((3,), None, None, (3,), None),
 ((4,), None, None, None, None),
 (None, None, (5,), (5,), None)]

The example at the beginning of your question is different than at the end.

For the first example, I would do this:

x = [('a', {}), ('b', {})]
y = [('b', {}), ('c', {})]
xd, yd = dict(x), dict(y)
combined = []
for k in sorted(set(xd.keys()+yd.keys())):
    row = []
    for d in (xd, yd):
        row.append((k, d[k]) if k in d else None)
    combined.append(tuple(row))

for row in combined:
    print row

gives

(('a', {}), None)
(('b', {}), ('b', {}))
(None, ('c', {}))

For the second example

a = (  2, 3, 4,  )
b = (1,          )
c = (          5,)
d = (1,   3,   5,)

abcd = map(set, [a,b,c,d])
values = sorted(set(a+b+c+d))
print [tuple(v if v in row else None for row in abcd) for v in values]

gives

[(None, 1, None, 1),
 (2, None, None, None),
 (3, None, None, 3),
 (4, None, None, None),
 (None, None, 5, 5)]

But what are you trying to accomplish? Perhaps you need different data structures.

a = (  2, 3, 4,  )
b = (1,          )
c = (          5,)
d = (1,   3,   5,)

iters = [iter(x) for x in (a, b, c, d)]

this = [next(i) for i in iters]

while True:
    try:
        key = min(i for i in this if i != None)
    except ValueError:
        break
    for i, val in enumerate(this):
        if val == key:
            print val,
            this[i] = next(iters[i], None)
        else:
            print None,
    print

import itertools as it
import heapq
import pprint

def imerge(*iterables):
    '''
    http://code.activestate.com/recipes/491285-iterator-merge/
    Author: Raymond Hettinger

    Merge multiple sorted inputs into a single sorted output.

    Equivalent to:  sorted(itertools.chain(*iterables))

    >>> list(imerge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
    [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]

    '''
    heappop, siftup, _StopIteration = heapq.heappop, heapq._siftup, StopIteration

    h = []
    h_append = h.append
    for it in map(iter, iterables):
        try:
            next = it.next
            h_append([next(), next])
        except _StopIteration:
            pass
    heapq.heapify(h)

    while 1:
        try:
            while 1:
                v, next = s = h[0]      # raises IndexError when h is empty
                yield v
                s[0] = next()           # raises StopIteration when exhausted
                siftup(h, 0)            # restore heap condition
        except _StopIteration:
            heappop(h)                  # remove empty iterator
        except IndexError:
            return

a = (  2, 3, 4,  )
b = (1,          )
c = (          5,)
d = (1,   3,   5,)

def tag(iterator,val):
    for elt in iterator:
        yield elt,val

def expand(group):
    dct=dict((tag,val)for val,tag in group)
    result=[dct.get(tag,None) for tag in range(4)]
    return result

pprint.pprint(
    [ expand(group)
     for key,group in it.groupby(
          imerge(*it.imap(tag,(a,b,c,d),it.count())),
          key=lambda x:x[0]
          )])

Explanation:

Life would be easier if we mergesort the iterators. This can be done with imerge

itertools.groupby gives us the desired grouping if we feed it the
result from imerge. The rest is just niggling details.

pprint.pprint(
   [ list(group)
    for key,group in it.groupby(
         imerge(a,b,c,d))
     ] )
# [[1, 1], [2], [3, 3], [4], [5, 5]]

From the output above, it is clear we need to keep track of the source of each value -- (did the value come from a, or b, etc.). That way we can pad the output with Nones in the right places.
To do that I used it.imap(tag,(a,b,c,d),it.count()). tag(a) returns an iterator which yields values from a along with a counter value.
```
>>> list(tag(a,0))
# [(2, 0), (3, 0), (4, 0)]
```

The output now looks like this:

pprint.pprint(
   [ list(group)
    for key,group in it.groupby(
         imerge(*it.imap(tag,(a,b,c,d),it.count())),
         key=lambda x:x[0]
         )])

# [[(1, 1), (1, 3)], [(2, 0)], [(3, 0), (3, 3)], [(4, 0)], [(5, 2), (5, 3)]]

Finally, we use expand(group) to change [(1, 1), (1, 3)] into [None, 1, None, 1].

I realize the answer has already been decided. However I do think that a cleaner way can be written up using the defaultdict object from the collections library.

Here is an example from http://docs.python.org/release/2.6.7/library/collections.html

>>> s = [('yellow', 1), ('blue', 2), ('yellow', 3), ('blue', 4), ('red', 1)]
>>> d = defaultdict(list)
>>> for k, v in s:
...    d[k].append(v)
>>>>d.items()
[('blue', [2, 4]), ('red', [1]), ('yellow', [1, 3])]