suppose I have a list of coordinates:
data = [
[(10, 20), (100, 120), (0, 5), (50, 60)],
[(13, 20), (300, 400), (100, 120), (51, 62)]
]
and I want to take all tuples that either appear in each list in data, or any tuple that differs from all tuples in lists other than its own by 3 or less. How can I do this efficiently in Python?
For the above example, the results should be:
[[(100, 120), # since it occurs in both lists
(10, 20), (13, 20), # since they differ by only 3
(50, 60), (51, 60)]]
开发者_如何学编程(0, 5) and (300, 400) would not be included, since they don't appear in both lists and are not different from elements in lists other than their own by 3 or less.
how can this be computed? thanks.
A naive implementation of this will be slow: O(n^2), testing for each node against each other node. Use a tree to speed it up.
This implementation uses a simple quadtree to make searching more efficient. This doesn't make any attempt to balance the tree, so a badly-ordered list of points could make it very inefficient. For a lot of uses, simply shuffling the list is likely to make it good enough; just be sure not to pass it a lot of items sorted by coordinate, since that'll reduce it to a linked list.
The optimization here is simple: if we're looking for items within a Euclidean distance of 3 units of some point, and we know that all items in a subtree are at least 3 units to the right, there's no way any points in that area could possibly be less than 3 units away.
This code is public domain. Try not to turn it in as homework.
#!/usr/bin/python
import math
def euclidean_distance(pos1, pos2):
x = math.pow(pos1[0] - pos2[0], 2)
y = math.pow(pos1[1] - pos2[1], 2)
return math.sqrt(x + y)
class QuadTreeNode(object):
def __init__(self, pos):
"""
Create a QuadTreeNode at the specified position. pos must be an (x, y) tuple.
Children are classified by quadrant.
"""
# Children of this node are ordered TL, TR, BL, BL (origin top-left).
self.children = [None, None, None, None]
self.pos = pos
def classify_node(self, pos):
"""
Return which entry in children can contain pos. If pos is equal to this
node, return None.
>>> node = QuadTreeNode((10, 20))
>>> node.classify_node((10, 20)) == None
True
>>> node.classify_node((2, 2))
0
>>> node.classify_node((50, 2))
1
>>> node.classify_node((2, 50))
2
>>> node.classify_node((50, 50))
3
X boundary condition:
>>> node.classify_node((10, 2))
0
>>> node.classify_node((10, 50))
2
Y boundary conditoin:
>>> node.classify_node((2, 20))
0
>>> node.classify_node((50, 20))
1
"""
if pos == self.pos:
return None
if pos[0] <= self.pos[0]: # Left
if pos[1] <= self.pos[1]: # Top-left
return 0
else: # Bottom-left
return 2
else: # Right
if pos[1] <= self.pos[1]: # Top-right
return 1
else: # Bottom-right
return 3
assert False, "not reached"
def add_node(self, node):
"""
Add a specified point under this node.
"""
type = self.classify_node(node.pos)
if type is None:
# node is equal to self, so this is a duplicate node. Ignore it.
return
if self.children[type] is None:
self.children[type] = node
else:
# We already have a node there; recurse and add it to the child.
self.children[type].add_node(node)
@staticmethod
def CreateQuadTree(data):
"""
Create a quad tree from the specified list of points.
"""
root = QuadTreeNode(data[0])
for val in data[1:]:
node = QuadTreeNode(val)
root.add_node(node)
return root
def distance_from_pos(self, pos):
return euclidean_distance(self.pos, pos)
def __str__(self): return str(self.pos)
def find_point_within_range(self, pos, distance):
"""
If a point exists within the specified Euclidean distance of the specified
point, return it. Otherwise, return None.
"""
if self.distance_from_pos(pos) <= distance:
return self
for axis in range(0, 4):
if self.children[axis] is None:
# We don't have a node on this axis.
continue
# If moving forward on this axis would permanently put us out of range of
# the point, short circuit the search on that axis.
if axis in (0, 2): # axis moves left on X
if self.pos[0] < pos[0] - distance:
continue
if axis in (1, 3): # axis moves right on X
if self.pos[0] > pos[0] + distance:
continue
if axis in (0, 1): # axis moves up on Y
if self.pos[1] < pos[1] - distance:
continue
if axis in (2, 3): # axis moves down on Y
if self.pos[1] > pos[1] + distance:
continue
node = self.children[axis].find_point_within_range(pos, distance)
if node is not None:
return node
return None
@staticmethod
def find_point_in_range_for_all_trees(point, trees, distance):
"""
If all QuadTreeNodes in trees contain a a point within the specified distance
of point, return True, Otherwise, return False.
"""
for tree in trees:
if tree.find_point_within_range(point, distance) is None:
return False
return True
def test_naive(data, distance):
def find_point_in_list(iter, point):
for i in iter:
if euclidean_distance(i, point) <= distance:
return True
return False
def find_point_in_all_lists(point):
for d in data:
if not find_point_in_list(d, point):
return False
return True
results = []
for d in data:
for point in d:
if find_point_in_all_lists(point):
results.append(point)
return set(results)
def test_tree(data, distance):
trees = [QuadTreeNode.CreateQuadTree(d) for d in data]
results = []
for d in data:
for point in d:
if QuadTreeNode.find_point_in_range_for_all_trees(point, trees, 3):
results.append(point)
return set(results)
def test():
sample_data = [
[(10, 20), (100, 120), (0, 5), (50, 60)],
[(13, 20), (300, 400), (100, 120), (51, 62)]
]
result1 = test_naive(sample_data, 3)
result2 = test_tree(sample_data, 3)
print result1
assert result1 == result2
# Loosely validate the tree algorithm against a lot of sample data, and compare
# performance while we're at it:
def random_data():
import random
return [(random.randint(0,1000), random.randint(0,1000)) for d in range(0,500)]
data = [random_data() for x in range(0,10)]
print "Searching (naive)..."
result1 = test_naive(data, 3)
print "Searching (tree)..."
result2 = test_tree(data, 3)
assert result1 == result2
if __name__ == "__main__":
test()
import doctest
doctest.testmod()
I hope this would get you started. Any improvements would be greatly appreciated.
Appears in all lists is trivial - simply take the intersection of all elements in the list.
>>> data = [
... [(10, 20), (100, 120), (0, 5), (50, 60)],
... [(13, 20), (300, 400), (100, 120), (51, 62)]
... ]
>>> dataset = [set(d) for d in data]
>>> dataset[0].intersection(*dataset[1:])
set([(100, 120)])
"Different by at 3 or less" with tuples other than the ones in the same list appears to me a graph/2d space problem. There's no simple algorithm for this, without a polynomial algorithm, and if your dataset is not very big, you could just iterate through them and group close points that are not in the same list.
@barrycarter's intuition is interesting: to reduce the number of comparisons (where by "comparing" two points we mean checking if their distance is <= 3
), "virtually slice" the 2D plane into suitable "cells", so that each point need only be compared with ones in adjacent "cells". This could really help (wrt a brute force solution where each point needs to be compared with basically all others) if your data set is large.
Here's a Python implementation of the idea (since barry's code sketch appears to be Perl or something), aiming for clarity rather than for speed...:
import collections
import math
def cellof(point):
x, y = point
return x//3, y//3
def distance(p1, p2):
return math.hypot(p1[0]-p2[0], p1[1]-p2[1])
def process(data):
cells = collections.defaultdict(list)
for i, points in enumerate(data):
for p in points:
cx, cy = cellof(p)
cells[cx, cy].append((i, p))
res = set()
for c, alist in cells.items():
for i, p in alist:
for cx in range(c[0]-1, c[0]+2):
for cy in range(c[1]-1, c[1]+2):
otherc = cells[cx, cy]
for otheri, otherp in otherc:
if i == otheri: continue
dst = distance(p, otherp)
if dst <= 3: res.add(p)
return sorted(res)
if __name__ == '__main__': # just an example
data = [
[(10, 20), (100, 120), (0, 5), (50, 60)],
[(13, 20), (300, 400), (100, 120), (51, 62)]
]
print process(data)
When run as a script, this produces the output
[(10, 20), (13, 20), (50, 60), (51, 62), (100, 120)]
Of course, to determine whether this is worthwhile, or the simpler brute-force approach is indeed better, the only viable approach is for you to run benchmarks of both solutions on realistic data -- the kinds of data sets your program will actually need to deal with in real life. Depending on how many lists you have, how many points on each, how closely spaced, performance can vary a lot, and it's better to measure than to guess!-)
精彩评论