I wrote a simple web site crawler using a thread pool. The problem is: when the crawler has visited the whole site it should finish, but in reality it waits for something at the end and the script never exits. Why does this happen?
from Queue import Queue
from threading import Thread
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
visited = set()
queue = Queue()
class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            print "startcall in thread", self
            print args
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            print "stopcall in thread", self
            self.tasks.task_done()
class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()
def process(pool, host, url):
    """Fetch one page and enqueue same-host links that have not been visited yet."""
    try:
        print "get url", url
        #content = urlopen(url).read().decode(charset)
        content = urlopen(url).read()
    except UnicodeDecodeError:
        return

    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        #print "link", link
        try:
            href = link['href']
        except KeyError:
            continue
        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue
        if href not in visited:
            visited.add(href)
            pool.add_task(process, pool, host, href)
            print href
def start(host, charset):
    pool = ThreadPool(7)
    pool.add_task(process, pool, host, 'http://%s/' % (host))
    pool.wait_completion()

start('simplesite.com', 'utf8')
The problem I see is that you never exit the while loop in run, so the worker threads block on the queue forever. You need to break out of that loop once all the jobs are done.
You could try to:

1) insert

if not func: break

after self.tasks.get(...) in run;

2) append

pool.add_task(None, None, None)

at the end of process.

This is a way for process to notify the pool that it has no more tasks to process, as sketched below.
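Putting the two changes together, run would look something like this. This is only a minimal sketch of the sentinel ("poison pill") idea, assuming the Worker/ThreadPool classes above; note the sentinel itself still has to be acknowledged with task_done(), otherwise tasks.join() in wait_completion will never return:

def run(self):
    while True:
        func, args, kargs = self.tasks.get()
        if func is None:
            # sentinel received: acknowledge it so join() can return,
            # then let this worker thread exit
            self.tasks.task_done()
            break
        try:
            func(*args, **kargs)
        except Exception, e:
            print e
        self.tasks.task_done()

Since each sentinel stops only one worker, shutting down the whole pool takes one sentinel per thread, e.g. for _ in range(7): pool.add_task(None, None, None).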