I wrote a simple web site crawler using a thread pool. The problem is: when the crawler has visited the whole site it should finish, but in reality it waits for something at the end and the script never exits. Why does this happen?
from Queue import Queue
from threading import Thread
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
visited = set()
queue = Queue()
class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            print "startcall in thread", self
            print args
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            print "stopcall in thread", self
            self.tasks.task_done()
class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()
def process(pool, host, url):
    """Fetch one page and enqueue same-host links that have not been visited yet."""
    try:
        print "get url", url
        #content = urlopen(url).read().decode(charset)
        content = urlopen(url).read()
    except UnicodeDecodeError:
        return

    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        #print "link", link
        try:
            href = link['href']
        except KeyError:
            continue
        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue
        if href not in visited:
            visited.add(href)
            pool.add_task(process, pool, host, href)
            print href
def start(host, charset):
    pool = ThreadPool(7)
    pool.add_task(process, pool, host, 'http://%s/' % (host))
    pool.wait_completion()

start('simplesite.com', 'utf8')
The problem I see is that you never exit the while loop in run, so the worker threads block on the queue forever. You need to break out of that loop once all the jobs are done.
You could try to:

1) insert

if not func: break

after self.tasks.get(...) in run;

2) append

pool.add_task(None, None, None)

at the end of process.

This is a way for process to notify the pool that it has no more tasks to process, as sketched below.
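Putting the two changes together, run would look something like this. This is only a minimal sketch of the sentinel ("poison pill") idea, assuming the Worker/ThreadPool classes above; note the sentinel itself still has to be acknowledged with task_done(), otherwise tasks.join() in wait_completion will never return:

def run(self):
    while True:
        func, args, kargs = self.tasks.get()
        if func is None:
            # sentinel received: acknowledge it so join() can return,
            # then let this worker thread exit
            self.tasks.task_done()
            break
        try:
            func(*args, **kargs)
        except Exception, e:
            print e
        self.tasks.task_done()

Since each sentinel stops only one worker, shutting down the whole pool takes one sentinel per thread, e.g. for _ in range(7): pool.add_task(None, None, None).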