I have some way of building a data structure (out of some file contents, say):
def loadfile(FILE):
return # some data structure created from the contents of FILE
So I can do things like
puppies = loadfile("puppies.csv") # wait for loadfile to work
kitties = loadfile("kitties.csv") # wait some more
print len(puppies)
print puppies[32]
In the above example, I wasted a bunch of time actually reading kitties.csv
and creating a data structure that I never used. I'd like to avoid that waste without constantly checking if not kitties
whenever I want to do something. I'd like to be able to do
puppies = lazyload("puppies.csv") # instant
kitties = lazyload("kitties.csv") # instant
print len(puppies) # wait for loadfile
print puppies[32]
So if I don't ever try to do anything with kitties
, loadfile("kitties.csv")
never gets called.
Is there some standard way to do this?
After playing around with it for a bit, I produced the following solution, which appears to work correctly and is quite brief. Are there some alternatives? Are there drawbacks to using this approach that I should keep in mind?
class lazyload:
def __init__(self,FILE):
self.FILE = FILE
self.F = None
def __getattr__(self,name):
if not self.F:
print "loading %s" % self.FILE
self.F = loadfile(self.FILE)
return object.__getattribute__(self.F, name)
What might be even better is if something like this worked:
class lazyload:
def __init__(self,FILE):
self.FILE = FILE
def __getattr__(self,name):
self = loadfile(self.FILE) # this never gets called again
开发者_Go百科 # since self is no longer a
# lazyload instance
return object.__getattribute__(self, name)
But this doesn't work because self
is local. It actually ends up calling loadfile
every time you do anything.
The csv module in the Python stdlibrary will not load the data until you start iterating over it, so it is in fact lazy.
Edit: If you need to read through the whole file to build the datastructure, having a complex Lazy load object that proxies things is overkill. Just do this:
class Lazywrapper(object):
def __init__(self, filename):
self.filename = filename
self._data = None
def get_data(self):
if self._data = None:
self._build_data()
return self._data
def _build_data(self):
# Now open and iterate over the file to build a datastructure, and
# put that datastructure as self._data
With the above class you can do this:
puppies = Lazywrapper("puppies.csv") # Instant
kitties = Lazywrapper("kitties.csv") # Instant
print len(puppies.getdata()) # Wait
print puppies.getdata()[32] # instant
Also
allkitties = kitties.get_data() # wait
print len(allkitties)
print kitties[32]
If you have a lot of data, and you don't really need to load all the data you could also implement something like class that will read the file until it finds the doggie called "Froufrou" and then stop, but at that point it's likely better to stick the data in a database once and for all and access it from there.
If you're really worried about the if statement, you have a Stateful object.
from collections import MutableMapping
class LazyLoad( MutableMapping ):
def __init__( self, source ):
self.source= source
self.process= LoadMe( self )
self.data= None
def __getitem__( self, key ):
self.process= self.process.load()
return self.data[key]
def __setitem__( self, key, value ):
self.process= self.process.load()
self.data[key]= value
def __contains__( self, key ):
self.process= self.process.load()
return key in self.data
This class delegates the work to a process
object which is either a Load
or a
DoneLoading
object. The Load
object will actually load. The DoneLoading
will not load.
Note that there are no if-statements.
class LoadMe( object ):
def __init__( self, parent ):
self.parent= parent
def load( self ):
## Actually load, setting self.parent.data
return DoneLoading( self.parent )
class DoneLoading( object ):
def __init__( self, parent ):
self.parent= parent
def load( self ):
return self
Wouldn't if not self.F
lead to another call to __getattr__
, putting you into an infinite loop? I think your approach makes sense, but to be on the safe side, I'd make that line into:
if name == "F" and not self.F:
Also, you could make loadfile a method on the class, depending on what you're doing.
Here's a solution that uses a class decorator to defer initialisation until the first time an object is used:
def lazyload(cls):
original_init = cls.__init__
original_getattribute = cls.__getattribute__
def newinit(self, *args, **kwargs):
# Just cache the arguments for the eventual initialization.
self._init_args = args
self._init_kwargs = kwargs
self.initialized = False
newinit.__doc__ = original_init.__doc__
def performinit(self):
# We call object's __getattribute__ rather than super(...).__getattribute__
# or original_getattribute so that no custom __getattribute__ implementations
# can interfere with what we are doing.
original_init(self,
*object.__getattribute__(self, "_init_args"),
**object.__getattribute__(self, "_init_kwargs"))
del self._init_args
del self._init_kwargs
self.initialized = True
def newgetattribute(self, name):
if not object.__getattribute__(self, "initialized"):
performinit(self)
return original_getattribute(self, name)
if hasattr(cls, "__getitem__"):
original_getitem = cls.__getitem__
def newgetitem(self, key):
if not object.__getattribute__(self, "initialized"):
performinit(self)
return original_getitem(self, key)
newgetitem.__doc__ = original_getitem.__doc__
cls.__getitem__ = newgetitem
if hasattr(cls, "__len__"):
original_len = cls.__len__
def newlen(self):
if not object.__getattribute__(self, "initialized"):
performinit(self)
return original_len(self)
newlen.__doc__ = original_len.__doc__
cls.__len__ = newlen
cls.__init__ = newinit
cls.__getattribute__ = newgetattribute
return cls
@lazyload
class FileLoader(dict):
def __init__(self, filename):
self.filename = filename
print "Performing expensive load operation"
self[32] = "Felix"
self[33] = "Eeek"
kittens = FileLoader("kitties.csv")
print "kittens is instance of FileLoader: %s" % isinstance(kittens, FileLoader) # Well obviously
print len(kittens) # Wait
print kittens[32] # No wait
print kittens[33] # No wait
print kittens.filename # Still no wait
print kittens.filename
The output:
kittens is instance of FileLoader: True
Performing expensive load operation
2
Felix
Eeek
kitties.csv
kitties.csv
I tried to actually restore the original magic methods after the initialization, but it wasn't working out. It may be necessary to proxy additional magic methods, I didn't investigate every scenario.
Note that kittens.initialized will always return True because it kicks off the initialization if it hasn't already been performed. Obviously it would be possible to add an exemption for this attribute so that it would return False if no other operation had been performed on the object, or the check could be changed to the equivalent of a hasattr call and the initialized attribute could be deleted after the initialization.
Here's a hack that makes the "even better" solution work, but I think it's annoying enough that it's probably better to just use the first solution. The idea is to execute the step self = loadfile(self.FILE)
by passing the the variable name as an attribute:
class lazyload:
def __init__(self,FILE,var):
self.FILE = FILE
self.var = var
def __getattr__(self,name):
x = loadfile(self.FILE)
globals()[self.var]=x
return object.__getattribute__(x, name)
Then you can do
kitties = lazyload("kitties.csv","kitties")
^ ^
\ /
These two better match exactly
After you call any method on kitties
(aside from kitties.FILE
or kitties.var
), it will become completely indistinguishable from what you'd have gotten with kitties = loadfile("kitties.csv")
. In particular, it will no longer be an instance of lazyload
and kitties.FILE
and kitties.var
will no longer exist.
If you need use puppies[32]
you need also define __getitem__
method because __getattr__
don't catch that behaviour.
I implement lazy load for my needs, there is non-adapted code:
class lazy_mask(object):
'''Fake object, which is substituted in
place of masked object'''
def __init__(self, master, id):
self.master=master
self.id=id
self._result=None
self.master.add(self)
def _res(self):
'''Run lazy job'''
if not self._result:
self._result=self.master.get(self.id)
return self._result
def __getattribute__(self, name):
'''proxy all queries to masked object'''
name=name.replace('_lazy_mask', '')
#print 'attr', name
if name in ['_result', '_res', 'master', 'id']:#don't proxy requests for own properties
return super(lazy_mask, self).__getattribute__(name)
else:#but proxy requests for masked object
return self._res().__getattribute__(name)
def __getitem__(self, key):
'''provide object["key"] access. Else can raise
TypeError: 'lazy_mask' object is unsubscriptable'''
return self._res().__getitem__(key)
(master is registry object that load data when i run it's get() method)
This implementation works ok for isinstance() and str() and json.dumps() with it
精彩评论