__revision__ = "$Revision: 1.6 $"
__copyright__ = "Copyright (c) 2005 by Simon Pamies"
__doc__ = """
Standalone indexer for the ZODB.

Can be used to index many objects in a database and search for
attributes without loading objects.

Example (``DummyObject`` stands for any object that has an ``id`` and a
``name`` attribute; here ``name`` is assumed to be 'dummy'):

>>> o = DummyObject()
>>> from standaloneindexer import Indexer
>>> from standaloneindexer import MetadataIndex
>>> idx = Indexer()
>>> idx.addIndex(MetadataIndex('name'))
>>> uid = idx.index_object(o)
>>> result = idx.search(name='dummy')
>>> str(result)
'<ResultListItem with 1 items>'
>>> item = result()[0]      # a ResultItem carrying the indexed values
"""

try:
    from ZODB.cPersistence import Persistent
except ImportError:
    from Persistence import Persistent

from BTrees.OOBTree import union as OOSetUnion
from BTrees.OOBTree import difference as OOSetDifference
from BTrees.OOBTree import intersection as OOSetIntersection
from BTrees.OOBTree import OOBTree, OOSet

import exceptions
from types import StringType
import sys
import random


class NotIndexable(exceptions.Exception):
    """Raised when an object cannot be indexed by a MetadataIndex."""


class MetadataIndex(Persistent):
    """Index over one single attribute (the "datum") of objects.

    Two mappings are maintained:

    * ``_data``: attribute value -> list of uids indexed under that value
    * ``_uids``: uid -> attribute value

    Example::

        # objects A, B, C with A.id == 'A', B.id == 'B', C.id == 'C'
        d = MetadataIndex(datum='id')
        d.index_object(A, 'uid-a')
        d.index_object(B, 'uid-b')
        d.index_object(C, 'uid-c')

        # uids of all entries whose ``id`` is either 'A' or 'B'
        d.get_results(['A', 'B'])
    """

    def __init__(self, datum):
        """Create an index for the attribute named ``datum``.

        Raises TypeError if ``datum`` is not a string.
        """
        if not isinstance(datum, str):
            raise TypeError('%s not a valid metadata field!' % str(datum))

        self._datum = datum

        # mapping for {attribute value -> [uid, ...]}
        # it is a list because several objects may have the same
        # content for a specific attribute
        self._data = OOBTree()

        # mapping for {uid -> attribute value}
        self._uids = OOBTree()

        self._length = 0

    def __len__(self):
        """Return the number of uids currently indexed."""
        return self._length

    def _check_object(self, object):
        """Extract the value to index from ``object``.

        The object must have both an ``id`` attribute and the indexed
        attribute. If the indexed attribute is callable, it is called
        without arguments and its result is used; otherwise the raw
        value is returned.

        Raises NotIndexable if a required attribute is missing or the
        callable attribute raises.
        """
        if not (hasattr(object, self._datum) and hasattr(object, 'id')):
            raise NotIndexable(
                'Object %s has not the required fields (id or %s)!'
                % (str(object), self._datum))

        idx = getattr(object, self._datum)
        if callable(idx):
            try:
                idx = idx()
            except Exception:
                raise NotIndexable(
                    'Attribute %s of object %s not safely callable!'
                    % (self._datum, str(object)))

        return idx

    def _vindex(self, uid, idx):
        """Record ``uid`` under the value ``idx`` in both mappings.

        If ``uid`` is already indexed, its previous entry is dropped
        first so that ``_data``, ``_uids`` and ``_length`` stay
        consistent. Returns ``uid``.
        """
        # has_key() is kept for BTrees objects on purpose; it is part
        # of the BTrees mapping API.
        if self._uids.has_key(uid):
            self.unindex_uid(uid)

        if self._data.has_key(idx):
            # re-assign so the BTree registers the modification
            self._data[idx] = self._data[idx] + [uid]
        else:
            self._data[idx] = [uid]

        self._uids[uid] = idx
        self._length += 1
        self._p_changed = 1

        return uid

    def index_object(self, object, uid):
        """Index ``object`` under ``uid`` and return ``uid``.

        Objects that want to be indexable need at least the ``id``
        field and the attribute this index is responsible for.
        Raises NotIndexable otherwise (see ``_check_object``).
        """
        # getting the attribute's value
        idx = self._check_object(object)

        # creating mapping
        return self._vindex(uid, idx)

    def _virtual_index(self, uid):
        """Index ``uid`` with the value None.

        Used for objects that do not have the indexed attribute, so
        that every uid is present in every index.
        """
        return self._vindex(uid, None)

    def unindex_uid(self, uid):
        """Remove the entry for ``uid`` (as returned by index_object).

        Raises KeyError if ``uid`` is not indexed here.
        """
        if not self._uids.has_key(uid):
            raise KeyError('%s does not exist in this Index!' % (uid,))

        idx = self._uids[uid]
        remaining = [item for item in self._data[idx] if item != uid]
        if remaining:
            self._data[idx] = remaining
        else:
            # no uid left under this value: drop the key entirely
            del self._data[idx]

        del self._uids[uid]
        self._length -= 1
        self._p_changed = 1

    def unindex_object(self, object):
        """Unindex the entry that was created for ``object``.

        The entry is located through the object's current attribute
        value. Because only the value (not the uid) is known here, the
        call succeeds only when exactly one uid is stored under that
        value.

        Raises NotIndexable if the value cannot be extracted and
        KeyError if the value is not indexed or is shared by several
        uids (use ``unindex_uid`` in that case).
        """
        idx = self._check_object(object)
        if not self._data.has_key(idx):
            raise KeyError(
                'Object %s does not exists in this Index!' % str(object))

        uids = list(self._data[idx])
        if len(uids) != 1:
            raise KeyError(
                'Object %s can not be unindexed by value: %d entries '
                'share it, use unindex_uid instead!'
                % (str(object), len(uids)))

        self.unindex_uid(uids[0])

    def get_results(self, input):
        """Return an OOSet of uids whose indexed value matches ``input``.

        ``input`` may be a single value or a list of values (the match
        is then "any of these"). None, an empty list and any other
        empty sized value (e.g. '') mean "no restriction": all indexed
        uids are returned. Values without entries are ignored.
        """
        result = OOSet()

        if not isinstance(input, list):
            try:
                if len(input) == 0:
                    input = []
                else:
                    input = [input]
            except TypeError:
                # unsized value (number, None, ...): None stays None
                # and is handled as "empty" below
                if input is not None:
                    input = [input]

        # returning all results if the argument is empty
        if not input:
            result.update(self._uids.keys())
            return result

        for item in input:
            try:
                uids = self._data[item]
            except (KeyError, TypeError):
                # nothing indexed under this value
                continue
            for uid in uids:
                result.insert(uid)

        return result


class IndexContainer(Persistent):
    """Collects many MetadataIndex objects at one place.

    Indexes are stored in a plain dict keyed by the attribute name
    they index, so every mutation sets ``_p_changed`` explicitly.

    @see: MetadataIndex
    """

    def __init__(self):
        self._indexes = {}

        # adding default index. all objects must have an id...
        self.addIndex(MetadataIndex('id'))

    def hasIndexFor(self, field_name):
        """Return whether an index for ``field_name`` is registered."""
        return field_name in self._indexes

    def addIndex(self, index):
        """Register ``index`` under the attribute name it indexes.

        An existing index for the same attribute is replaced.
        """
        key = getattr(index, '_datum')
        self._indexes[key] = index
        self._p_changed = 1

    def removeIndex(self, field_name):
        """Remove the index for ``field_name``.

        Raises KeyError if there is no such index.
        """
        try:
            del self._indexes[field_name]
        except KeyError:
            raise KeyError('Index %s not found in here :(' % (field_name,))
        # _indexes is a plain dict, so the change must be flagged
        self._p_changed = 1

    def clear(self):
        """Remove *ALL* indexes from this container."""
        self._indexes = {}
        self._p_changed = 1

    def getIndex(self, field_name):
        """Return the index for ``field_name``.

        Raises KeyError if there is no such index.
        """
        try:
            return self._indexes[field_name]
        except KeyError:
            raise KeyError('Index for %s not found!' % (field_name,))


class ResultItem(Persistent):
    """One search hit.

    Indexer.search() fills the instance with one attribute per index
    (holding the indexed value) plus ``_ticle_catalog_uid``.
    """

    def __init__(self):
        self._result_item = 1

    def getCatalogUID(self):
        """Return the uid this object is indexed with in the catalog."""
        return self._ticle_catalog_uid

    def __call__(self):
        """Wrapper around getCatalogUID()."""
        return self.getCatalogUID()

    def __str__(self):
        # ``id`` is only present if an 'id' index contributed to the hit
        return '<ResultItem %s>' % (getattr(self, 'id', None),)


class ResultListItem(Persistent):
    """Result of a search: a mapping of uid -> ResultItem."""

    def __init__(self, id):
        self.id = id
        self._result_list = OOBTree()
        self._length = 0
        self._p_changed = 1

    def addObject(self, other, uid):
        """Store ``other`` under ``uid``, replacing a previous entry.

        Returns the class of this result list.
        """
        if not self._result_list.has_key(uid):
            self._length += 1

        self._result_list[uid] = other
        self._p_changed = 1

        return self.__class__

    def __call__(self, *args, **kw):
        """Return the stored result objects as given by the underlying
        OOBTree's values() (not a plain list, see results()).
        """
        return self._result_list.values()

    def lazy_results(self):
        """Same as calling the result list."""
        return self.__call__()

    def results(self):
        """Return the complete result list as a normal list."""
        return list(self._result_list.values())

    def __str__(self):
        return '<ResultListItem with %d items>' % len(self)

    def __len__(self):
        """Return the number of distinct uids in this result."""
        return self._length


# NOTE(review): IndexContainer already derives from Persistent; listing
# Persistent first is rejected by the C3 MRO of plain new-style classes.
# Presumably this relies on an ExtensionClass based Persistent - confirm
# before changing the bases.
class Indexer(Persistent, IndexContainer):
    """A little bit like the ZCatalog but more lightweight.

    We do not need that pluggable functionality and other special
    filtering stuff...
    """

    # we say that zeros are not handled as None conditions...
    __empty_types__ = [type(0), type(0.0)]

    def __init__(self, id='std-indexer', servicehub=None):
        self.id = id
        self.servicehub = servicehub
        self._v_index = 0

        IndexContainer.__init__(self)

    def _ext_search(self, kws, logical_or=0):
        """Combine the results of all indexes named in ``kws``.

        ``kws`` maps attribute names to the value(s) to search for.
        Per-index results are intersected (AND), or united if
        ``logical_or`` is true. Returns an OOSet of uids.
        """
        old = OOSet()

        for index in self._indexes.values():
            if index._datum not in kws:
                continue

            rs = index.get_results(kws[index._datum])

            if len(old) == 0:
                # nothing collected so far
                if len(rs) == 0 and not logical_or:
                    # AND with an empty set can never match
                    return OOSet()
                old = OOSetUnion(rs, old)
            elif logical_or:
                old = OOSetUnion(rs, old)
            else:
                # take only these that are equal
                old = OOSetIntersection(rs, old)
                if len(old) == 0:
                    return OOSet()

        return old

    def search(self, inverted=0, logical_or=0, include_empty=0, **kw):
        """Main method for dealing with data indexed here.

        The search terms (aka fields) are passed as keyword arguments
        and are concatenated with AND. Terms for which no index exists
        are ignored. Without any usable term all indexed data is
        returned.

        The param ``inverted`` can be used to say: 'Give me all indexed
        data NOT matching that query'.

        If ``logical_or`` is set, the search terms are concatenated
        with OR, not with AND.

        If ``include_empty`` is true, empty terms (e.g. id=None) are
        passed on to the index instead of being dropped. The default
        behaviour is to ignore such parameters (ints and floats are
        never considered empty).
        NOTE(review): MetadataIndex.get_results treats None and empty
        values as "match everything", so such a term does not narrow
        the result to objects whose value is None - confirm intent.

        Returns a ResultListItem.
        """
        # Here we eliminate all entries not indexed herein
        kws = {}
        for key, value in kw.items():
            if not self.hasIndexFor(key):
                continue
            # only adding if there is really something we wanna
            # search for (or the caller asked for empty terms too)
            if (include_empty or value
                    or type(value) in Indexer.__empty_types__):
                kws[key] = value

        kwfilled = 1
        if not kws:
            # Here we indicate that we wanna have ALL indexed data.
            # If indexes get an empty value they return all data
            # in there...
            kwfilled = 0
            for index in self._indexes.values():
                kws[index._datum] = None

        # It can happen, that we do not have that volatile variable
        # here - so we need to create a new one when searching.
        try:
            self._v_index += 1
        except AttributeError:
            self._v_index = 1

        # generate some list object with a random id
        results = ResultListItem(
            '%s-%s' % (str(random.random()), str(self._v_index)))

        rs = self._ext_search(kws, logical_or)

        if inverted and kwfilled:
            # getting ALL indexed data and do a difference match
            # on that
            k = {}
            for index in self._indexes.values():
                k[index._datum] = None

            buddy = self._ext_search(k)
            rs = OOSetDifference(buddy, rs)

        # now we have a set of UIDs and want to create the result
        # list with them: every index contributes its value for a uid
        # to the same ResultItem.
        for index in self._indexes.values():
            for uid in rs:
                if not results._result_list.has_key(uid):
                    ob = ResultItem()
                else:
                    ob = results._result_list[uid]

                ob.__dict__[index._datum] = index._uids[uid]

                # adding catalog uid, so that it can be retrieved
                # easily later
                ob._ticle_catalog_uid = uid

                # adding or readding object to the result list
                results.addObject(ob, uid)

        return results

    def index_object(self, object):
        """Index ``object`` in all registered indexes.

        A new random uid is generated and returned; it is needed to
        unindex the object later.
        """
        uid = str(random.random())

        for name, index in self._indexes.items():
            if hasattr(object, name):
                index.index_object(object, uid)
            else:
                # indexing virtually the metadata so that all
                # objects contain all metadata
                index._virtual_index(uid)

        return uid

    def unindex_object(self, indexed_uid):
        """Remove ``indexed_uid`` from all registered indexes.

        Raises KeyError if an index does not know the uid.
        """
        for index in self._indexes.values():
            index.unindex_uid(indexed_uid)