View Javadoc

1   package org.ajmm.obsearch.index;
2   
3   import hep.aida.bin.QuantileBin1D;
4   
5   import java.io.File;
6   import java.io.IOException;
7   
8   import org.ajmm.obsearch.OB;
9   import org.ajmm.obsearch.exception.OBException;
10  import org.ajmm.obsearch.exception.OutOfRangeException;
11  import org.apache.log4j.Logger;
12  
13  import cern.colt.list.FloatArrayList;
14  
15  import com.sleepycat.bind.tuple.IntegerBinding;
16  import com.sleepycat.bind.tuple.TupleInput;
17  import com.sleepycat.je.Cursor;
18  import com.sleepycat.je.Database;
19  import com.sleepycat.je.DatabaseConfig;
20  import com.sleepycat.je.DatabaseEntry;
21  import com.sleepycat.je.DatabaseException;
22  import com.sleepycat.je.LockMode;
23  import com.sleepycat.je.OperationStatus;
24  import com.sleepycat.je.PreloadConfig;
25  import com.thoughtworks.xstream.annotations.XStreamAlias;
26  
27  /*
28   OBSearch: a distributed similarity search engine
29   This project is to similarity search what 'bit-torrent' is to downloads.
30   Copyright (C)  2007 Arnoldo Jose Muller Molina
31  
32   This program is free software; you can redistribute it and/or modify
33   it under the terms of the GNU General Public License as published by
34   the Free Software Foundation; either version 2 of the License, or
35   (at your option) any later version.
36  
37   This program is distributed in the hope that it will be useful,
38   but WITHOUT ANY WARRANTY; without even the implied warranty of
39   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
40   GNU General Public License for more details.
41  
42   You should have received a copy of the GNU General Public License along
43   with this program; if not, write to the Free Software Foundation, Inc.,
44   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
45   */
46  /**
47   * This Index uses the extended pyramid technique and SMAP to store arbitrary
48   * objects.
49   * @param <O>
50   *            The type of object to be stored in the Index.
51   * @author Arnoldo Jose Muller Molina
52   * @since 0.7
53   */
54  @XStreamAlias("ExtendedPyramidIndex")
55  public abstract class AbstractExtendedPyramidIndex < O extends OB >
56          extends AbstractPivotIndex < O > {
57  
58      /**
59       * Used to access the first item of a two item array.
60       */
61      protected static final int MIN = 0;
62  
63      /**
64       * Used to access the second item of a two item array.
65       */
66      protected static final int MAX = 1;
67  
68      /**
69       * Used to access the first item of a two item array. (Used for queries)
70       */
71      protected static final int HLOW = 0;
72  
73      /**
74       * Used to access the second item of a two item array. (Used for queries)
75       */
76      protected static final int HHIGH = 1;
77  
78      /**
79       * Logger.
80       */
81      private static final transient Logger logger = Logger
82              .getLogger(AbstractExtendedPyramidIndex.class);
83  
84      /**
85       * Median points of the extended pyramid technique.
86       */
87      protected float[] mp;
88  
89      /**
90       * The database where the pyramid values are stored.
91       */
92      protected transient Database cDB;
93  
94      /**
95       * Constructs an extended pyramid index.
96       * @param databaseDirectory
97       *            the database directory
98       * @param pivots
99       *            how many pivots will be used
100      * @param pivotSelector
101      *                The pivot selector that will be used by this index.
102      * @param type The class of the object O that will be used.                 
103      * @throws DatabaseException
104      *             If a database error occurs.
105      * @throws IOException
106      *             If a serialization issue occurs.
107      */
108     public AbstractExtendedPyramidIndex(final File databaseDirectory,
109             final short pivots, PivotSelector < O > pivotSelector, Class<O> type) throws DatabaseException, IOException {
110         super(databaseDirectory, pivots, pivotSelector,type); // initializes the databases
111         mp = new float[super.pivotsCount];
112     }
113 
114     /**
115      * This method will be called by the super class. Initializes the C
116      * database(s).
117      * @throws DatabaseException
118      *             If a database error occurs.
119      */
120     //TODO: insert the keys in C in order to increase performance.
121     //           I read somewhere that Berkeley DB works better if you do so.
122     @Override
123     protected final void initC() throws DatabaseException {
124         DatabaseConfig dbConfig = createDefaultDatabaseConfig();
125         
126         dbConfig.setSortedDuplicates(true);
127         dbConfig.setTransactional(false);
128         cDB = databaseEnvironment.openDatabase(null, "C", dbConfig);
129                 
130         //PreloadConfig pc = new PreloadConfig();
131         //pc.setLoadLNs(true);
132         // TODO: remove this comment?
133        //cDB.preload(pc);
134     }
135 
136     /**
137      * Calculates the pyramid's median values. We basically have to get the
138      * median from each dimension and use the median to approximate the center
139      * of the data Using Colt's QuantileBin1D to extract the median. It allows
140      * an approximate but memory friendly processing! :)
141      * @throws DatabaseException
142      *             If a database error occurs.
143      * @throws IllegalAccessException
144      *             If there is a problem when instantiating objects O
145      * @throws InstantiationException
146      *             If there is a problem when instantiating objects O
147      * @throws OutOfRangeException
148      *             If the distance of any object to any other object exceeds the
149      *             range defined by the user.
150      * @throws OBException
151      *             User generated exception
152      */
153     @Override
154     protected void calculateIndexParameters() throws DatabaseException,
155             IllegalAccessException, InstantiationException,
156             OutOfRangeException, OBException {
157         long count = super.bDB.count();
158         // each median for each dimension will be stored in this array
159         QuantileBin1D[] medianHolder = createMedianHolders(count);
160         Cursor cursor = null;
161         DatabaseEntry foundKey = new DatabaseEntry();
162         DatabaseEntry foundData = new DatabaseEntry();
163 
164         try {
165             int i = 0;
166             cursor = bDB.openCursor(null, null);
167             while (cursor.getNext(foundKey, foundData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
168                 assert i == IntegerBinding.entryToInt(foundKey);
169 
170                 TupleInput in = new TupleInput(foundData.getData());
171                 updateMedianHolder(extractTuple(in), medianHolder);
172                 i++;
173             }
174             assert i == count; // pivot count and read # of pivots
175             // should be the same
176         } finally {
177             cursor.close();
178         }
179 
180         int i = 0;
181         while (i < mp.length) {
182             double median = medianHolder[i].median();
183             assert median >= 0 && median <= 1;
184             mp[i] = (float) median;
185             i++;
186         }
187         logger.debug("Median calculation finished");
188     }
189 
190     /**
191      * Extracts the tuple values from in and returns a normalized vector with
192      * values ranging from 1 to 0. Note that this is first level normalization.
193      * The extended pyramid technique algorithm performs another normalization
194      * on top of this one.
195      * @param in
196      *            Extracts a tuple from the given byte stream.
197      * @throws OutOfRangeException
198      *             If the distance of any object to any other object exceeds the
199      *             range defined by the user.
200      * @return A float tuple (the input of the pyramid technique)
201      */
202     protected abstract float[] extractTuple(TupleInput in)
203             throws OutOfRangeException;
204 
205     /**
206      * Updates each median holder with the given float tuple.
207      * @param tuple
208      *            TupleInput of floats
209      * @param medianHolder
210      *            an array of QuantileBin1D objects used to find an approximate
211      *            median.
212      */
213     protected final void updateMedianHolder(final float[] tuple,
214             QuantileBin1D[] medianHolder) {
215         int i = 0;
216         assert tuple.length == medianHolder.length;
217         assert medianHolder.length == pivotsCount;
218         while (i < medianHolder.length) {
219             medianHolder[i].add(tuple[i]);
220             i++;
221         }
222     }
223     
224     protected final void updateFloatHolder(final float[] tuple,
225             FloatArrayList[] medianHolder) {
226         int i = 0;
227         assert tuple.length == medianHolder.length;
228         assert medianHolder.length == pivotsCount;
229         while (i < medianHolder.length) {
230             medianHolder[i].add(tuple[i]);
231             i++;
232         }
233     }
234 
235     /**
236      * Creates pivotsCount QuantileBin1D objects that will be used to calculate
237      * the medians of the data.
238      * @param size
239      *            the size of the data to be processed
240      * @return an array of QuantileBin1D objects
241      */
242     protected final QuantileBin1D[] createMedianHolders(final long size) {
243         QuantileBin1D[] res = new QuantileBin1D[pivotsCount];
244         int i = 0;
245         while (i < res.length) {
246             // TODO: move these parameters to a centralized
247             // configuration file.
248             res[i] = new QuantileBin1D(true, size, 0.00001, 0.00001, 10000,
249                     new cern.jet.random.engine.DRand(new java.util.Date()),
250                     true, true, 2);
251             i++;
252         }
253         return res;
254     }
255     
256     protected final FloatArrayList[] createFloatHolders(int size) {
257         FloatArrayList[] res = new FloatArrayList[pivotsCount];
258         int i = 0;
259         while (i < res.length) {
260             // TODO: move these parameters to a centralized
261             // configuration file.
262             res[i] = new FloatArrayList(size);
263             i++;
264         }
265         return res;
266     }
267 
268     /**
269      * Normalizes a value in the given dimension. The value must have been
270      * converted into a float [0,1] before using this method
271      * @param norm
272      *            Normalizes the value in the given dimension
273      * @param i
274      *            Dimension to use.
275      * @return Normalized version of the value.
276      */
277     protected final float extendedPyramidNormalization(final float norm,
278             final int i) {
279         return (float) Math.pow(norm, -1.d / (Math.log(mp[i]) / Math.log(2)));
280     }
281 
282     /**
283      * For the given point and the pyramid number we return the height of that
284      * point.
285      * @param tuple
286      *            tuple to be processed
287      * @param pyramidNumber
288      *            which pyramid number will be processed.
289      * @return height of the point
290      */
291     protected final float heightOfPoint(final float[] tuple,
292             final int pyramidNumber) {
293         float res = (float) Math.abs(0.5 - tuple[pyramidNumber % pivotsCount]);
294         assert res >= 0 && res <= 0.5;
295         return res;
296     }
297 
298     /**
299      * Returns the pyramid value for the given tuple.
300      * @param Normalized
301      *            tuple (first pass)
302      * @return The pyramid value for the given tuple.
303      */
304     public final float pyramidValue(final float[] tuple) {
305         int pyramid = pyramidOfPoint(tuple);
306         assert pyramid >= 0 && pyramid < pivotsCount * 2 : " Pyramid value:"
307                 + pyramid;
308         return pyramid + heightOfPoint(tuple, pyramid);
309     }
310 
311     /**
312      * Calculates the pyramid # for the given point.
313      * @param tuple
314      *            Normalized tuple (first pass)
315      * @return The pyramid # for the given tuple
316      */
317     public final int pyramidOfPoint(final float[] tuple) {
318         int jmax = pyramidOfPointAux(tuple);
319         if (tuple[jmax] < 0.5) {
320             return jmax;
321         } else {
322             return jmax + pivotsCount;
323         }
324     }
325 
326     /**
327      * For the given tuple, returns the pyramid # for the tuple.
328      * @param tuple
329      *            Normalized tuple (first pass)
330      * @return pyramid # for the given tuple
331      */
332     protected final int pyramidOfPointAux(final float[] tuple) {
333         int j = 0;
334         assert pivotsCount == tuple.length;
335         while (j < pivotsCount) {
336             int k = 0;
337             boolean failed = false;
338             while (k < pivotsCount && !failed) {
339                 if (k == j) { // can't process k == j
340                     k++;
341                     continue;
342                 }
343                 if (!(Math.abs(0.5 - tuple[j]) >= Math.abs(0.5 - tuple[k]))) {
344                     // if this property is not fullfilled we have to finish the
345                     // loop
346                     // we also failed for this j, need to try the next j
347                     failed = true;
348                 }
349                 k++;
350             }
351             if (!failed) {
352                 // we did not fail, let's return j_max
353                 // we should always exit the method here
354                 return j;
355             }
356             j++;
357         }
358         assert false : " Catastrophic Failure "; // we shouldn't have reached
359         return Integer.MIN_VALUE;
360     }
361 
362     /**
363      * Returns true if the given query (min[] and max[]) intersects pyramid p.
364      * It also modifies lowHighResult so that we can perform the range
365      * accordingly.
366      * @param q
367      *            A query rectangle.
368      * @param p
369      *            The pyramid number
370      * @param lowHighResult
371      *            Where the lowHighResult will be stored.
372      * @return True if rectangle q intersects pyramid p
373      */
374     protected final boolean intersect(final float[][] q, final int p,
375             float[] minArray, float[] lowHighResult) {
376         // strategy: as soon as we find something is false, we stop processing
377         // otherwise we return true! :)
378         int i = p;
379         assert i < this.pivotsCount * 2;
380         int j = 0;
381         if (i < this.pivotsCount) { // the case where i < d
382             float minimum = q[i][MIN];
383             while (j < q.length) {
384                 if (j != i) {
385                     if (!(minimum <= - minArray[j])) {
386                         return false;
387                     }
388                 }
389                 j++;
390             }
391             assert j == pivotsCount;
392             if (q[i][MAX] > 0) {
393                 q[i][MAX] = 0;
394             }
395         } else { // i >= d
396             i = i - this.pivotsCount;
397             float maximum = q[i][MAX] ;
398             while (j < q.length) { // this is the first definition!!!                
399                 if (j != i) {
400                     if (! (maximum >= minArray[j])) {
401                         return false;
402                     }
403                 }
404                 j++;
405             }
406             assert j == pivotsCount;
407             if (q[i][MIN] < 0) {
408                 q[i][MIN] = 0;
409             }
410         }
411         // if we reach this, is because the pyramid intersects
412         // we have to get the ranges now.
413         determineRanges(i, q, lowHighResult);
414         return true;
415     }
416     
417     /**
418      * Receives a query and returns a new array generated of calculating min() to 
419      * each of the elements
420      * @param query
421      * @return The array of the min(qj) for each element of the query
422      */
423     protected final float[] generateMinArray(float[][] query){
424         int i  = 0;
425         float [] res = new float[query.length];
426         while(i < query.length){
427             res[i] = min(query[i]);
428             i++;
429         }
430         return res;
431     }
432 
433     /**
434      * @return The total number of boxes served by this index.
435      */
436     public int totalBoxes() {
437         return pivotsCount * 2;
438     }
439 
440     /**
441      * Determines the ranges that have to be searched in the b-tree.
442      * @param p
443      *            Pyramid #
444      * @param q
445      *            the query
446      * @param lowHighResult
447      *            where the high a low will be stored (a two element array)
448      */
449     private final void determineRanges(int p, float[][] q, float[] lowHighResult) {
450 
451         int i = p;
452         assert i < pivotsCount;
453         lowHighResult[HHIGH] = max(q[i]);         
454         if (isEasyCase(q)) { // do not use max2 here
455             lowHighResult[HLOW] = 0;
456         } else {   
457             int j = 0;
458             float max = 0;
459             while (j < pivotsCount) {
460                 if (i != j) {
461                     float t = qjmin(q, j, i);
462                     if (t > max) {
463                         max = t;
464                     }
465                 }
466                 j++;
467             }
468             lowHighResult[HLOW] = max;
469         }
470     }
471     
472 
473     /*
474     private final void determineRanges2 (int p, float[][] q, float[] lowHighResult){
475         int j = 0;
476         int i = p;
477         float min = Float.MAX_VALUE;
478         while (j < pivotsCount) {
479             if (i != j) {
480                 float t = qbarjmin(q, j, i);
481                 if (t < min) {
482                     min = t;
483                 }
484             }
485             j++;
486         }
487         lowHighResult[HLOW] = min;
488     }
489         
490     private final float qbarjmin(float[][] q, int j, int i){
491         if(max(q[j]) >= min(q[i])){
492             return Math.max(max(q[i]), min(q[j]));
493         }else{
494             return min(q[i]);
495         }
496     }
497 */
498     /**
499      * Finds qjmin for the given j, and the given i pyramid and the ranges of
500      * the query max and min. Please see the paper on the pyramid technique.
501      * @param q
502      *            Query rectangle
503      * @param j
504      *            j parameter
505      * @param i
506      *            i parameter
507      * @return qjmin for the given q,j,i.
508      */
509     private final float qjmin(final float[][] q, final int j, final int i) {
510         if (min(q[j]) > min(q[i])) {
511             return min(q[j]);
512         } else {
513             return min(q[i]);
514         }
515     }
516 
517     /**
518      * Returns true if the query is an easy case. Please see the paper on the
519      * pyramid technique.
520      * @param q
521      *            A query rectangle
522      * @return True if this query is an "easy case" query.
523      */
524     private final boolean isEasyCase(final float[][] q) {
525         int i = 0;
526         while (i < q.length) {
527             if (!((q[i][MIN] <= 0) && (0 <= q[i][MAX]))) {
528                 return false;
529             }
530             i++;
531         }
532         return true;
533     }
534 
535     /**
536      * Calculates min for the given 2 element array.
537      * @param minMax
538      *            2 element array.
539      * @return min
540      */
541     private final float min(final float[] minMax) {
542         if (minMax[MIN] <= 0 && 0 <= minMax[MAX]) {
543             return 0;
544         } else {
545             return Math.min(Math.abs(minMax[MAX]), Math.abs(minMax[MIN]));
546         }
547     }
548 
549     /**
550      * Calculates max for the given 2 element array.
551      * @param minMax
552      *            2 element array.
553      * @return max
554      */
555     private final float max(final float[] minMax) {
556         return Math.max(Math.abs(minMax[MAX]), Math.abs(minMax[MIN]));
557     }
558 
559     /**
560      * Queries are aligned to the center of the space. Aligns the given query to
561      * the center of the space.
562      * @param q
563      *            query.
564      */
565     protected final void centerQuery(final float[][] q) {
566         int i = 0;
567         while (i < q.length) {
568             q[i][MIN] = q[i][MIN] - 0.5f;
569             q[i][MAX] = q[i][MAX] - 0.5f;
570             i++;
571         }
572     }
573 
574     /**
575      * Closes database C.
576      * @throws DatabaseException
577      *             If something goes wrong with the DB
578      */
579     @Override
580     protected final void closeC() throws DatabaseException {
581         this.cDB.close();
582     }
583 
584     /**
585      * Copies the contents of query src into dest.
586      * @param src
587      *            source
588      * @param dest
589      *            destination
590      */
591     protected final void copyQuery(final float[][] src, float[][] dest) {
592         int i = 0;
593         while (i < src.length) {
594             dest[i][MIN] = src[i][MIN];
595             dest[i][MAX] = src[i][MAX];
596             i++;
597         }
598     }
599 
600 }