View Javadoc

1   package net.obsearch.index.pyramid;
2   
3   import hep.aida.bin.QuantileBin1D;
4   import hep.aida.bin.StaticBin1D;
5   
6   import java.io.File;
7   import java.io.IOException;
8   import java.nio.ByteBuffer;
9   import java.util.Arrays;
10  import java.util.Iterator;
11  
12  import net.obsearch.OB;
13  import net.obsearch.exception.AlreadyFrozenException;
14  import net.obsearch.exception.IllegalIdException;
15  import net.obsearch.exception.NotFrozenException;
16  import net.obsearch.exception.OBException;
17  import net.obsearch.exception.OBStorageException;
18  import net.obsearch.exception.OutOfRangeException;
19  import net.obsearch.index.pivot.AbstractPivotOBIndex;
20  import net.obsearch.pivots.IncrementalPivotSelector;
21  import net.obsearch.storage.CloseIterator;
22  import net.obsearch.storage.OBStore;
23  import net.obsearch.storage.TupleBytes;
24  
25  
26  import net.obsearch.storage.OBStorageConfig;
27  import net.obsearch.storage.OBStoreDouble;
28  import net.obsearch.storage.OBStoreFactory;
29  import net.obsearch.storage.OBStoreLong;
30  import net.obsearch.storage.TupleDouble;
31  import net.obsearch.storage.TupleLong;
32  import org.apache.log4j.Logger;
33  
34  import cern.colt.list.DoubleArrayList;
35  import cern.colt.list.DoubleArrayList;
36  
37  import com.sleepycat.bind.tuple.IntegerBinding;
38  import com.sleepycat.bind.tuple.TupleInput;
39  import com.sleepycat.je.Cursor;
40  import com.sleepycat.je.Database;
41  import com.sleepycat.je.DatabaseConfig;
42  import com.sleepycat.je.DatabaseEntry;
43  import com.sleepycat.je.DatabaseException;
44  import com.sleepycat.je.LockMode;
45  import com.sleepycat.je.OperationStatus;
46  import com.sleepycat.je.PreloadConfig;
47  import com.thoughtworks.xstream.annotations.XStreamAlias;
48  
49  /*
50   OBSearch: a distributed similarity search engine
51   This project is to similarity search what 'bit-torrent' is to downloads.
52   Copyright (C)  2007 Arnoldo Jose Muller Molina
53  
54   This program is free software; you can redistribute it and/or modify
55   it under the terms of the GNU General Public License as published by
56   the Free Software Foundation; either version 2 of the License, or
57   (at your option) any later version.
58  
59   This program is distributed in the hope that it will be useful,
60   but WITHOUT ANY WARRANTY; without even the implied warranty of
61   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
62   GNU General Public License for more details.
63  
64   You should have received a copy of the GNU General Public License along
65   with this program; if not, write to the Free Software Foundation, Inc.,
66   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
67   */
68  /**
69   * This Index uses the extended pyramid technique and SMAP to store arbitrary
70   * objects.
71   * @param <O>
72   *                The type of object to be stored in the Index.
73   * @author Arnoldo Jose Muller Molina
74   * @since 0.7
75   */
76  @XStreamAlias("ExtendedPyramidIndex")
77  public abstract class AbstractExtendedPyramidIndex < O extends OB >
78          extends AbstractPivotOBIndex < O > {
79  
80      /**
81       * Used to access the first item of a two item array.
82       */
83      protected static final int MIN = 0;
84  
85      /**
86       * Used to access the second item of a two item array.
87       */
88      protected static final int MAX = 1;
89  
90      /**
91       * Used to access the first item of a two item array. (Used for queries)
92       */
93      protected static final int HLOW = 0;
94  
95      /**
96       * Used to access the second item of a two item array. (Used for queries)
97       */
98      protected static final int HHIGH = 1;
99  
100     /**
101      * Logger.
102      */
103     // private static final transient Logger logger = Logger
104     // .getLogger(AbstractExtendedPyramidIndex.class);
105     private static Logger logger = Logger
106             .getLogger(AbstractExtendedPyramidIndex.class.getSimpleName());
107 
108     /**
109      * Median points of the extended pyramid technique.
110      */
111     private double[] mp;
112 
113     /**
114      * Pyramid values
115      */
116     protected OBStoreDouble C;
117 
118     /**
119      * Pyramid values
120      */
121     protected OBStoreLong B;
122 
123     /**
124      * Constructs an extended pyramid index.
125      */
126     public AbstractExtendedPyramidIndex(Class < O > type,
127             IncrementalPivotSelector < O > pivotSelector, int pivotCount)
128             throws OBStorageException, OBException {
129         super(type, pivotSelector, pivotCount); // initializes the databases
130         mp = new double[super.getPivotCount()];
131     }
132 
133     public void init(OBStoreFactory fact) throws OBStorageException,
134             OBException, NotFrozenException, IllegalAccessException,
135             InstantiationException, OBException {
136         super.init(fact);
137         OBStorageConfig conf = new OBStorageConfig();
138 		conf.setTemp(false);
139 		conf.setDuplicates(true);
140 		conf.setBulkMode(! isFrozen());
141         this.C = fact.createOBStoreDouble("C", conf);
142         if (!this.isFrozen()) {
143         	conf = new OBStorageConfig();
144     		conf.setTemp(true);
145     		conf.setDuplicates(false);
146     		conf.setBulkMode(false);
147             this.B = fact.createOBStoreLong("B", conf);
148         }
149 
150     }
151 
152     @Override
153     public void close() throws OBException {
154         C.close();
155         B.close();
156         super.close();
157         
158     }
159 
160     /**
161      * Calculates the pyramid's median values. We basically have to get the
162      * median from each dimension and use the median to approximate the center
163      * of the data Using Colt's QuantileBin1D to extract the median. It allows
164      * an approximate but memory friendly processing! :)
165      * @throws DatabaseException
166      *                 If a database error occurs.
167      * @throws IllegalAccessException
168      *                 If there is a problem when instantiating objects O
169      * @throws InstantiationException
170      *                 If there is a problem when instantiating objects O
171      * @throws OutOfRangeException
172      *                 If the distance of any object to any other object exceeds
173      *                 the range defined by the user.
174      * @throws OBException
175      *                 User generated exception
176      */
177     protected void calculateIndexParameters() throws IllegalAccessException,
178             InstantiationException, OutOfRangeException, OBException {
179 
180         // each median for each dimension will be stored in this array
181         QuantileBin1D[] medianHolder = createMedianHolders(B.size());
182         
183         StaticBin1D qt = new StaticBin1D();
184 
185 
186         CloseIterator < TupleLong > it = this.B.processAll();
187         try {
188             while (it.hasNext()) {
189                 TupleLong t = it.next();
190                 double [] vect = extractTuple(t.getValue());
191                 updateMedianHolder(vect, medianHolder);
192                 calculateDistances(vect, qt);
193             }
194         } finally {
195             it.closeCursor();
196         }
197         logger.info("Separation: " + qt.mean() + "std:" + qt.standardDeviation() + " min " + qt.min());
198         int i = 0;
199         while (i < mp.length) {
200             double median = medianHolder[i].median();
201             assert median >= 0 && median <= 1;
202             mp[i] = median;
203             i++;
204         }
205         logger.debug("Median calculation finished");
206     }
207     
208     private void calculateDistances(double[] vector, StaticBin1D medianHolder){
209         //Arrays.sort(vector);
210         
211         int i = 0;
212         while(i < vector.length-1){
213             medianHolder.add(Math.abs(vector[i] - vector[i+1]));
214             i++;
215         }
216     }
217 
218     /**
219      * Extracts the tuple values from in and leaves the values as doubles. The
220      * resulting values must be normalized.
221      * @param in
222      *                Extracts a tuple from the given byte stream.
223      * @throws OutOfRangeException
224      *                 If the distance of any object to any other object exceeds
225      *                 the range defined by the user.
226      * @return A double tuple (the input of the pyramid technique)
227      */
228     protected abstract double[] extractTuple(ByteBuffer in)
229             throws OutOfRangeException;
230     
231     /**
232      * Converts an object into its projection.
233      * @param object projection of the vector.
234      * @return A projection of object.
235      */
236     protected abstract ByteBuffer objectToProjectionBytes(O object) throws OBException;
237 
238     @Override
239     public void freeze() throws IOException, AlreadyFrozenException,
240             IllegalIdException, IllegalAccessException, InstantiationException,
241             OBStorageException, OutOfRangeException, OBException {
242         super.freeze();
243         // now process each object and put it in B.
244         CloseIterator < TupleLong > it = A.processAll();
245         try {
246             while (it.hasNext()) {
247                 TupleLong tup = it.next();
248                 O obj = bytesToObject(tup.getValue());
249                 ByteBuffer pivots = objectToProjectionBytes(obj);
250                 assert pivots != null : "Implement projection serialization";
251                 B.put(tup.getKey(), pivots);
252             }
253         } finally {
254             it.closeCursor();
255         }
256 
257         calculateIndexParameters();
258 
259         //B.deleteAll();
260 
261         it = A.processAll();
262 
263         // now we have to insert all the data again.
264         try {
265             while (it.hasNext()) {
266                 TupleLong tup = it.next();
267                 O obj = bytesToObject(tup.getValue());
268                 insertAux(tup.getKey(), obj);
269             }
270         } finally {
271             it.closeCursor();
272         }
273         // done.
274     }
275 
276     /**
277      * Updates each median holder with the given double tuple.
278      * @param tuple
279      *                TupleInput of doubles
280      * @param medianHolder
281      *                an array of QuantileBin1D objects used to find an
282      *                approximate median.
283      */
284     protected final void updateMedianHolder(final double[] tuple,
285             QuantileBin1D[] medianHolder) {
286         int i = 0;
287         assert tuple.length == medianHolder.length;
288         assert medianHolder.length == super.getPivotCount();
289         while (i < medianHolder.length) {
290             medianHolder[i].add(tuple[i]);
291             i++;
292         }
293     }
294 
295     protected final void updateDoubleHolder(final double[] tuple,
296             DoubleArrayList[] medianHolder) {
297         int i = 0;
298         assert tuple.length == medianHolder.length;
299         assert medianHolder.length == getPivotCount();
300         while (i < medianHolder.length) {
301             medianHolder[i].add(tuple[i]);
302             i++;
303         }
304     }
305 
306     /**
307      * Creates pivotsCount QuantileBin1D objects that will be used to calculate
308      * the medians of the data.
309      * @param size
310      *                the size of the data to be processed
311      * @return an array of QuantileBin1D objects
312      */
313     protected final QuantileBin1D[] createMedianHolders(final long size) {
314         QuantileBin1D[] res = new QuantileBin1D[getPivotCount()];
315         int i = 0;
316         while (i < res.length) {
317             // TODO: move these parameters to a centralized
318             // configuration file.
319             res[i] = new QuantileBin1D(true, size, 0.00001, 0.00001, 10000,
320                     new cern.jet.random.engine.DRand(new java.util.Date()),
321                     true, true, 2);
322             i++;
323         }
324         return res;
325     }
326 
327     protected final DoubleArrayList[] createDoubleHolders(int size) {
328         DoubleArrayList[] res = new DoubleArrayList[getPivotCount()];
329         int i = 0;
330         while (i < res.length) {
331             // TODO: move these parameters to a centralized
332             // configuration file.
333             res[i] = new DoubleArrayList(size);
334             i++;
335         }
336         return res;
337     }
338 
339     /**
340      * Normalizes a value in the given dimension. The value must have been
341      * converted into a double [0,1] before using this method
342      * @param norm
343      *                Normalizes the value in the given dimension
344      * @param i
345      *                Dimension to use.
346      * @return Normalized version of the value.
347      */
348     protected final double extendedPyramidNormalization(final double norm,
349             final int i) {
350         return (double) Math.pow(norm, -1.d / (Math.log(mp[i]) / Math.log(2)));
351     }
352 
353     /**
354      * For the given point and the pyramid number we return the height of that
355      * point.
356      * @param tuple
357      *                tuple to be processed
358      * @param pyramidNumber
359      *                which pyramid number will be processed.
360      * @return height of the point
361      */
362     protected final double heightOfPoint(final double[] tuple,
363             final int pyramidNumber) {
364         double res = Math.abs(0.5 - tuple[pyramidNumber % getPivotCount()]);
365         assert res >= 0 && res <= 0.5;
366         return res;
367     }
368 
369     /**
370      * Returns the pyramid value for the given tuple.
371      * @param Normalized
372      *                tuple (first pass)
373      * @return The pyramid value for the given tuple.
374      */
375     public final double pyramidValue(final double[] tuple) {
376         int pyramid = pyramidOfPoint(tuple);
377         assert pyramid >= 0 && pyramid < getPivotCount() * 2 : " Pyramid value:"
378                 + pyramid;
379         return pyramid + heightOfPoint(tuple, pyramid);
380     }
381 
382     /**
383      * Calculates the pyramid # for the given point.
384      * @param tuple
385      *                Normalized tuple (first pass)
386      * @return The pyramid # for the given tuple
387      */
388     public final int pyramidOfPoint(final double[] tuple) {
389         int jmax = pyramidOfPointAux(tuple);
390         if (tuple[jmax] < 0.5) {
391             return jmax;
392         } else {
393             return jmax + getPivotCount();
394         }
395     }
396 
397     /**
398      * For the given tuple, returns the pyramid # for the tuple.
399      * @param tuple
400      *                Normalized tuple (first pass)
401      * @return pyramid # for the given tuple
402      */
403     protected final int pyramidOfPointAux(final double[] tuple) {
404         int j = 0;
405         assert getPivotCount() == tuple.length;
406         while (j < getPivotCount()) {
407             int k = 0;
408             boolean failed = false;
409             while (k < getPivotCount() && !failed) {
410                 if (k == j) { // can't process k == j
411                     k++;
412                     continue;
413                 }
414                 if (!(Math.abs(0.5 - tuple[j]) >= Math.abs(0.5 - tuple[k]))) {
415                     // if this property is not fullfilled we have to finish the
416                     // loop
417                     // we also failed for this j, need to try the next j
418                     failed = true;
419                 }
420                 k++;
421             }
422             if (!failed) {
423                 // we did not fail, let's return j_max
424                 // we should always exit the method here
425                 return j;
426             }
427             j++;
428         }
429         assert false : " Catastrophic Failure "; // we shouldn't have reached
430         return Integer.MIN_VALUE;
431     }
432 
433     /**
434      * Returns true if the given query (min[] and max[]) intersects pyramid p.
435      * It also modifies lowHighResult so that we can perform the range
436      * accordingly.
437      * @param q
438      *                A query rectangle.
439      * @param p
440      *                The pyramid number
441      * @param lowHighResult
442      *                Where the lowHighResult will be stored.
443      * @return True if rectangle q intersects pyramid p
444      */
445     protected final boolean intersect(final double[][] q, final int p,
446             double[] minArray, double[] lowHighResult) {
447         // strategy: as soon as we find something is false, we stop processing
448         // otherwise we return true! :)
449         int i = p;
450         assert i < this.getPivotCount() * 2;
451         int j = 0;
452         if (i < this.getPivotCount()) { // the case where i < d
453             double minimum = q[i][MIN];
454             while (j < q.length) {
455                 if (j != i) {
456                     if (!(minimum <= -minArray[j])) {
457                         return false;
458                     }
459                 }
460                 j++;
461             }
462             assert j == getPivotCount();
463             if (q[i][MAX] > 0) {
464                 q[i][MAX] = 0;
465             }
466         } else { // i >= d
467             i = i - this.getPivotCount();
468             double maximum = q[i][MAX];
469             while (j < q.length) { // this is the first definition!!!
470                 if (j != i) {
471                     if (!(maximum >= minArray[j])) {
472                         return false;
473                     }
474                 }
475                 j++;
476             }
477             assert j == getPivotCount();
478             if (q[i][MIN] < 0) {
479                 q[i][MIN] = 0;
480             }
481         }
482         // if we reach this, is because the pyramid intersects
483         // we have to get the ranges now.
484         determineRanges(i, q, lowHighResult);
485         return true;
486     }
487 
488     /**
489      * Receives a query and returns a new array generated of calculating min()
490      * to each of the elements
491      * @param query
492      * @return The array of the min(qj) for each element of the query
493      */
494     protected final double[] generateMinArray(double[][] query) {
495         int i = 0;
496         double[] res = new double[query.length];
497         while (i < query.length) {
498             res[i] = min(query[i]);
499             i++;
500         }
501         return res;
502     }
503 
504     /**
505      * @return The total number of boxes served by this index.
506      */
507     public long totalBoxes() {
508         return getPivotCount() * 2;
509     }
510 
511     /**
512      * Determines the ranges that have to be searched in the b-tree.
513      * @param p
514      *                Pyramid #
515      * @param q
516      *                the query
517      * @param lowHighResult
518      *                where the high a low will be stored (a two element array)
519      */
520     private final void determineRanges(int p, double[][] q,
521             double[] lowHighResult) {
522 
523         int i = p;
524         assert i < getPivotCount();
525         lowHighResult[HHIGH] = max(q[i]);
526         if (isEasyCase(q)) { // do not use max2 here
527             lowHighResult[HLOW] = 0;
528         } else {
529             int j = 0;
530             double max = 0;
531             while (j < getPivotCount()) {
532                 if (i != j) {
533                     double t = qjmin(q, j, i);
534                     if (t > max) {
535                         max = t;
536                     }
537                 }
538                 j++;
539             }
540             lowHighResult[HLOW] = max;
541         }
542     }
543 
544     /**
545      * Finds qjmin for the given j, and the given i pyramid and the ranges of
546      * the query max and min. Please see the paper on the pyramid technique.
547      * @param q
548      *                Query rectangle
549      * @param j
550      *                j parameter
551      * @param i
552      *                i parameter
553      * @return qjmin for the given q,j,i.
554      */
555     private final double qjmin(final double[][] q, final int j, final int i) {
556         if (min(q[j]) > min(q[i])) {
557             return min(q[j]);
558         } else {
559             return min(q[i]);
560         }
561     }
562 
563     /**
564      * Returns true if the query is an easy case. Please see the paper on the
565      * pyramid technique.
566      * @param q
567      *                A query rectangle
568      * @return True if this query is an "easy case" query.
569      */
570     private final boolean isEasyCase(final double[][] q) {
571         int i = 0;
572         while (i < q.length) {
573             if (!((q[i][MIN] <= 0) && (0 <= q[i][MAX]))) {
574                 return false;
575             }
576             i++;
577         }
578         return true;
579     }
580 
581     /**
582      * Calculates min for the given 2 element array.
583      * @param minMax
584      *                2 element array.
585      * @return min
586      */
587     private final double min(final double[] minMax) {
588         if (minMax[MIN] <= 0 && 0 <= minMax[MAX]) {
589             return 0;
590         } else {
591             return Math.min(Math.abs(minMax[MAX]), Math.abs(minMax[MIN]));
592         }
593     }
594 
595     /**
596      * Calculates max for the given 2 element array.
597      * @param minMax
598      *                2 element array.
599      * @return max
600      */
601     private final double max(final double[] minMax) {
602         return Math.max(Math.abs(minMax[MAX]), Math.abs(minMax[MIN]));
603     }
604 
605     /**
606      * Queries are aligned to the center of the space. Aligns the given query to
607      * the center of the space.
608      * @param q
609      *                query.
610      */
611     protected final void centerQuery(final double[][] q) {
612         int i = 0;
613         while (i < q.length) {
614             q[i][MIN] = q[i][MIN] - 0.5f;
615             q[i][MAX] = q[i][MAX] - 0.5f;
616             i++;
617         }
618     }
619 
620     /**
621      * Copies the contents of query src into dest.
622      * @param src
623      *                source
624      * @param dest
625      *                destination
626      */
627     protected final void copyQuery(final double[][] src, double[][] dest) {
628         int i = 0;
629         while (i < src.length) {
630             dest[i][MIN] = src[i][MIN];
631             dest[i][MAX] = src[i][MAX];
632             i++;
633         }
634     }
635 
636 }