| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848 | /*********************************************************************** * Software License Agreement (BSD License) * * Copyright 2008-2011  Marius Muja (mariusm@cs.ubc.ca). All rights reserved. * Copyright 2008-2011  David G. Lowe (lowe@cs.ubc.ca). All rights reserved. * * THE BSD LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in the *    documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************************/#ifndef OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_#define OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_#include <algorithm>#include <map>#include <cassert>#include <limits>#include <cmath>#include "general.h"#include "nn_index.h"#include "dist.h"#include "matrix.h"#include "result_set.h"#include "heap.h"#include "allocator.h"#include "random.h"#include "saving.h"namespace cvflann{struct HierarchicalClusteringIndexParams : public IndexParams{    HierarchicalClusteringIndexParams(int branching = 32,                                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM,                                      int trees = 4, int leaf_size = 100)    {        (*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL;        // The branching factor used in the hierarchical clustering        (*this)["branching"] = branching;        // Algorithm used for picking the initial cluster centers        (*this)["centers_init"] = centers_init;        // number of parallel trees to build        (*this)["trees"] = trees;        // maximum leaf size        (*this)["leaf_size"] = leaf_size;    }};/** * Hierarchical index * * Contains a tree constructed through a hierarchical clustering * and other information for indexing a set of points for nearest-neighbour matching. */template <typename Distance>class HierarchicalClusteringIndex : public NNIndex<Distance>{public:    typedef typename Distance::ElementType ElementType;    typedef typename Distance::ResultType DistanceType;private:    typedef void (HierarchicalClusteringIndex::* centersAlgFunction)(int, int*, int, int*, int&);    /**     * The function used for choosing the cluster centers.     */    centersAlgFunction chooseCenters;    /**     * Chooses the initial centers in the k-means clustering in a random manner.     *     * Params:     *     k = number of centers     *     vecs = the dataset of points     *     indices = indices in the dataset     *     indices_length = length of indices vector     *     */    void chooseCentersRandom(int k, int* dsindices, int indices_length, int* centers, int& centers_length)    {        UniqueRandom r(indices_length);        int index;        for (index=0; index<k; ++index) {            bool duplicate = true;            int rnd;            while (duplicate) {                duplicate = false;                rnd = r.next();                if (rnd<0) {                    centers_length = index;                    return;                }                centers[index] = dsindices[rnd];                for (int j=0; j<index; ++j) {                    DistanceType sq = distance(dataset[centers[index]], dataset[centers[j]], dataset.cols);                    if (sq<1e-16) {                        duplicate = true;                    }                }            }        }        centers_length = index;    }    /**     * Chooses the initial centers in the k-means using Gonzales' algorithm     * so that the centers are spaced apart from each other.     *     * Params:     *     k = number of centers     *     vecs = the dataset of points     *     indices = indices in the dataset     * Returns:     */    void chooseCentersGonzales(int k, int* dsindices, int indices_length, int* centers, int& centers_length)    {        int n = indices_length;        int rnd = rand_int(n);        assert(rnd >=0 && rnd < n);        centers[0] = dsindices[rnd];        int index;        for (index=1; index<k; ++index) {            int best_index = -1;            DistanceType best_val = 0;            for (int j=0; j<n; ++j) {                DistanceType dist = distance(dataset[centers[0]],dataset[dsindices[j]],dataset.cols);                for (int i=1; i<index; ++i) {                    DistanceType tmp_dist = distance(dataset[centers[i]],dataset[dsindices[j]],dataset.cols);                    if (tmp_dist<dist) {                        dist = tmp_dist;                    }                }                if (dist>best_val) {                    best_val = dist;                    best_index = j;                }            }            if (best_index!=-1) {                centers[index] = dsindices[best_index];            }            else {                break;            }        }        centers_length = index;    }    /**     * Chooses the initial centers in the k-means using the algorithm     * proposed in the KMeans++ paper:     * Arthur, David; Vassilvitskii, Sergei - k-means++: The Advantages of Careful Seeding     *     * Implementation of this function was converted from the one provided in Arthur's code.     *     * Params:     *     k = number of centers     *     vecs = the dataset of points     *     indices = indices in the dataset     * Returns:     */    void chooseCentersKMeanspp(int k, int* dsindices, int indices_length, int* centers, int& centers_length)    {        int n = indices_length;        double currentPot = 0;        DistanceType* closestDistSq = new DistanceType[n];        // Choose one random center and set the closestDistSq values        int index = rand_int(n);        assert(index >=0 && index < n);        centers[0] = dsindices[index];        // Computing distance^2 will have the advantage of even higher probability further to pick new centers        // far from previous centers (and this complies to "k-means++: the advantages of careful seeding" article)        for (int i = 0; i < n; i++) {            closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);            closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] );            currentPot += closestDistSq[i];        }        const int numLocalTries = 1;        // Choose each center        int centerCount;        for (centerCount = 1; centerCount < k; centerCount++) {            // Repeat several trials            double bestNewPot = -1;            int bestNewIndex = 0;            for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {                // Choose our center - have to be slightly careful to return a valid answer even accounting                // for possible rounding errors                double randVal = rand_double(currentPot);                for (index = 0; index < n-1; index++) {                    if (randVal <= closestDistSq[index]) break;                    else randVal -= closestDistSq[index];                }                // Compute the new potential                double newPot = 0;                for (int i = 0; i < n; i++) {                    DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);                    newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );                }                // Store the best result                if ((bestNewPot < 0)||(newPot < bestNewPot)) {                    bestNewPot = newPot;                    bestNewIndex = index;                }            }            // Add the appropriate center            centers[centerCount] = dsindices[bestNewIndex];            currentPot = bestNewPot;            for (int i = 0; i < n; i++) {                DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols);                closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );            }        }        centers_length = centerCount;        delete[] closestDistSq;    }    /**     * Chooses the initial centers in a way inspired by Gonzales (by Pierre-Emmanuel Viel):     * select the first point of the list as a candidate, then parse the points list. If another     * point is further than current candidate from the other centers, test if it is a good center     * of a local aggregation. If it is, replace current candidate by this point. And so on...     *     * Used with KMeansIndex that computes centers coordinates by averaging positions of clusters points,     * this doesn't make a real difference with previous methods. But used with HierarchicalClusteringIndex     * class that pick centers among existing points instead of computing the barycenters, there is a real     * improvement.     *     * Params:     *     k = number of centers     *     vecs = the dataset of points     *     indices = indices in the dataset     * Returns:     */    void GroupWiseCenterChooser(int k, int* dsindices, int indices_length, int* centers, int& centers_length)    {        const float kSpeedUpFactor = 1.3f;        int n = indices_length;        DistanceType* closestDistSq = new DistanceType[n];        // Choose one random center and set the closestDistSq values        int index = rand_int(n);        assert(index >=0 && index < n);        centers[0] = dsindices[index];        for (int i = 0; i < n; i++) {            closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);        }        // Choose each center        int centerCount;        for (centerCount = 1; centerCount < k; centerCount++) {            // Repeat several trials            double bestNewPot = -1;            int bestNewIndex = 0;            DistanceType furthest = 0;            for (index = 0; index < n; index++) {                // We will test only the potential of the points further than current candidate                if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) {                    // Compute the new potential                    double newPot = 0;                    for (int i = 0; i < n; i++) {                        newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols)                                            , closestDistSq[i] );                    }                    // Store the best result                    if ((bestNewPot < 0)||(newPot <= bestNewPot)) {                        bestNewPot = newPot;                        bestNewIndex = index;                        furthest = closestDistSq[index];                    }                }            }            // Add the appropriate center            centers[centerCount] = dsindices[bestNewIndex];            for (int i = 0; i < n; i++) {                closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols)                                             , closestDistSq[i] );            }        }        centers_length = centerCount;        delete[] closestDistSq;    }public:    /**     * Index constructor     *     * Params:     *          inputData = dataset with the input features     *          params = parameters passed to the hierarchical k-means algorithm     */    HierarchicalClusteringIndex(const Matrix<ElementType>& inputData, const IndexParams& index_params = HierarchicalClusteringIndexParams(),                                Distance d = Distance())        : dataset(inputData), params(index_params), root(NULL), indices(NULL), distance(d)    {        memoryCounter = 0;        size_ = dataset.rows;        veclen_ = dataset.cols;        branching_ = get_param(params,"branching",32);        centers_init_ = get_param(params,"centers_init", FLANN_CENTERS_RANDOM);        trees_ = get_param(params,"trees",4);        leaf_size_ = get_param(params,"leaf_size",100);        if (centers_init_==FLANN_CENTERS_RANDOM) {            chooseCenters = &HierarchicalClusteringIndex::chooseCentersRandom;        }        else if (centers_init_==FLANN_CENTERS_GONZALES) {            chooseCenters = &HierarchicalClusteringIndex::chooseCentersGonzales;        }        else if (centers_init_==FLANN_CENTERS_KMEANSPP) {            chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;        }        else if (centers_init_==FLANN_CENTERS_GROUPWISE) {            chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser;        }        else {            throw FLANNException("Unknown algorithm for choosing initial centers.");        }        trees_ = get_param(params,"trees",4);        root = new NodePtr[trees_];        indices = new int*[trees_];        for (int i=0; i<trees_; ++i) {            root[i] = NULL;            indices[i] = NULL;        }    }    HierarchicalClusteringIndex(const HierarchicalClusteringIndex&);    HierarchicalClusteringIndex& operator=(const HierarchicalClusteringIndex&);    /**     * Index destructor.     *     * Release the memory used by the index.     */    virtual ~HierarchicalClusteringIndex()    {        free_elements();        if (root!=NULL) {            delete[] root;        }        if (indices!=NULL) {            delete[] indices;        }    }    /**     * Release the inner elements of indices[]     */    void free_elements()    {        if (indices!=NULL) {            for(int i=0; i<trees_; ++i) {                if (indices[i]!=NULL) {                    delete[] indices[i];                    indices[i] = NULL;                }            }        }    }    /**     *  Returns size of index.     */    size_t size() const    {        return size_;    }    /**     * Returns the length of an index feature.     */    size_t veclen() const    {        return veclen_;    }    /**     * Computes the inde memory usage     * Returns: memory used by the index     */    int usedMemory() const    {        return pool.usedMemory+pool.wastedMemory+memoryCounter;    }    /**     * Builds the index     */    void buildIndex()    {        if (branching_<2) {            throw FLANNException("Branching factor must be at least 2");        }        free_elements();        for (int i=0; i<trees_; ++i) {            indices[i] = new int[size_];            for (size_t j=0; j<size_; ++j) {                indices[i][j] = (int)j;            }            root[i] = pool.allocate<Node>();            computeClustering(root[i], indices[i], (int)size_, branching_,0);        }    }    flann_algorithm_t getType() const    {        return FLANN_INDEX_HIERARCHICAL;    }    void saveIndex(FILE* stream)    {        save_value(stream, branching_);        save_value(stream, trees_);        save_value(stream, centers_init_);        save_value(stream, leaf_size_);        save_value(stream, memoryCounter);        for (int i=0; i<trees_; ++i) {            save_value(stream, *indices[i], size_);            save_tree(stream, root[i], i);        }    }    void loadIndex(FILE* stream)    {        free_elements();        if (root!=NULL) {            delete[] root;        }        if (indices!=NULL) {            delete[] indices;        }        load_value(stream, branching_);        load_value(stream, trees_);        load_value(stream, centers_init_);        load_value(stream, leaf_size_);        load_value(stream, memoryCounter);        indices = new int*[trees_];        root = new NodePtr[trees_];        for (int i=0; i<trees_; ++i) {            indices[i] = new int[size_];            load_value(stream, *indices[i], size_);            load_tree(stream, root[i], i);        }        params["algorithm"] = getType();        params["branching"] = branching_;        params["trees"] = trees_;        params["centers_init"] = centers_init_;        params["leaf_size"] = leaf_size_;    }    /**     * Find set of nearest neighbors to vec. Their indices are stored inside     * the result object.     *     * Params:     *     result = the result object in which the indices of the nearest-neighbors are stored     *     vec = the vector for which to search the nearest neighbors     *     searchParams = parameters that influence the search algorithm (checks)     */    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)    {        int maxChecks = get_param(searchParams,"checks",32);        // Priority queue storing intermediate branches in the best-bin-first search        Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);        std::vector<bool> checked(size_,false);        int checks = 0;        for (int i=0; i<trees_; ++i) {            findNN(root[i], result, vec, checks, maxChecks, heap, checked);        }        BranchSt branch;        while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {            NodePtr node = branch.node;            findNN(node, result, vec, checks, maxChecks, heap, checked);        }        assert(result.full());        delete heap;    }    IndexParams getParameters() const    {        return params;    }private:    /**     * Struture representing a node in the hierarchical k-means tree.     */    struct Node    {        /**         * The cluster center index         */        int pivot;        /**         * The cluster size (number of points in the cluster)         */        int size;        /**         * Child nodes (only for non-terminal nodes)         */        Node** childs;        /**         * Node points (only for terminal nodes)         */        int* indices;        /**         * Level         */        int level;    };    typedef Node* NodePtr;    /**     * Alias definition for a nicer syntax.     */    typedef BranchStruct<NodePtr, DistanceType> BranchSt;    void save_tree(FILE* stream, NodePtr node, int num)    {        save_value(stream, *node);        if (node->childs==NULL) {            int indices_offset = (int)(node->indices - indices[num]);            save_value(stream, indices_offset);        }        else {            for(int i=0; i<branching_; ++i) {                save_tree(stream, node->childs[i], num);            }        }    }    void load_tree(FILE* stream, NodePtr& node, int num)    {        node = pool.allocate<Node>();        load_value(stream, *node);        if (node->childs==NULL) {            int indices_offset;            load_value(stream, indices_offset);            node->indices = indices[num] + indices_offset;        }        else {            node->childs = pool.allocate<NodePtr>(branching_);            for(int i=0; i<branching_; ++i) {                load_tree(stream, node->childs[i], num);            }        }    }    void computeLabels(int* dsindices, int indices_length,  int* centers, int centers_length, int* labels, DistanceType& cost)    {        cost = 0;        for (int i=0; i<indices_length; ++i) {            ElementType* point = dataset[dsindices[i]];            DistanceType dist = distance(point, dataset[centers[0]], veclen_);            labels[i] = 0;            for (int j=1; j<centers_length; ++j) {                DistanceType new_dist = distance(point, dataset[centers[j]], veclen_);                if (dist>new_dist) {                    labels[i] = j;                    dist = new_dist;                }            }            cost += dist;        }    }    /**     * The method responsible with actually doing the recursive hierarchical     * clustering     *     * Params:     *     node = the node to cluster     *     indices = indices of the points belonging to the current node     *     branching = the branching factor to use in the clustering     *     * TODO: for 1-sized clusters don't store a cluster center (it's the same as the single cluster point)     */    void computeClustering(NodePtr node, int* dsindices, int indices_length, int branching, int level)    {        node->size = indices_length;        node->level = level;        if (indices_length < leaf_size_) { // leaf node            node->indices = dsindices;            std::sort(node->indices,node->indices+indices_length);            node->childs = NULL;            return;        }        std::vector<int> centers(branching);        std::vector<int> labels(indices_length);        int centers_length;        (this->*chooseCenters)(branching, dsindices, indices_length, ¢ers[0], centers_length);        if (centers_length<branching) {            node->indices = dsindices;            std::sort(node->indices,node->indices+indices_length);            node->childs = NULL;            return;        }        //	assign points to clusters        DistanceType cost;        computeLabels(dsindices, indices_length, ¢ers[0], centers_length, &labels[0], cost);        node->childs = pool.allocate<NodePtr>(branching);        int start = 0;        int end = start;        for (int i=0; i<branching; ++i) {            for (int j=0; j<indices_length; ++j) {                if (labels[j]==i) {                    std::swap(dsindices[j],dsindices[end]);                    std::swap(labels[j],labels[end]);                    end++;                }            }            node->childs[i] = pool.allocate<Node>();            node->childs[i]->pivot = centers[i];            node->childs[i]->indices = NULL;            computeClustering(node->childs[i],dsindices+start, end-start, branching, level+1);            start=end;        }    }    /**     * Performs one descent in the hierarchical k-means tree. The branches not     * visited are stored in a priority queue.     *     * Params:     *      node = node to explore     *      result = container for the k-nearest neighbors found     *      vec = query points     *      checks = how many points in the dataset have been checked so far     *      maxChecks = maximum dataset points to checks     */    void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,                Heap<BranchSt>* heap, std::vector<bool>& checked)    {        if (node->childs==NULL) {            if (checks>=maxChecks) {                if (result.full()) return;            }            for (int i=0; i<node->size; ++i) {                int index = node->indices[i];                if (!checked[index]) {                    DistanceType dist = distance(dataset[index], vec, veclen_);                    result.addPoint(dist, index);                    checked[index] = true;                    ++checks;                }            }        }        else {            DistanceType* domain_distances = new DistanceType[branching_];            int best_index = 0;            domain_distances[best_index] = distance(vec, dataset[node->childs[best_index]->pivot], veclen_);            for (int i=1; i<branching_; ++i) {                domain_distances[i] = distance(vec, dataset[node->childs[i]->pivot], veclen_);                if (domain_distances[i]<domain_distances[best_index]) {                    best_index = i;                }            }            for (int i=0; i<branching_; ++i) {                if (i!=best_index) {                    heap->insert(BranchSt(node->childs[i],domain_distances[i]));                }            }            delete[] domain_distances;            findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked);        }    }private:    /**     * The dataset used by this index     */    const Matrix<ElementType> dataset;    /**     * Parameters used by this index     */    IndexParams params;    /**     * Number of features in the dataset.     */    size_t size_;    /**     * Length of each feature.     */    size_t veclen_;    /**     * The root node in the tree.     */    NodePtr* root;    /**     *  Array of indices to vectors in the dataset.     */    int** indices;    /**     * The distance     */    Distance distance;    /**     * Pooled memory allocator.     *     * Using a pooled memory allocator is more efficient     * than allocating memory directly when there is a large     * number small of memory allocations.     */    PooledAllocator pool;    /**     * Memory occupied by the index.     */    int memoryCounter;    /** index parameters */    int branching_;    int trees_;    flann_centers_init_t centers_init_;    int leaf_size_;};}#endif /* OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ */
 |