| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 | /*M///////////////////////////////////////////////////////////////////////////////////////////  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.////  By downloading, copying, installing or using the software you agree to this license.//  If you do not agree to this license, do not download, install,//  copy or use the software.//////                           License Agreement//                For Open Source Computer Vision Library//// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.// Copyright (C) 2009, Willow Garage Inc., all rights reserved.// Third party copyrights are property of their respective owners.//// Redistribution and use in source and binary forms, with or without modification,// are permitted provided that the following conditions are met:////   * Redistribution's of source code must retain the above copyright notice,//     this list of conditions and the following disclaimer.////   * Redistribution's in binary form must reproduce the above copyright notice,//     this list of conditions and the following disclaimer in the documentation//     and/or other materials provided with the distribution.////   * The name of the copyright holders may not be used to endorse or promote products//     derived from this software without specific prior written permission.//// This software is provided by the copyright holders and contributors "as is" and// any express or implied warranties, including, but not limited to, the implied// warranties of merchantability and fitness for a particular purpose are disclaimed.// In no event shall the Intel Corporation or contributors be liable for any direct,// indirect, incidental, special, exemplary, or consequential damages// (including, but not limited to, procurement of substitute goods or services;// loss of use, data, or profits; or business interruption) however caused// and on any theory of liability, whether in contract, strict liability,// or tort (including negligence or otherwise) arising in any way out of// the use of this software, even if advised of the possibility of such damage.////M*/#ifndef OPENCV_CUDA_SCAN_HPP#define OPENCV_CUDA_SCAN_HPP#include "opencv2/core/cuda/common.hpp"#include "opencv2/core/cuda/utility.hpp"#include "opencv2/core/cuda/warp.hpp"#include "opencv2/core/cuda/warp_shuffle.hpp"/** @file * @deprecated Use @ref cudev instead. *///! @cond IGNOREDnamespace cv { namespace cuda { namespace device{    enum ScanKind { EXCLUSIVE = 0,  INCLUSIVE = 1 };    template <ScanKind Kind, typename T, typename F> struct WarpScan    {        __device__ __forceinline__ WarpScan() {}        __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }        __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)        {            const unsigned int lane = idx & 31;            F op;            if ( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);            if ( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);            if ( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);            if ( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);            if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);            if( Kind == INCLUSIVE )                return ptr [idx];            else                return (lane > 0) ? ptr [idx - 1] : 0;        }        __device__ __forceinline__ unsigned int index(const unsigned int tid)        {            return tid;        }        __device__ __forceinline__ void init(volatile T *ptr){}        static const int warp_offset      = 0;        typedef WarpScan<INCLUSIVE, T, F>  merge;    };    template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp    {        __device__ __forceinline__ WarpScanNoComp() {}        __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }        __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)        {            const unsigned int lane = threadIdx.x & 31;            F op;            ptr [idx ] = op(ptr [idx -  1], ptr [idx]);            ptr [idx ] = op(ptr [idx -  2], ptr [idx]);            ptr [idx ] = op(ptr [idx -  4], ptr [idx]);            ptr [idx ] = op(ptr [idx -  8], ptr [idx]);            ptr [idx ] = op(ptr [idx - 16], ptr [idx]);            if( Kind == INCLUSIVE )                return ptr [idx];            else                return (lane > 0) ? ptr [idx - 1] : 0;        }        __device__ __forceinline__ unsigned int index(const unsigned int tid)        {            return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);        }        __device__ __forceinline__ void init(volatile T *ptr)        {            ptr[threadIdx.x] = 0;        }        static const int warp_smem_stride = 32 + 16 + 1;        static const int warp_offset      = 16;        static const int warp_log         = 5;        static const int warp_mask        = 31;        typedef WarpScanNoComp<INCLUSIVE, T, F> merge;    };    template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan    {        __device__ __forceinline__ BlockScan() {}        __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }        __device__ __forceinline__ T operator()(volatile T *ptr)        {            const unsigned int tid  = threadIdx.x;            const unsigned int lane = tid & warp_mask;            const unsigned int warp = tid >> warp_log;            Sc scan;            typename Sc::merge merge_scan;            const unsigned int idx = scan.index(tid);            T val = scan(ptr, idx);            __syncthreads ();            if( warp == 0)                scan.init(ptr);            __syncthreads ();            if( lane == 31 )                ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];            __syncthreads ();            if( warp == 0 )                merge_scan(ptr, idx);            __syncthreads();            if ( warp > 0)                val = ptr [scan.warp_offset + warp - 1] + val;            __syncthreads ();            ptr[idx] = val;            __syncthreads ();            return val ;        }        static const int warp_log  = 5;        static const int warp_mask = 31;    };    template <typename T>    __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)    {    #if __CUDA_ARCH__ >= 300        const unsigned int laneId = cv::cuda::device::Warp::laneId();        // scan on shuffl functions        #pragma unroll        for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)        {            const T n = cv::cuda::device::shfl_up(idata, i);            if (laneId >= i)                  idata += n;        }        return idata;    #else        unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));        s_Data[pos] = 0;        pos += OPENCV_CUDA_WARP_SIZE;        s_Data[pos] = idata;        s_Data[pos] += s_Data[pos - 1];        s_Data[pos] += s_Data[pos - 2];        s_Data[pos] += s_Data[pos - 4];        s_Data[pos] += s_Data[pos - 8];        s_Data[pos] += s_Data[pos - 16];        return s_Data[pos];    #endif    }    template <typename T>    __device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)    {        return warpScanInclusive(idata, s_Data, tid) - idata;    }    template <int tiNumScanThreads, typename T>    __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)    {        if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)        {            //Bottom-level inclusive warp scan            T warpResult = warpScanInclusive(idata, s_Data, tid);            //Save top elements of each warp for exclusive warp scan            //sync to wait for warp scans to complete (because s_Data is being overwritten)            __syncthreads();            if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))            {                s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;            }            //wait for warp scans to complete            __syncthreads();            if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )            {                //grab top warp elements                T val = s_Data[tid];                //calculate exclusive scan and write back to shared memory                s_Data[tid] = warpScanExclusive(val, s_Data, tid);            }            //return updated warp scans with exclusive scan results            __syncthreads();            return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];        }        else        {            return warpScanInclusive(idata, s_Data, tid);        }    }}}}//! @endcond#endif // OPENCV_CUDA_SCAN_HPP
 |