/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_CORE_CUDA_HPP
#define OPENCV_CORE_CUDA_HPP

#ifndef __cplusplus
# error cuda.hpp header must be compiled as C++
#endif

#include "opencv2/core.hpp"
#include "opencv2/core/cuda_types.hpp"

/**
  @defgroup cuda CUDA-accelerated Computer Vision
  @{
    @defgroup cudacore Core part
    @{
      @defgroup cudacore_init Initialization and Information
      @defgroup cudacore_struct Data Structures
    @}
  @}
*/

namespace cv { namespace cuda {

//! @addtogroup cudacore_struct
//! @{

//===================================================================================
// GpuMat
//===================================================================================

/** @brief Base storage class for GPU memory with reference counting.

Its interface matches the Mat interface with the following limitations:

- no arbitrary dimensions support (only 2D)
- no functions that return references to their data (because references on GPU are not valid for
  CPU)
- no expression templates technique support

Beware that the latter limitation may lead to overloaded matrix operators that cause memory
allocations. The GpuMat class is convertible to cuda::PtrStepSz and cuda::PtrStep so it can be
passed directly to a kernel.

@note In contrast with Mat, in most cases GpuMat::isContinuous() == false. This means that rows are
aligned to a size depending on the hardware. A single-row GpuMat is always continuous.

@note It is not recommended to leave static or global GpuMat variables allocated, that is, to rely
on their destructors. The destruction order of such variables and of the CUDA context is undefined,
and the GPU memory release function returns an error if the CUDA context has already been
destroyed.

@sa Mat
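
A minimal usage sketch (assumes an OpenCV build with CUDA support and at least one device; the
input matrix is illustrative):
@code
cv::Mat host(480, 640, CV_8UC1);
cv::cuda::GpuMat d_src, d_dst;
d_src.upload(host);                        // blocking host -> device copy
d_src.convertTo(d_dst, CV_32F, 1.0 / 255); // runs on the device
cv::Mat result;
d_dst.download(result);                    // blocking device -> host copy
@endcode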
*/
class CV_EXPORTS GpuMat
{
public:
    class CV_EXPORTS Allocator
    {
    public:
        virtual ~Allocator() {}

        // allocator must fill data, step and refcount fields
        virtual bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) = 0;
        virtual void free(GpuMat* mat) = 0;
    };
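
    /* A custom allocator only has to fill the data, step and refcount fields in allocate() and
       release the storage in free(). A minimal sketch using the CUDA runtime API (error handling
       reduced to the return value; assumes <cuda_runtime.h> is available; the class name is
       hypothetical):

           class PitchedAllocator : public cv::cuda::GpuMat::Allocator
           {
           public:
               bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize)
               {
                   // pitched allocation keeps each row aligned for coalesced access
                   if (cudaMallocPitch((void**)&mat->data, &mat->step, elemSize * cols, rows) != cudaSuccess)
                       return false;
                   mat->refcount = (int*)cv::fastMalloc(sizeof(int));
                   return true;
               }

               void free(cv::cuda::GpuMat* mat)
               {
                   cudaFree(mat->datastart);
                   cv::fastFree(mat->refcount);
               }
           };
    */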

    //! default allocator
    static Allocator* defaultAllocator();
    static void setDefaultAllocator(Allocator* allocator);

    //! default constructor
    explicit GpuMat(Allocator* allocator = defaultAllocator());

    //! constructs GpuMat of the specified size and type
    GpuMat(int rows, int cols, int type, Allocator* allocator = defaultAllocator());
    GpuMat(Size size, int type, Allocator* allocator = defaultAllocator());

    //! constructs GpuMat and fills it with the specified value _s
    GpuMat(int rows, int cols, int type, Scalar s, Allocator* allocator = defaultAllocator());
    GpuMat(Size size, int type, Scalar s, Allocator* allocator = defaultAllocator());

    //! copy constructor
    GpuMat(const GpuMat& m);

    //! constructor for GpuMat headers pointing to user-allocated data
    GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
    GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);

    //! creates a GpuMat header for a part of the bigger matrix
    GpuMat(const GpuMat& m, Range rowRange, Range colRange);
    GpuMat(const GpuMat& m, Rect roi);

    //! builds GpuMat from host memory (Blocking call)
    explicit GpuMat(InputArray arr, Allocator* allocator = defaultAllocator());

    //! destructor - calls release()
    ~GpuMat();

    //! assignment operators
    GpuMat& operator =(const GpuMat& m);

    //! allocates new GpuMat data unless the GpuMat already has specified size and type
    void create(int rows, int cols, int type);
    void create(Size size, int type);

    //! decreases reference counter, deallocates the data when reference counter reaches 0
    void release();

    //! swaps with other smart pointer
    void swap(GpuMat& mat);

    //! performs upload of data to GpuMat (Blocking call)
    void upload(InputArray arr);

    //! performs upload of data to GpuMat (Non-Blocking call)
    void upload(InputArray arr, Stream& stream);

    //! performs download of data from device to host memory (Blocking call)
    void download(OutputArray dst) const;

    //! performs download of data from device to host memory (Non-Blocking call)
    void download(OutputArray dst, Stream& stream) const;

    //! returns deep copy of the GpuMat, i.e. the data is copied
    GpuMat clone() const;

    //! copies the GpuMat content to device memory (Blocking call)
    void copyTo(OutputArray dst) const;

    //! copies the GpuMat content to device memory (Non-Blocking call)
    void copyTo(OutputArray dst, Stream& stream) const;

    //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
    void copyTo(OutputArray dst, InputArray mask) const;

    //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
    void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;

    //! sets some of the GpuMat elements to s (Blocking call)
    GpuMat& setTo(Scalar s);

    //! sets some of the GpuMat elements to s (Non-Blocking call)
    GpuMat& setTo(Scalar s, Stream& stream);

    //! sets some of the GpuMat elements to s, according to the mask (Blocking call)
    GpuMat& setTo(Scalar s, InputArray mask);

    //! sets some of the GpuMat elements to s, according to the mask (Non-Blocking call)
    GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);

    //! converts GpuMat to another data type (Blocking call)
    void convertTo(OutputArray dst, int rtype) const;

    //! converts GpuMat to another data type (Non-Blocking call)
    void convertTo(OutputArray dst, int rtype, Stream& stream) const;

    //! converts GpuMat to another data type with scaling (Blocking call)
    void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;

    //! converts GpuMat to another data type with scaling (Non-Blocking call)
    void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;

    //! converts GpuMat to another data type with scaling (Non-Blocking call)
    void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;

    void assignTo(GpuMat& m, int type = -1) const;

    //! returns pointer to y-th row
    uchar* ptr(int y = 0);
    const uchar* ptr(int y = 0) const;

    //! template version of the above method
    template<typename _Tp> _Tp* ptr(int y = 0);
    template<typename _Tp> const _Tp* ptr(int y = 0) const;

    template <typename _Tp> operator PtrStepSz<_Tp>() const;
    template <typename _Tp> operator PtrStep<_Tp>() const;

    //! returns a new GpuMat header for the specified row
    GpuMat row(int y) const;

    //! returns a new GpuMat header for the specified column
    GpuMat col(int x) const;

    //! ... for the specified row span
    GpuMat rowRange(int startrow, int endrow) const;
    GpuMat rowRange(Range r) const;

    //! ... for the specified column span
    GpuMat colRange(int startcol, int endcol) const;
    GpuMat colRange(Range r) const;

    //! extracts a rectangular sub-GpuMat (this is a generalized form of row, rowRange etc.)
    GpuMat operator ()(Range rowRange, Range colRange) const;
    GpuMat operator ()(Rect roi) const;

    //! creates alternative GpuMat header for the same data, with different
    //! number of channels and/or different number of rows
    GpuMat reshape(int cn, int rows = 0) const;

    //! locates GpuMat header within a parent GpuMat
    void locateROI(Size& wholeSize, Point& ofs) const;

    //! moves/resizes the current GpuMat ROI inside the parent GpuMat
    GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);

    //! returns true iff the GpuMat data is continuous
    //! (i.e. when there are no gaps between successive rows)
    bool isContinuous() const;

    //! returns element size in bytes
    size_t elemSize() const;

    //! returns the size of element channel in bytes
    size_t elemSize1() const;

    //! returns element type
    int type() const;

    //! returns element depth
    int depth() const;

    //! returns number of channels
    int channels() const;

    //! returns step/elemSize1()
    size_t step1() const;

    //! returns GpuMat size : width == number of columns, height == number of rows
    Size size() const;

    //! returns true if GpuMat data is NULL
    bool empty() const;

    /*! includes several bit-fields:
    - the magic signature
    - continuity flag
    - depth
    - number of channels
    */
    int flags;

    //! the number of rows and columns
    int rows, cols;

    //! a distance between successive rows in bytes; includes the gap if any
    size_t step;

    //! pointer to the data
    uchar* data;

    //! pointer to the reference counter;
    //! when GpuMat points to user-allocated data, the pointer is NULL
    int* refcount;

    //! helper fields used in locateROI and adjustROI
    uchar* datastart;
    const uchar* dataend;

    //! allocator
    Allocator* allocator;
};

/** @brief Creates a continuous matrix.

@param rows Row count.
@param cols Column count.
@param type Type of the matrix.
@param arr Destination matrix. This parameter changes only if it has a proper type and area (
\f$\texttt{rows} \times \texttt{cols}\f$ ).

Matrix is called continuous if its elements are stored continuously, that is, without gaps at the
end of each row.
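
A sketch:
@code
cv::cuda::GpuMat buf;
cv::cuda::createContinuous(480, 640, CV_8UC1, buf);
CV_Assert(buf.isContinuous()); // no per-row padding
@endcode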
*/
CV_EXPORTS void createContinuous(int rows, int cols, int type, OutputArray arr);

/** @brief Ensures that the size of a matrix is big enough and the matrix has a proper type.

@param rows Minimum desired number of rows.
@param cols Minimum desired number of columns.
@param type Desired matrix type.
@param arr Destination matrix.

The function does not reallocate memory if the matrix has proper attributes already.
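
A sketch (the second call is expected to reuse the first allocation, since the existing buffer is
already big enough):
@code
cv::cuda::GpuMat buf;
cv::cuda::ensureSizeIsEnough(480, 640, CV_8UC1, buf); // allocates
cv::cuda::ensureSizeIsEnough(240, 320, CV_8UC1, buf); // no reallocation
@endcode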
*/
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);

/** @brief BufferPool for use with CUDA streams
 *
 * BufferPool utilizes cuda::Stream's allocator to create new buffers. It is
 * particularly useful when BufferPoolUsage is set to true (see setBufferPoolUsage), or when a
 * custom allocator is specified for the cuda::Stream, and you want to implement your
 * own stream-based functions utilizing the same underlying GPU memory
 * management.
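 *
 * A minimal sketch (setBufferPoolUsage must be called before the first Stream is created):
 * @code
 * cv::cuda::setBufferPoolUsage(true);
 * cv::cuda::Stream stream;
 * cv::cuda::BufferPool pool(stream);
 * cv::cuda::GpuMat d_buf = pool.getBuffer(1024, 1024, CV_8UC1); // served from the stream's pool
 * @endcode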
 */
class CV_EXPORTS BufferPool
{
public:
    //! Gets the BufferPool for the given stream.
    explicit BufferPool(Stream& stream);

    //! Allocates a new GpuMat of given size and type.
    GpuMat getBuffer(int rows, int cols, int type);

    //! Allocates a new GpuMat of given size and type.
    GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }

    //! Returns the allocator associated with the stream.
    Ptr<GpuMat::Allocator> getAllocator() const { return allocator_; }

private:
    Ptr<GpuMat::Allocator> allocator_;
};

//! BufferPool management (must be called before Stream creation)
CV_EXPORTS void setBufferPoolUsage(bool on);
CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);

//===================================================================================
// HostMem
//===================================================================================

/** @brief Class with reference counting wrapping special memory type allocation functions from CUDA.

Its interface is also Mat-like but with additional memory type parameters.

- **PAGE_LOCKED** sets a page locked memory type used commonly for fast and asynchronous
  uploading/downloading data from/to GPU.
- **SHARED** specifies a zero copy memory allocation that enables mapping the host memory to GPU
  address space, if supported.
- **WRITE_COMBINED** sets the write combined buffer that is not cached by CPU. Such buffers are
  used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache
  utilization.

@note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
Pinned Memory APIs* document or *CUDA C Programming Guide*.
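
A sketch of a truly asynchronous upload from pinned host memory (uploading from a plain pageable
cv::Mat would make the copy effectively synchronous):
@code
cv::cuda::HostMem pinned(480, 640, CV_8UC3); // PAGE_LOCKED by default
cv::Mat header = pinned.createMatHeader();   // fill the data through this Mat header
cv::cuda::Stream stream;
cv::cuda::GpuMat d_frame;
d_frame.upload(header, stream);              // non-blocking copy on the stream
stream.waitForCompletion();
@endcode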
*/
class CV_EXPORTS HostMem
{
public:
    enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };

    static MatAllocator* getAllocator(AllocType alloc_type = PAGE_LOCKED);

    explicit HostMem(AllocType alloc_type = PAGE_LOCKED);

    HostMem(const HostMem& m);

    HostMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
    HostMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);

    //! creates from host memory with copying data
    explicit HostMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);

    ~HostMem();

    HostMem& operator =(const HostMem& m);

    //! swaps with other smart pointer
    void swap(HostMem& b);

    //! returns deep copy of the matrix, i.e. the data is copied
    HostMem clone() const;

    //! allocates new matrix data unless the matrix already has specified size and type.
    void create(int rows, int cols, int type);
    void create(Size size, int type);

    //! creates alternative HostMem header for the same data, with different
    //! number of channels and/or different number of rows
    HostMem reshape(int cn, int rows = 0) const;

    //! decrements reference counter and releases memory if needed.
    void release();

    //! returns matrix header with disabled reference counting for HostMem data.
    Mat createMatHeader() const;

    /** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without
    reference counting for it.

    This can be done only if memory was allocated with the SHARED flag and if it is supported by
    the hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which
    eliminates an extra copy.
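
    A sketch (requires DeviceInfo::canMapHostMemory() to be true):
    @code
    cv::cuda::HostMem shared(480, 640, CV_8UC1, cv::cuda::HostMem::SHARED);
    cv::cuda::GpuMat d_view = shared.createGpuMatHeader(); // zero-copy view, no upload needed
    @endcode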
    */
    GpuMat createGpuMatHeader() const;

    // Please see cv::Mat for descriptions
    bool isContinuous() const;
    size_t elemSize() const;
    size_t elemSize1() const;
    int type() const;
    int depth() const;
    int channels() const;
    size_t step1() const;
    Size size() const;
    bool empty() const;

    // Please see cv::Mat for descriptions
    int flags;
    int rows, cols;
    size_t step;

    uchar* data;
    int* refcount;

    uchar* datastart;
    const uchar* dataend;

    AllocType alloc_type;
};

/** @brief Page-locks the memory of a matrix and maps it for the device(s).

@param m Input matrix.
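
A sketch of pinning an existing buffer for faster asynchronous copies:
@code
cv::Mat frame(480, 640, CV_8UC3);
cv::cuda::registerPageLocked(frame);   // pin the existing allocation
// ... perform asynchronous uploads/downloads with this matrix ...
cv::cuda::unregisterPageLocked(frame); // make it pageable again
@endcode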
*/
CV_EXPORTS void registerPageLocked(Mat& m);

/** @brief Unmaps the memory of a matrix and makes it pageable again.

@param m Input matrix.
*/
CV_EXPORTS void unregisterPageLocked(Mat& m);

//===================================================================================
// Stream
//===================================================================================

/** @brief This class encapsulates a queue of asynchronous calls.

@note Currently, you may face problems if an operation is enqueued twice with different data. Some
functions use constant GPU memory, and the next call may update the memory before the previous one
has finished. But calling different operations asynchronously is safe because each operation has
its own constant buffer. Memory copy/upload/download/set operations to the buffers you hold are
also safe.

@note The Stream class is not thread-safe. Please use different Stream objects for different CPU
threads.

@code
void thread1()
{
    cv::cuda::Stream stream1;
    cv::cuda::func1(..., stream1);
}

void thread2()
{
    cv::cuda::Stream stream2;
    cv::cuda::func2(..., stream2);
}
@endcode

@note By default, all CUDA routines are launched in the Stream::Null() object if the stream is not
specified by the user. In a multi-threaded environment, stream objects must be passed explicitly
(see the previous note).
*/
class CV_EXPORTS Stream
{
    typedef void (Stream::*bool_type)() const;
    void this_type_does_not_support_comparisons() const {}

public:
    typedef void (*StreamCallback)(int status, void* userData);

    //! creates a new asynchronous stream
    Stream();

    //! creates a new asynchronous stream with custom allocator
    Stream(const Ptr<GpuMat::Allocator>& allocator);

    /** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
    */
    bool queryIfComplete() const;

    /** @brief Blocks the current CPU thread until all operations in the stream are complete.
    */
    void waitForCompletion();

    /** @brief Makes a compute stream wait on an event.
    */
    void waitEvent(const Event& event);

    /** @brief Adds a callback to be called on the host after all currently enqueued items in the
    stream have completed.

    @note Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization
    that may depend on outstanding device work or other callbacks that are not mandated to run
    earlier. Callbacks without a mandated order (in independent streams) execute in undefined order
    and may be serialized.
    */
    void enqueueHostCallback(StreamCallback callback, void* userData);

    //! return Stream object for default CUDA stream
    static Stream& Null();

    //! returns true if stream object is not default (!= 0)
    operator bool_type() const;

    class Impl;

private:
    Ptr<Impl> impl_;
    Stream(const Ptr<Impl>& impl);

    friend struct StreamAccessor;
    friend class BufferPool;
    friend class DefaultDeviceInitializer;
};
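
/** @brief Wrapper for CUDA events, used to synchronize streams and to time device work.

A sketch of timing asynchronous work between two events (the enqueued operation is illustrative;
elapsedTime reports milliseconds):
@code
cv::cuda::Stream stream;
cv::cuda::Event start, stop;
start.record(stream);
// ... enqueue asynchronous work on the stream here ...
stop.record(stream);
stop.waitForCompletion();
float ms = cv::cuda::Event::elapsedTime(start, stop);
@endcode
*/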
class CV_EXPORTS Event
{
public:
    enum CreateFlags
    {
        DEFAULT        = 0x00,  /**< Default event flag */
        BLOCKING_SYNC  = 0x01,  /**< Event uses blocking synchronization */
        DISABLE_TIMING = 0x02,  /**< Event will not record timing data */
        INTERPROCESS   = 0x04   /**< Event is suitable for interprocess use. DISABLE_TIMING must be set */
    };

    explicit Event(CreateFlags flags = DEFAULT);

    //! records an event
    void record(Stream& stream = Stream::Null());

    //! queries an event's status
    bool queryIfComplete() const;

    //! waits for an event to complete
    void waitForCompletion();

    //! computes the elapsed time between events
    static float elapsedTime(const Event& start, const Event& end);

    class Impl;

private:
    Ptr<Impl> impl_;
    Event(const Ptr<Impl>& impl);

    friend struct EventAccessor;
};

//! @} cudacore_struct

//===================================================================================
// Initialization & Info
//===================================================================================

//! @addtogroup cudacore_init
//! @{

/** @brief Returns the number of installed CUDA-enabled devices.

Use this function before any other CUDA function calls. If OpenCV is compiled without CUDA support,
this function returns 0. If the CUDA driver is not installed, or is incompatible, this function
returns -1.
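
A sketch of guarding GPU code paths:
@code
if (cv::cuda::getCudaEnabledDeviceCount() <= 0)
{
    // 0: built without CUDA support; -1: driver missing or incompatible.
    // Fall back to a CPU implementation here.
}
@endcode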
*/
CV_EXPORTS int getCudaEnabledDeviceCount();

/** @brief Sets a device and initializes it for the current thread.

@param device System index of a CUDA device starting with 0.

If the call of this function is omitted, a default device is initialized at the first CUDA usage.
*/
CV_EXPORTS void setDevice(int device);

/** @brief Returns the current device index set by cuda::setDevice or initialized by default.
*/
CV_EXPORTS int getDevice();

/** @brief Explicitly destroys and cleans up all resources associated with the current device in the
current process.

Any subsequent API call to this device will reinitialize the device.
*/
CV_EXPORTS void resetDevice();

/** @brief Enumeration providing CUDA computing features.
*/
enum FeatureSet
{
    FEATURE_SET_COMPUTE_10 = 10,
    FEATURE_SET_COMPUTE_11 = 11,
    FEATURE_SET_COMPUTE_12 = 12,
    FEATURE_SET_COMPUTE_13 = 13,
    FEATURE_SET_COMPUTE_20 = 20,
    FEATURE_SET_COMPUTE_21 = 21,
    FEATURE_SET_COMPUTE_30 = 30,
    FEATURE_SET_COMPUTE_32 = 32,
    FEATURE_SET_COMPUTE_35 = 35,
    FEATURE_SET_COMPUTE_50 = 50,

    GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
    SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
    NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
    WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
    DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
};

//! checks whether current device supports the given feature
CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
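
/* A sketch of guarding a code path on a device feature:

       if (cv::cuda::deviceSupports(cv::cuda::NATIVE_DOUBLE))
       {
           // safe to run double-precision kernels on the current device
       }
*/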

/** @brief Class providing a set of static methods to check what NVIDIA\* card architecture the CUDA
module was built for.

According to the CUDA C Programming Guide Version 3.2: "PTX code produced for some specific compute
capability can always be compiled to binary code of greater or equal compute capability".
*/
class CV_EXPORTS TargetArchs
{
public:
    /** @brief The following method checks whether the module was built with the support of the
    given feature:

    @param feature_set Features to be checked. See cuda::FeatureSet.
    */
    static bool builtWith(FeatureSet feature_set);

    /** @brief There is a set of methods to check whether the module contains intermediate (PTX) or
    binary CUDA code for the given architecture(s):

    @param major Major compute capability version.
    @param minor Minor compute capability version.
    */
    static bool has(int major, int minor);
    static bool hasPtx(int major, int minor);
    static bool hasBin(int major, int minor);
    static bool hasEqualOrLessPtx(int major, int minor);
    static bool hasEqualOrGreater(int major, int minor);
    static bool hasEqualOrGreaterPtx(int major, int minor);
    static bool hasEqualOrGreaterBin(int major, int minor);
};

/** @brief Class providing functionality for querying the specified GPU properties.
*/
class CV_EXPORTS DeviceInfo
{
public:
    //! creates DeviceInfo object for the current GPU
    DeviceInfo();

    /** @brief The constructors.

    @param device_id System index of the CUDA device starting with 0.

    Constructs the DeviceInfo object for the specified device. If the device_id parameter is
    omitted, it constructs an object for the current device.
    */
    DeviceInfo(int device_id);

    /** @brief Returns system index of the CUDA device starting with 0.
    */
    int deviceID() const;

    //! ASCII string identifying device
    const char* name() const;

    //! global memory available on device in bytes
    size_t totalGlobalMem() const;

    //! shared memory available per block in bytes
    size_t sharedMemPerBlock() const;

    //! 32-bit registers available per block
    int regsPerBlock() const;

    //! warp size in threads
    int warpSize() const;

    //! maximum pitch in bytes allowed by memory copies
    size_t memPitch() const;

    //! maximum number of threads per block
    int maxThreadsPerBlock() const;

    //! maximum size of each dimension of a block
    Vec3i maxThreadsDim() const;

    //! maximum size of each dimension of a grid
    Vec3i maxGridSize() const;

    //! clock frequency in kilohertz
    int clockRate() const;

    //! constant memory available on device in bytes
    size_t totalConstMem() const;

    //! major compute capability
    int majorVersion() const;

    //! minor compute capability
    int minorVersion() const;

    //! alignment requirement for textures
    size_t textureAlignment() const;

    //! pitch alignment requirement for texture references bound to pitched memory
    size_t texturePitchAlignment() const;

    //! number of multiprocessors on device
    int multiProcessorCount() const;

    //! specifies whether there is a run time limit on kernels
    bool kernelExecTimeoutEnabled() const;

    //! device is integrated as opposed to discrete
    bool integrated() const;

    //! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
    bool canMapHostMemory() const;

    enum ComputeMode
    {
        ComputeModeDefault,          /**< default compute mode (Multiple threads can use cudaSetDevice with this device) */
        ComputeModeExclusive,        /**< compute-exclusive-thread mode (Only one thread in one process will be able to use cudaSetDevice with this device) */
        ComputeModeProhibited,       /**< compute-prohibited mode (No threads can use cudaSetDevice with this device) */
        ComputeModeExclusiveProcess  /**< compute-exclusive-process mode (Many threads in one process will be able to use cudaSetDevice with this device) */
    };

    //! compute mode
    ComputeMode computeMode() const;

    //! maximum 1D texture size
    int maxTexture1D() const;

    //! maximum 1D mipmapped texture size
    int maxTexture1DMipmap() const;

    //! maximum size for 1D textures bound to linear memory
    int maxTexture1DLinear() const;

    //! maximum 2D texture dimensions
    Vec2i maxTexture2D() const;

    //! maximum 2D mipmapped texture dimensions
    Vec2i maxTexture2DMipmap() const;

    //! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
    Vec3i maxTexture2DLinear() const;

    //! maximum 2D texture dimensions if texture gather operations have to be performed
    Vec2i maxTexture2DGather() const;

    //! maximum 3D texture dimensions
    Vec3i maxTexture3D() const;

    //! maximum Cubemap texture dimensions
    int maxTextureCubemap() const;

    //! maximum 1D layered texture dimensions
    Vec2i maxTexture1DLayered() const;

    //! maximum 2D layered texture dimensions
    Vec3i maxTexture2DLayered() const;

    //! maximum Cubemap layered texture dimensions
    Vec2i maxTextureCubemapLayered() const;

    //! maximum 1D surface size
    int maxSurface1D() const;

    //! maximum 2D surface dimensions
    Vec2i maxSurface2D() const;

    //! maximum 3D surface dimensions
    Vec3i maxSurface3D() const;

    //! maximum 1D layered surface dimensions
    Vec2i maxSurface1DLayered() const;

    //! maximum 2D layered surface dimensions
    Vec3i maxSurface2DLayered() const;

    //! maximum Cubemap surface dimensions
    int maxSurfaceCubemap() const;

    //! maximum Cubemap layered surface dimensions
    Vec2i maxSurfaceCubemapLayered() const;

    //! alignment requirements for surfaces
    size_t surfaceAlignment() const;

    //! device can possibly execute multiple kernels concurrently
    bool concurrentKernels() const;

    //! device has ECC support enabled
    bool ECCEnabled() const;

    //! PCI bus ID of the device
    int pciBusID() const;

    //! PCI device ID of the device
    int pciDeviceID() const;

    //! PCI domain ID of the device
    int pciDomainID() const;

    //! true if device is a Tesla device using TCC driver, false otherwise
    bool tccDriver() const;

    //! number of asynchronous engines
    int asyncEngineCount() const;

    //! device shares a unified address space with the host
    bool unifiedAddressing() const;

    //! peak memory clock frequency in kilohertz
    int memoryClockRate() const;

    //! global memory bus width in bits
    int memoryBusWidth() const;

    //! size of L2 cache in bytes
    int l2CacheSize() const;

    //! maximum resident threads per multiprocessor
    int maxThreadsPerMultiProcessor() const;

    //! gets free and total device memory
    void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
    size_t freeMemory() const;
    size_t totalMemory() const;

    /** @brief Provides information on CUDA feature support.

    @param feature_set Features to be checked. See cuda::FeatureSet.

    This function returns true if the device has the specified CUDA feature. Otherwise, it returns
    false.
    */
    bool supports(FeatureSet feature_set) const;

    /** @brief Checks the CUDA module and device compatibility.

    This function returns true if the CUDA module can be run on the specified device. Otherwise, it
    returns false.
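
    A sketch of enumerating the devices usable by this OpenCV build:
    @code
    for (int id = 0; id < cv::cuda::getCudaEnabledDeviceCount(); ++id)
    {
        cv::cuda::DeviceInfo info(id);
        if (info.isCompatible())
            std::cout << info.name() << " is compatible with this build" << std::endl;
    }
    @endcode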
    */
    bool isCompatible() const;

private:
    int device_id_;
};

CV_EXPORTS void printCudaDeviceInfo(int device);
CV_EXPORTS void printShortCudaDeviceInfo(int device);

/** @brief Converts an array to half-precision floating point numbers.

@param _src input array.
@param _dst output array.
@param stream Stream for the asynchronous version.

@sa convertFp16
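
A sketch of a round trip, assuming the same CV_32F <-> CV_16S convention as the CPU-side
cv::convertFp16 (half values stored in a 16-bit matrix):
@code
cv::cuda::GpuMat d_src(1, 1000, CV_32F), d_half, d_back;
cv::cuda::convertFp16(d_src, d_half);  // CV_32F -> CV_16S
cv::cuda::convertFp16(d_half, d_back); // CV_16S -> CV_32F
@endcode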
*/
CV_EXPORTS void convertFp16(InputArray _src, OutputArray _dst, Stream& stream = Stream::Null());

//! @} cudacore_init

}} // namespace cv { namespace cuda {

#include "opencv2/core/cuda.inl.hpp"

#endif /* OPENCV_CORE_CUDA_HPP */