Point Cloud Library (PCL) 1.15.1
Loading...
Searching...
No Matches
cutil_inline_runtime.h
1/*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
9 *
10 */
11
12#pragma once
13
14#ifdef _WIN32
15#ifdef _DEBUG // Do this only in debug mode...
16# define WINDOWS_LEAN_AND_MEAN
17# include <windows.h>
18# include <stdlib.h>
19# undef min
20# undef max
21#endif
22#endif
23
24#include <stdio.h>
25#include <string.h>
26#include <stdlib.h>
27
28#include <cufft.h>
29
// Call-site shorthands. Each macro forwards to the matching inline checker and
// fills in __FILE__ / __LINE__ itself, so users never have to pass them. The
// checkers stay ordinary inline functions, which keeps them debuggable.

// CUDA runtime API checks
#define cutilSafeCall(status)             __cudaSafeCall (status, __FILE__, __LINE__)
#define cutilSafeCallNoSync(status)       __cudaSafeCallNoSync(status, __FILE__, __LINE__)
#define cutilSafeThreadSync()             __cudaSafeThreadSync(__FILE__, __LINE__)

// CUFFT checks
#define cufftSafeCall(status)             __cufftSafeCall (status, __FILE__, __LINE__)

// CUTIL checks
#define cutilCheckError(status)           __cutilCheckError (status, __FILE__, __LINE__)
#define cutilCheckMsg(text)               __cutilGetLastError (text, __FILE__, __LINE__)
#define cutilCheckMsgAndSync(text)        __cutilGetLastErrorAndSync (text, __FILE__, __LINE__)
#define cutilCondition(cond)              __cutilCondition (cond, __FILE__, __LINE__)

// Host allocation check and program exit
#define cutilSafeMalloc(allocExpr)        __cutilSafeMalloc ((allocExpr), __FILE__, __LINE__)
#define cutilExit(argCount, argVector)    __cutilExit (argCount, argVector)
42
43inline cudaError cutilDeviceSynchronize()
44{
45 return cudaDeviceSynchronize();
46}
47
48inline cudaError cutilDeviceReset()
49{
50 return cudaDeviceReset();
51}
52
53inline void __cutilCondition(int val, char *file, int line)
54{
55 if( CUTFalse == cutCheckCondition( val, file, line ) ) {
56 exit(EXIT_FAILURE);
57 }
58}
59
60inline void __cutilExit(int argc, char **argv)
61{
62 if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
63 printf("\nPress ENTER to exit...\n");
64 fflush( stdout);
65 fflush( stderr);
66 getchar();
67 }
68 exit(EXIT_SUCCESS);
69}
70
// Minimum / maximum of two expressions.
// Every use of an argument is parenthesised so that operands with lower
// precedence than the comparison (e.g. `a | b`, `c ? x : y`) are evaluated as a
// unit. Each argument is still evaluated twice, so avoid side effects.
// Any earlier definition is dropped first so that this header's definition is
// the one in effect, without a conflicting-redefinition diagnostic.
#ifdef MIN
#undef MIN
#endif
#define MIN(a,b) (((a) < (b)) ? (a) : (b))

#ifdef MAX
#undef MAX
#endif
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
73
74// Beginning of GPU Architecture definitions
// Maps a compute capability (major.minor) to the number of CUDA cores per
// multiprocessor.
//   major, minor - SM version of the device
// Returns the core count, or -1 (after printing a diagnostic) when the SM
// version is not in the table.
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // One table row per known SM version.
    struct sSMtoCores {
        int SM;    // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores; // cores per multiprocessor
    };

    static const sSMtoCores nGpuArchCoresPerSM[] =
    { { 0x10,   8 },
      { 0x11,   8 },
      { 0x12,   8 },
      { 0x13,   8 },
      { 0x20,  32 },
      { 0x21,  48 },
      // Architectures newer than the original table; without these rows every
      // such device was reported as "undefined" and scored as -1 cores.
      { 0x30, 192 },
      { 0x32, 192 },
      { 0x35, 192 },
      { 0x37, 192 },
      { 0x50, 128 },
      { 0x52, 128 },
      { 0x53, 128 },
      { 0x60,  64 },
      { 0x61, 128 },
      { 0x62, 128 },
      { 0x70,  64 },
      { 0x72,  64 },
      { 0x75,  64 },
      { 0x80,  64 },
      { 0x86, 128 },
      { 0x87, 128 },
      { 0x89, 128 },
      { 0x90, 128 },
      {   -1,  -1 }   // sentinel: end of table
    };

    const int smVersion = (major << 4) + minor;
    for (int index = 0; nGpuArchCoresPerSM[index].SM != -1; ++index) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
    return -1;
}
103// end of GPU Architecture definitions
104
105// This function returns the best GPU (with maximum GFLOPS)
106inline int cutGetMaxGflopsDeviceId()
107{
108 int current_device = 0;
109 int max_compute_perf = 0;
110 int max_perf_device = 0;
111 int device_count = 0;
112 int best_SM_arch = 0;
113 int clock_rate = 0;
114
115 cudaGetDeviceCount( &device_count );
116 // Find the best major SM Architecture GPU device
117 while ( current_device < device_count ) {
118 cudaDeviceProp deviceProp;
119 cudaGetDeviceProperties( &deviceProp, current_device );
120 if (deviceProp.major > 0 && deviceProp.major < 9999) {
121 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
122 }
123 current_device++;
124 }
125
126 // Find the best CUDA capable GPU device
127 current_device = 0;
128 while( current_device < device_count ) {
129 cudaDeviceProp deviceProp;
130 cudaGetDeviceProperties( &deviceProp, current_device );
131 int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
132
133 cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, current_device);
134 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * clock_rate;
135 if( compute_perf > max_compute_perf ) {
136 // If we find GPU with SM major > 2, search only these
137 if ( best_SM_arch > 2 ) {
138 // If our device==dest_SM_arch, choose this, or else pass
139 if (deviceProp.major == best_SM_arch) {
140 max_compute_perf = compute_perf;
141 max_perf_device = current_device;
142 }
143 } else {
144 max_compute_perf = compute_perf;
145 max_perf_device = current_device;
146 }
147 }
148 ++current_device;
149 }
150 return max_perf_device;
151}
152
153// This function returns the best GPU (with maximum GFLOPS)
154inline int cutGetMaxGflopsGraphicsDeviceId()
155{
156 int current_device = 0;
157 int max_compute_perf = 0;
158 int max_perf_device = 0;
159 int device_count = 0;
160 int best_SM_arch = 0;
161 int bTCC = 0;
162 int clock_rate = 0;
163
164 cudaGetDeviceCount( &device_count );
165 // Find the best major SM Architecture GPU device that is graphics capable
166 while ( current_device < device_count ) {
167 cudaDeviceProp deviceProp;
168 cudaGetDeviceProperties( &deviceProp, current_device );
169
170 if (deviceProp.tccDriver) bTCC = 1;
171
172 if (!bTCC) {
173 if (deviceProp.major > 0 && deviceProp.major < 9999) {
174 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
175 }
176 }
177 current_device++;
178 }
179
180 // Find the best CUDA capable GPU device
181 current_device = 0;
182 while( current_device < device_count ) {
183 cudaDeviceProp deviceProp;
184 cudaGetDeviceProperties( &deviceProp, current_device );
185 int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
186
187 if (deviceProp.tccDriver) bTCC = 1;
188
189 if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
190 {
191 cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, current_device);
192 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * clock_rate;
193 if( compute_perf > max_compute_perf ) {
194 // If we find GPU with SM major > 2, search only these
195 if ( best_SM_arch > 2 ) {
196 // If our device==dest_SM_arch, choose this, or else pass
197 if (deviceProp.major == best_SM_arch) {
198 max_compute_perf = compute_perf;
199 max_perf_device = current_device;
200 }
201 }
202 else {
203 max_compute_perf = compute_perf;
204 max_perf_device = current_device;
205 }
206 }
207 }
208 ++current_device;
209 }
210 return max_perf_device;
211}
212
// Give a little more for Windows: the console window often disappears before
// we can read the message, so debug builds also send it to the debugger output.
//
// FPRINTF takes ONE parenthesised argument list: FPRINTF((stream, fmt, ...)).
#ifdef _WIN32
#  if 1 //ndef UNICODE
#    ifdef _DEBUG // Do this only in debug mode...
#      include <stdarg.h>
// Formats the message once, then writes it both to the debugger
// (OutputDebugStringA) and to `file`.
//   file - destination stream
//   fmt  - printf-style format string, followed by its arguments
inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
{
    const size_t maxBufSize = (size_t)1 << 24;   // stop growing at 16 MiB
    size_t bufSize = 2048;

    while (bufSize <= maxBufSize) {
        char *buf = (char*)malloc(bufSize);
        if (!buf) {
            break;   // out of memory: use the unbuffered fallback below
        }

        // A va_list may be consumed only once, so restart it for every attempt.
        va_list args;
        va_start(args, fmt);
        // Reserve one byte: _vsnprintf does not terminate the string when the
        // output exactly fills the count it was given.
        const int written = _vsnprintf(buf, bufSize - 1, fmt, args);
        va_end(args);

        if (written >= 0) {
            buf[written] = '\0';
            OutputDebugStringA(buf);
            // The text is already formatted; never reuse it as a format string.
            fputs(buf, file);
            free(buf);
            return;
        }

        // Negative result: there was not enough room, retry with a larger buffer.
        free(buf);
        bufSize *= 2;
    }

    // Fallback: no buffer could be obtained, print straight to the stream.
    va_list args;
    va_start(args, fmt);
    vfprintf(file, fmt, args);
    va_end(args);
}
#      define FPRINTF(a) VSPrintf a
#    else //debug
#      define FPRINTF(a) fprintf a
#    endif //debug
#  else //unicode
// Unicode case... let's give up for now and keep basic printf
#    define FPRINTF(a) fprintf a
#  endif //unicode
#else //win32
// For other than Win32
#  define FPRINTF(a) fprintf a
#endif //win32
245
246// NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
247// when the user double clicks on the error line in the Output pane. Like any compile error.
248
249inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
250{
251 if( cudaSuccess != err) {
252 FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error : %s.\n",
253 file, line, cudaGetErrorString( err) ));
254 exit(-1);
255 }
256}
257
258inline void __cudaSafeCall( cudaError err, const char *file, const int line )
259{
260 if( cudaSuccess != err) {
261 FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error : %s.\n",
262 file, line, cudaGetErrorString( err) ));
263 exit(-1);
264 }
265}
266
267inline void __cudaSafeThreadSync( const char *file, const int line )
268{
269 cudaError err = cutilDeviceSynchronize();
270 if ( cudaSuccess != err) {
271 FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error : %s.\n",
272 file, line, cudaGetErrorString( err) ));
273 exit(-1);
274 }
275}
276
277inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
278{
279 if( CUFFT_SUCCESS != err) {
280 FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error.\n",
281 file, line));
282 exit(-1);
283 }
284}
285
286inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
287{
288 if( CUTTrue != err) {
289 FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
290 file, line));
291 exit(-1);
292 }
293}
294
295inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
296{
297 cudaError_t err = cudaGetLastError();
298 if( cudaSuccess != err) {
299 FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
300 file, line, errorMessage, cudaGetErrorString( err) ));
301 exit(-1);
302 }
303}
304
305inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
306{
307 cudaError_t err = cudaGetLastError();
308 if( cudaSuccess != err) {
309 FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
310 file, line, errorMessage, cudaGetErrorString( err) ));
311 exit(-1);
312 }
313
314 err = cutilDeviceSynchronize();
315 if( cudaSuccess != err) {
316 FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : %s.\n",
317 file, line, errorMessage, cudaGetErrorString( err) ));
318 exit(-1);
319 }
320}
321
322inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
323{
324 if( !(pointer)) {
325 FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
326 file, line));
327 exit(-1);
328 }
329}
330
331#if __DEVICE_EMULATION__
// Device-emulation stub: there is no device to select, so the arguments are
// ignored and device 0 is reported. The function must return a value; falling
// off the end of a non-void function is undefined behaviour.
inline int cutilDeviceInit(int ARGC, char **ARGV)
{
    (void)ARGC;
    (void)ARGV;
    return 0;
}
// Device-emulation stub: there is no device to choose from, so the arguments
// are ignored and device 0 is reported. The function must return a value;
// falling off the end of a non-void function is undefined behaviour.
inline int cutilChooseCudaDevice(int ARGC, char **ARGV)
{
    (void)ARGC;
    (void)ARGV;
    return 0;
}
334#else
335 inline int cutilDeviceInit(int ARGC, char **ARGV)
336 {
337 int deviceCount;
338 cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
339 if (deviceCount == 0) {
340 FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
341 exit(-1);
342 }
343 int dev = 0;
344 cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
345 if (dev < 0)
346 dev = 0;
347 if (dev > deviceCount-1) {
348 fprintf(stderr, "\n");
349 fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
350 fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
351 fprintf(stderr, "\n");
352 return -dev;
353 }
354 cudaDeviceProp deviceProp;
355 cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
356 if (deviceProp.major < 1) {
357 FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
358 exit(-1); \
359 }
360 printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
361 cutilSafeCall(cudaSetDevice(dev));
362
363 return dev;
364 }
365
366 // General initialization call to pick the best CUDA Device
367 inline int cutilChooseCudaDevice(int argc, char **argv)
368 {
369 cudaDeviceProp deviceProp;
370 int devID = 0;
371 // If the command-line has a device number specified, use it
372 if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
373 devID = cutilDeviceInit(argc, argv);
374 if (devID < 0) {
375 printf("exiting...\n");
376 cutilExit(argc, argv);
377 exit(0);
378 }
379 } else {
380 // Otherwise pick the device with highest Gflops/s
381 devID = cutGetMaxGflopsDeviceId();
382 cutilSafeCallNoSync( cudaSetDevice( devID ) );
383 cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
384 printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
385 }
386 return devID;
387 }
388#endif
389
390
391//! Check for CUDA context lost
392inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
393{
394 cudaError_t err = cudaGetLastError();
395 if( cudaSuccess != err) {
396 FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
397 file, line, errorMessage, cudaGetErrorString( err) ));
398 exit(-1);
399 }
400 err = cutilDeviceSynchronize();
401 if( cudaSuccess != err) {
402 FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
403 file, line, errorMessage, cudaGetErrorString( err) ));
404 exit(-1);
405 }
406}
407
// Portable case-insensitive string comparison.
//   STRCASECMP(a, b)     - compare whole strings
//   STRNCASECMP(a, b, n) - compare at most n characters
// Both expand to the platform routine and yield 0 when the strings match.
// On non-Windows platforms the routines are declared by POSIX in <strings.h>,
// so that header is pulled in here instead of relying on <string.h> exposing
// them.
#ifndef _WIN32
#include <strings.h>
#endif

#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif

#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
423
424inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
425{
426 const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
427
428 bool bFlag = false;
429 for (int i=1; i < argc; i++) {
430 if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
431 bFlag |= true;
432 }
433 }
434
435 if (bFlag) {
436 printf("&&&& %s %s", sStatus[bStatus], argv[0]);
437 for (int i=1; i < argc; i++) printf(" %s", argv[i]);
438 }
439 else {
440 printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
441 }
442}
443
444// General check for CUDA GPU SM Capabilities
445inline bool cutilCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
446{
447 cudaDeviceProp deviceProp;
448 deviceProp.major = 0;
449 deviceProp.minor = 0;
450 int dev;
451
452#ifdef __DEVICE_EMULATION__
453 printf("> Compute Device Emulation Mode \n");
454#endif
455
456 cutilSafeCall( cudaGetDevice(&dev) );
457 cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
458
459 if((deviceProp.major > major_version) ||
460 (deviceProp.major == major_version && deviceProp.minor >= minor_version)) {
461 printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
462 return true;
463 }
464 else {
465 printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
466 __cutilQAFinish(argc, argv, true);
467 return false;
468 }
469}