NVIDIA 对这个函数的实现藏在 CUDA SDK 内部(没有公开源码),但调用这个函数的源代码位于 CUDA SDK 安装后的头文件中;
比如在cuda 12.1中调用源代码如下:
- #define __cudaRegisterBinary(X) \
- __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
- { void (*callback_fp)(void **) = (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
- atexit(__cudaUnregisterBinaryUtil)
-
- #define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
- __cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
- #define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
- __cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
-
- #define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
- __cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
- #define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
- __cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
- #define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
- __cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
对这个函数的调用会被编译器注入到 cuda app 中,并在 main 函数之前先被执行;
它的作用大致是调用进 cuda runtime,把 cuda app 中需要使用的 fatbin 及其中的 cubin 解析出来,并发送给 gpu,为后续的 kernel 调用做好准备;
fatbin 数据放置在 cuda app 可执行文件的 nv 官方自定义段中:`.nv_fatbin` 段存放 fatbin 数据本体,`.nvFatBinSegment` 段存放指向它的 wrapper 结构(即上面宏中的 `__fatDeviceText`);
官方文档:
__cudaRegisterBinary(X) 宏展开后会调用 __cudaRegisterFatBinary(void*);在 GPGPU-Sim 中,该函数被执行到的代码逻辑如下:
- void** CUDARTAPI __cudaRegisterFatBinary( void *fatCubin )
- {
- #if (CUDART_VERSION < 2010)
- printf("GPGPU-Sim PTX: ERROR ** this version of GPGPU-Sim requires CUDA 2.1 or higher\n");
- exit(1);
- #endif
- CUctx_st *context = GPGPUSim_Context();
- static unsigned next_fat_bin_handle = 1;
- if(context->get_device()->get_gpgpu()->get_config().use_cuobjdump()) {
- // The following workaround has only been verified on 64-bit systems.
- if (sizeof(void*) == 4)
- printf("GPGPU-Sim PTX: FatBin file name extraction has not been tested on 32-bit system.\n");
-
- // FatBin handle from the .fatbin.c file (one of the intermediate files generated by NVCC)
- typedef struct {int m; int v; const unsigned long long* d; char* f;} __fatDeviceText __attribute__ ((aligned (8)));
- __fatDeviceText * fatDeviceText = (__fatDeviceText *) fatCubin;
-
- // Extract the source code file name that generate the given FatBin.
- // - Obtains the pointer to the actual fatbin structure from the FatBin handle (fatCubin).
- // - An integer inside the fatbin structure contains the relative offset to the source code file name.
- // - This offset differs among different CUDA and GCC versions.
- char * pfatbin = (char*) fatDeviceText->d;
- int offset = *((int*)(pfatbin+48));
- char * filename = (pfatbin+16+offset);
-
- // The extracted file name is associated with a fat_cubin_handle passed
- // into cudaLaunch(). Inside cudaLaunch(), the associated file name is
- // used to find the PTX/SASS section from cuobjdump, which contains the
- // PTX/SASS code for the launched kernel function.
- // This allows us to work around the fact that cuobjdump only outputs the
- // file name associated with each section.
- unsigned long long fat_cubin_handle = next_fat_bin_handle;
- next_fat_bin_handle++;
- printf("GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = %llu, filename=%s\n", fat_cubin_handle, filename);
- /*!
- * This function extracts all data from all files in first call
- * then for next calls, only returns the appropriate number
- */
- assert(fat_cubin_handle >= 1);
- if (fat_cubin_handle==1) cuobjdumpInit();
- cuobjdumpRegisterFatBinary(fat_cubin_handle, filename);
-
- return (void**)fat_cubin_handle;
- }else{ ... }
-
- }
最开始这一段的调用关系如下:

为了方便在代码中索引,此处按“被调用者在前、调用者在后”的顺序整理,即下面的函数会调用上面先列出的函数:
- class gpgpu_functional_sim_config
- { ...
- int m_ptx_use_cuobjdump;
- ...
- }
-
- void gpgpu_functional_sim_config::reg_options(class OptionParser * opp)
- { ...
- option_parser_register(opp,
- "-gpgpu_ptx_use_cuobjdump", OPT_BOOL,
- &m_ptx_use_cuobjdump,
- "Use cuobjdump to extract ptx and sass from binaries",
- "1");//CUDART_VERSION >= 4000
- ...
- }
-
- gpgpu_sim *gpgpu_ptx_sim_init_perf()
- { ...
- g_the_gpu_config.reg_options(opp);
- ...
- }
-
- class _cuda_device_id *GPGPUSim_Init()
- { ...
- gpgpu_sim *the_gpu = gpgpu_ptx_sim_init_perf();
- the_gpu->set_prop(prop);
- the_device = new _cuda_device_id(the_gpu);
- start_sim_thread(1);
- ...
- }
-
- void** CUDARTAPI __cudaRegisterFatBinary( void *fatCubin )
- { ...
- static CUctx_st* GPGPUSim_Context()
- class _cuda_device_id *GPGPUSim_Init()
- CUctx_st( _cuda_device_id *gpu ) { m_gpu = gpu; }//the_context = new CUctx_st(the_gpu);
- cuobjdumpInit();
- cuobjdumpRegisterFatBinary(fat_cubin_handle, filename);
- ...
- }
GPGPUSim_Context()->get_device()->get_gpgpu()->get_config().use_cuobjdump() 表示什么含义?