玩转gpgpu-sim 04记—— __cudaRegisterBinary() of gpgpu-sim 到底做了什么

1. 参考 nv 的 __cudaRegisterFatBinary

nv 的这个函数的实现在闭源的 cuda runtime 库中，看不到源码；但是调用这个函数的源代码位于 cuda sdk 安装好的头文件（crt/host_runtime.h）中；

比如在cuda 12.1中调用源代码如下：


// Registration helper macros quoted from the CUDA toolkit headers.
//
// __cudaRegisterBinary(X):
//   1. Calls __cudaRegisterFatBinary() on the address of __fatDeviceText and
//      stores the returned handle in __cudaFatCubinHandle.
//   2. Casts X to a function pointer of type void(*)(void **) and invokes it
//      with that handle.
//   3. Calls __cudaRegisterFatBinaryEnd() with the same handle.
//   4. Registers __cudaUnregisterBinaryUtil with atexit().
//   The expansion ends without a semicolon; the use site supplies it.
#define __cudaRegisterBinary(X)                                                  \
        __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
        { void (*callback_fp)(void **) =  (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
        atexit(__cudaUnregisterBinaryUtil)
 
// Forwards to __cudaRegisterVar(). `var` is token-pasted onto the prefixes
// __host, __device and __name to form the three symbol arguments.
#define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
        __cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
// Same shape as __cudaRegisterVariable, but forwards to
// __cudaRegisterManagedVar() and passes the host symbol's address as void **.
#define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
        __cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
 
// Forwards to __cudaRegisterTexture(); `tex` is passed by address and also
// pasted onto the __device / __name prefixes.
#define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
        __cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
// Forwards to __cudaRegisterSurface(); same pattern as the texture macro,
// without the `norm` argument.
#define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
        __cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
// Forwards to __cudaRegisterFunction(). `#fun` stringifies the kernel name.
// The `thread_limit` parameter is not used in the expansion: -1 and null
// pointers are passed for the trailing arguments.
#define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
        __cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)

对这个函数的调用会被编译器注入到 cuda app 中，并在 main 函数之前先被调用；

起到的作用大概是调用进 cuda runtime，把 cuda app中需要使用的 fatbin 和其中的cubin 解析出来，并发送给 gpu 预备好调用；

fatbin 放置在 cuda app 可执行文件的 nv 官方自定义段中：控制结构（即 __fatDeviceText）位于 .nvFatBinSegment 段，它指向的 fatbin 数据本身位于 .nv_fatbin 段；

2. gpgpu-sim 的 __cudaRegisterFatBinary

官方文档：

GPGPU-Sim 3.x Manual

__cudaRegisterFatBinary(void*) 被执行到的代码逻辑如下：


// GPGPU-Sim's replacement for the CUDA runtime entry point
// __cudaRegisterFatBinary().
//
// Parameters:
//   fatCubin - pointer to the fat binary handle; in the cuobjdump path it is
//              reinterpreted as the local __fatDeviceText struct.
// Returns:
//   In the cuobjdump path, a small integer handle (1, 2, 3, ...) cast to
//   void**. It is an ID, not the address of any allocated storage.
//   The non-cuobjdump path is elided in this excerpt.
void** CUDARTAPI __cudaRegisterFatBinary( void *fatCubin )
{
	// Compile-time guard: builds against CUDA older than 2.1 print an error
	// and exit as soon as this function is called.
#if (CUDART_VERSION < 2010)
	printf("GPGPU-Sim PTX: ERROR ** this version of GPGPU-Sim requires CUDA 2.1 or higher\n");
	exit(1);
#endif
	CUctx_st *context = GPGPUSim_Context();
	// Function-local static: the counter persists across calls, so each
	// registered fat binary gets the next integer handle, starting at 1.
	static unsigned next_fat_bin_handle = 1;
	if(context->get_device()->get_gpgpu()->get_config().use_cuobjdump()) {
		// The following workaround has only been verified on 64-bit systems. 
		// On a 32-bit build only a warning is printed; execution continues.
		if (sizeof(void*) == 4) 
			printf("GPGPU-Sim PTX: FatBin file name extraction has not been tested on 32-bit system.\n"); 
 
		// FatBin handle from the .fatbin.c file (one of the intermediate files generated by NVCC)
		typedef struct {int m; int v; const unsigned long long* d; char* f;} __fatDeviceText __attribute__ ((aligned (8))); 
		__fatDeviceText * fatDeviceText = (__fatDeviceText *) fatCubin;
 
		// Extract the source code file name that generate the given FatBin. 
		// - Obtains the pointer to the actual fatbin structure from the FatBin handle (fatCubin).
		// - An integer inside the fatbin structure contains the relative offset to the source code file name.
		// - This offset differs among different CUDA and GCC versions. 
		// NOTE(review): 48 and 16 below are hard-coded byte offsets into the
		// fatbin data; per the note above they depend on the toolchain version,
		// so confirm them for the CUDA/GCC versions in use.
		char * pfatbin = (char*) fatDeviceText->d; 
		int offset = *((int*)(pfatbin+48)); 
		char * filename = (pfatbin+16+offset); 
 
		// The extracted file name is associated with a fat_cubin_handle passed
		// into cudaLaunch().  Inside cudaLaunch(), the associated file name is
		// used to find the PTX/SASS section from cuobjdump, which contains the
		// PTX/SASS code for the launched kernel function.  
		// This allows us to work around the fact that cuobjdump only outputs the
		// file name associated with each section. 
		unsigned long long fat_cubin_handle = next_fat_bin_handle;
		next_fat_bin_handle++;
		printf("GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = %llu, filename=%s\n", fat_cubin_handle, filename);
		/*!
		 * This function extracts all data from all files in first call
		 * then for next calls, only returns the appropriate number
		 */
		// The counter starts at 1, so handle 1 marks the first registration;
		// only that call runs cuobjdumpInit().
		assert(fat_cubin_handle >= 1);
		if (fat_cubin_handle==1) cuobjdumpInit();
		// Every call, including the first, registers its (handle, filename) pair.
		cuobjdumpRegisterFatBinary(fat_cubin_handle, filename);
 
		// The integer handle is returned through the pointer-typed return value.
		return (void**)fat_cubin_handle;
	}else{ ... } // non-cuobjdump path elided in this excerpt
 
}

2.1. 调用关系

刚开始一波的调用关系如下：

为方便索引代码，此处按调用关系自底向上整理：先列出被调用的函数，后面列出的函数会调用前面已列出的函数：


// Excerpt of the functional-simulation config class; only the member relevant
// to the cuobjdump path is shown, the rest is elided with "...".
class gpgpu_functional_sim_config 
{	...
	// Storage bound to the "-gpgpu_ptx_use_cuobjdump" option in reg_options().
	int m_ptx_use_cuobjdump;
	...
}
 
// Excerpt: registers this config's command-line options with the option
// parser `opp`. Only the cuobjdump option is shown.
void gpgpu_functional_sim_config::reg_options(class OptionParser * opp)
{	...
	// Binds "-gpgpu_ptx_use_cuobjdump" (type OPT_BOOL) to m_ptx_use_cuobjdump;
	// the last argument "1" is the default value string.
	option_parser_register(opp, 
						   "-gpgpu_ptx_use_cuobjdump", OPT_BOOL,
						   &m_ptx_use_cuobjdump,
						   "Use cuobjdump to extract ptx and sass from binaries",
						   "1");//CUDART_VERSION >= 4000
	...
}
 
// Excerpt: simulator initialisation that returns a gpgpu_sim pointer. The only
// step shown is registering the options of g_the_gpu_config with parser `opp`.
gpgpu_sim *gpgpu_ptx_sim_init_perf()
{	...
	g_the_gpu_config.reg_options(opp);
	...
}
 
// Excerpt: builds the simulated device object. Steps shown, in order:
// create the gpgpu_sim via gpgpu_ptx_sim_init_perf(), set its properties,
// wrap it in a _cuda_device_id, then call start_sim_thread(1).
class _cuda_device_id *GPGPUSim_Init()
{	...
	gpgpu_sim *the_gpu = gpgpu_ptx_sim_init_perf();
	the_gpu->set_prop(prop);
	the_device = new _cuda_device_id(the_gpu);
	start_sim_thread(1);
	...
}
 
// Call-tree sketch, not compilable code: the lines inside the braces list the
// functions reached from __cudaRegisterFatBinary(), and deeper indentation
// marks a callee of the line above it.
void** CUDARTAPI __cudaRegisterFatBinary( void *fatCubin )
{	...
	static CUctx_st* GPGPUSim_Context()
		class _cuda_device_id *GPGPUSim_Init()
		CUctx_st( _cuda_device_id *gpu ) { m_gpu = gpu; }//the_context = new CUctx_st(the_gpu);
	cuobjdumpInit();
	cuobjdumpRegisterFatBinary(fat_cubin_handle, filename);
	...
}

2.2. GPGPUSim_Context() 做了什么

2.3. use_cuobjdump() 表示什么含义

GPGPUSim_Context()->get_device()->get_gpgpu()->get_config().use_cuobjdump() 表示什么含义

2.4. cuobjdumpInit() 做了什么

2.5. cuobjdumpRegisterFatBinary() 做了什么

相关阅读:
音视频基础知识
 GoLand 2023.2.3(go语言开发)
【20221114】【每日一题】子集
 Java 自定义Excel数据排序
 CMakeLists.txt 详解
 网站被黑后处理方法及删除批量恶意代码的方法步骤
 PerfView专题 (第十一篇)：使用 Diff 功能洞察 C# 内存泄漏增量
 系列文章｜云原生时代下微服务架构进阶之路 - Spring Cloud
vue3+TS实现简易组件库
 代码随想录 | Day 48 - LeetCode 198. 打家劫舍、LeetCode 213. 打家劫舍II、LeetCode 337. 打家劫舍III
原文地址：https://blog.csdn.net/eloudy/article/details/133280872