cuMemcpyHtoDAsync failed: invalid argument

TensorRT 运行报错：

pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------

tensorrt 推理代码：


 
import sys
sys.path.append('../../tools/')
import cv2
import time
 
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
 
# Report the installed TensorRT version as soon as the module is loaded.
print('trt version',trt.__version__)
 
# Module-level TensorRT logger; handed to trt.Runtime when the engine is deserialized.
TRT_LOGGER = trt.Logger()
 
class HostDeviceMem(object):
    """Pair a host-side buffer with the device-side buffer that mirrors it."""

    def __init__(self, host_mem, device_mem):
        # Both handles are stored as given; nothing is copied or converted.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Four lines: a label followed by the string form of each buffer.
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

    def __repr__(self):
        # The debug representation is deliberately the same as the string form.
        return str(self)
 
# Creates the host/device buffer pair for every binding of an engine.
def allocate_buffers(engine, context):
    """Allocate one host buffer and one device buffer per engine binding.

    The element count of each buffer comes from the binding shape currently
    reported by ``context``; the element type comes from ``engine``.

    Returns:
        ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the
        device buffers converted with ``int()`` in binding order, and
        ``stream`` is a newly created ``cuda.Stream``.
    """
    stream = cuda.Stream()
    inputs, outputs, bindings = [], [], []
    for index, name in enumerate(engine):
        element_count = trt.volume(context.get_binding_shape(index))
        np_dtype = trt.nptype(engine.get_binding_dtype(name))
        # Host buffer first; the device buffer is sized from its byte count.
        host_buf = cuda.pagelocked_empty(element_count, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        bindings.append(int(device_buf))
        # Route the pair to the input or output list depending on the binding.
        destination = inputs if engine.binding_is_input(name) else outputs
        destination.append(HostDeviceMem(host_buf, device_buf))
    return inputs, outputs, bindings, stream
 
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    """Run one asynchronous inference pass and return the host output buffers.

    Args:
        context: execution context whose ``execute_async`` is called.
        bindings: device buffer addresses, passed through to ``execute_async``.
        inputs: ``HostDeviceMem`` objects whose ``host`` data is copied to ``device``.
        outputs: ``HostDeviceMem`` objects whose ``device`` data is copied to ``host``.
        stream: CUDA stream on which the copies and the execution are enqueued.
        batch_size: batch size forwarded to ``execute_async``.

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Plain loops: the copies are side effects, so no throwaway list is built.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Everything above was only enqueued; wait until the stream has finished.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
 
# Softmax re-implemented with NumPy.
def softmax(out_np, dim):
    """Return the softmax of ``out_np`` along axis ``dim``.

    The per-axis maximum is subtracted before exponentiating. This leaves the
    mathematical result unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the quotient from becoming NaN) for large scores.

    Args:
        out_np: array-like of scores.
        dim: axis along which the values are normalised.

    Returns:
        An array with the same shape as ``out_np``.
    """
    scores = np.asarray(out_np)
    if scores.size == 0:
        # np.max cannot reduce an empty array; an empty input simply yields
        # an empty floating-point result of the same shape.
        return np.exp(scores)
    shifted = scores - np.max(scores, axis=dim, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=dim, keepdims=True)
 
 
class FaceClassify(object):
    """Image classifier backed by a serialized TensorRT engine.

    ``configs`` must provide:
        face_classify_engine: path of the serialized engine file.
        classify_input_size: ``[height, width]`` the image is resized to.
        classify_mean / classify_std: per-channel normalisation values; the
            first three entries are used, in RGB order.
    """

    def __init__(self, configs):
        self.engine_path = configs.face_classify_engine
        self.input_size = configs.classify_input_size
        self.image_size = self.input_size
        self.MEAN = configs.classify_mean
        self.STD = configs.classify_std
        self.engine = self.get_engine()
        self.context = self.engine.create_execution_context()

    def get_engine(self):
        """Deserialize and return the engine stored at ``self.engine_path``.

        Raises:
            OSError: if the engine file cannot be opened or read.
            RuntimeError: if TensorRT returns no engine for the file content.
        """
        # Read inside a ``with`` block so the file handle is always closed.
        with open(self.engine_path, 'rb') as f:
            serialized_engine = f.read()
        runtime = trt.Runtime(TRT_LOGGER)
        engine = runtime.deserialize_cuda_engine(serialized_engine)
        if engine is None:
            # Fail here with a clear message instead of an AttributeError later.
            raise RuntimeError(
                'failed to deserialize TensorRT engine: {}'.format(self.engine_path))
        return engine

    def _preprocess(self, image_src):
        """Turn a BGR image into a contiguous float32 array of shape (1, 3, H, W)."""
        in_h, in_w = self.image_size

        img = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (in_w, in_h), interpolation=cv2.INTER_LINEAR)

        img = np.transpose(img, (2, 0, 1)).astype(np.float32)  # HWC -> CHW
        img /= 255.0  # scale to [0, 1]

        # Per-channel statistics shaped (3, 1, 1) broadcast over H and W, which
        # replaces building three full-size planes for mean and std by hand.
        mean = np.asarray(self.MEAN[:3], dtype=np.float64).reshape(3, 1, 1)
        std = np.asarray(self.STD[:3], dtype=np.float64).reshape(3, 1, 1)

        # The arithmetic above promotes to float64; cast back to float32 before
        # the data is handed to the engine.
        img = ((img - mean) / std).astype(np.float32)
        return np.ascontiguousarray(img[np.newaxis, ...])

    def detect(self, image_src, cuda_ctx = pycuda.autoinit.context):
        """Classify one image.

        Args:
            image_src: BGR image (it is converted with ``cv2.COLOR_BGR2RGB``).
            cuda_ctx: CUDA context pushed for the duration of the inference
                and popped afterwards; pass ``None`` to skip push/pop.

        Returns:
            ``(label, trt_outputs)``: the index of the highest-scoring class
            and the list of raw host output buffers.
        """
        start_all = time.time()
        img_in = self._preprocess(image_src)

        if cuda_ctx is not None:
            cuda_ctx.push()
        try:
            start = time.time()
            # Dynamic input: tell the context the shape of this particular
            # input before the buffers are sized from it.
            self.context.active_optimization_profile = 0
            self.context.set_binding_shape(0, tuple(img_in.shape))

            inputs, outputs, bindings, stream = allocate_buffers(self.engine, self.context)
            # Copy into the buffer that was allocated for the binding instead
            # of replacing it. The host buffer keeps the size and dtype the
            # device buffer was allocated for, so the host-to-device copy can
            # no longer be handed a mismatching (e.g. float64) array.
            np.copyto(inputs[0].host, img_in.ravel())
            trt_outputs = do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs,
                                       stream=stream, batch_size=1)
            print('infer time',time.time()-start,trt_outputs)
        finally:
            # Pop even when inference raises, so the context stack is balanced.
            if cuda_ctx is not None:
                cuda_ctx.pop()

        # trt_outputs is a list of output buffers; stacked it has one row per
        # output. Normalise over the class axis (the last one): normalising
        # over axis 0 would turn every probability into 1 and argmax into 0.
        labels_sm = softmax(np.asarray(trt_outputs), dim=-1)
        labels_max = np.argmax(labels_sm, axis=1)
        print('time_a',time.time()-start_all)
        return labels_max.item() ,trt_outputs
 
if __name__ == '__main__':
    class Params:
        """Bare attribute container used in place of a real config object."""

    opt = Params()
    opt.face_classify_engine = 'efficientnet_b1.trt'
    opt.classify_input_size = [128, 128]
    opt.classify_mean = [0.5, 0.5, 0.5]
    opt.classify_std = [0.5, 0.5, 0.5]

    image_path = r'987.jpg'

    face = FaceClassify(opt)
    image_src = cv2.imread(image_path)
    if image_src is None:
        # cv2.imread signals a missing or unreadable file by returning None;
        # stop here rather than failing later inside the colour conversion.
        raise FileNotFoundError('cannot read image: {}'.format(image_path))

    # Run the same image several times; only the last result is printed.
    for _ in range(10):
        labels_max, trt_outputs = face.detect(image_src)
    print(trt_outputs)
    print(labels_max)

原因：输入数据没有转换为 float32 类型。减去 mean、除以 std 之后数组变成了 float64，字节数是按 float32 申请的显存的两倍，所以 cuMemcpyHtoDAsync 报 invalid argument。

解决方法：

img_in = ((img_in - mean) / std).astype(np.float32)

网友的答案也可以参考：

我个人感觉的原因是输入的数据和模型数据入口所申请的地址不匹配:

输入图片数据shape不对, 可能不是(N, C, H, W)
输入图片数据的dtype不对。我是这种情况：由于我是 pytorch 转 ONNX 再转 tensorRT 的，ONNX 中的输入不支持 float64，只支持单精度的数据格式，而我自己在 tensorRT 里的输入没有做这个转换，输入了 float64 的图片，所以报错；把它改成 float32 就稳了。

原文链接：https://blog.csdn.net/GungnirsPledge/article/details/108428651

文章也有解决方法。

相关阅读:
Selenium自动化测试之学会元素定位
 【问题探讨】exists & in 使用效率探究
 告别EXCEL，易点易动库存管理系统帮助企业提升固定资产管理效率
 ChatGPT、New Bing、文心一言、通义千问等 AI 工具到底哪个更AI？ - 第二期
 数据库操作语言：DML（data management lauguage）
2022pycharm：虚拟环境的启用与删除
 【FAQ】视频直播点播平台EasyDSS如何单独保存录像计划文件？
MAC 机器上 Python 程序打包
 论文阅读笔记(十一)——BioInformatics Agent (BIA)
minio网站登录invalid Login怎么解决
原文地址：https://blog.csdn.net/jacke121/article/details/125904060