当前位置：网站首页>cuMemcpyHtoDAsync failed: invalid argument

cuMemcpyHtoDAsync failed: invalid argument

2022-07-21 19:43:00 【AI vision netqi】

tenorrt Operation error reporting ：

pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------

tensorrt Inference code ：


import sys
sys.path.append('../../tools/')
import cv2
import time

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

print('trt version',trt.__version__)

TRT_LOGGER = trt.Logger()

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, context):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for i, binding in enumerate(engine):
        size = trt.volume(context.get_binding_shape(i))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

#  use numpy rewrite softmax
def softmax(out_np, dim):
    s_value = np.exp(out_np) / np.sum(np.exp(out_np), axis=dim, keepdims=True)
    return s_value


class FaceClassify(object):
    def __init__(self, configs):
        self.engine_path = configs.face_classify_engine
        self.input_size = configs.classify_input_size
        self.image_size = self.input_size
        self.MEAN = configs.classify_mean
        self.STD = configs.classify_std
        self.engine = self.get_engine()
        self.context = self.engine.create_execution_context()


    def get_engine(self):
        # If a serialized engine exists, use it instead of building an engine.
        f = open(self.engine_path, 'rb')
        runtime = trt.Runtime(TRT_LOGGER)
        return runtime.deserialize_cuda_engine(f.read())


    def detect(self, image_src, cuda_ctx = pycuda.autoinit.context):
        cuda_ctx.push()
        start_all=time.time()
        IN_IMAGE_H, IN_IMAGE_W = self.image_size

        # Input
        img_in = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
        img_in = cv2.resize(img_in, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)

        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)  # (3, 240, 240)
        img_in /= 255.0  #  normalization [0, 1]

        # mean = (0.485, 0.456, 0.406)
        mean0 = np.expand_dims(self.MEAN[0] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean1 = np.expand_dims(self.MEAN[1] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean2 = np.expand_dims(self.MEAN[2] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        mean = np.concatenate((mean0, mean1, mean2), axis=0)

        # std = (0.229, 0.224, 0.225)
        std0 = np.expand_dims(self.STD[0] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std1 = np.expand_dims(self.STD[1] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std2 = np.expand_dims(self.STD[2] * np.ones((IN_IMAGE_H, IN_IMAGE_W)), axis=0)
        std = np.concatenate((std0, std1, std2), axis=0)

        img_in = ((img_in - mean) / std).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)  # (1, 3, 240, 240)

        img_in = np.ascontiguousarray(img_in)

        start=time.time()
        #  Dynamic input 
        self.context.active_optimization_profile = 0
        origin_inputshape = self.context.get_binding_shape(0)
        origin_inputshape[0], origin_inputshape[1], origin_inputshape[2], origin_inputshape[3] = img_in.shape
        self.context.set_binding_shape(0, (origin_inputshape))  #  If each input size Dissimilarity , According to the inputs Of size Change the corresponding context Medium size

        inputs, outputs, bindings, stream = allocate_buffers(self.engine, self.context)
        # Do inference
        inputs[0].host = img_in
        trt_outputs = do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs,
                                   stream=stream, batch_size=1)
        print('infer time',time.time()-start,trt_outputs)
        if cuda_ctx:
            cuda_ctx.pop()


        labels_sm = softmax(trt_outputs, dim=0)
        labels_max = np.argmax(labels_sm, axis=1)
        print('time_a',time.time()-start_all)
        return labels_max.item() ,trt_outputs

if __name__ == '__main__':
    class Params:
        pass

    opt = Params()
    opt.face_classify_engine = 'efficientnet_b1.trt'
    opt.classify_input_size = [128 ,128]
    opt.classify_mean = [0.5 ,0.5 ,0.5]
    opt.classify_std = [0.5 ,0.5 ,0.5]

    face =FaceClassify(opt)
    image_src =cv2.imread(r'987.jpg')
    # image_src =cv2.imread(r'F:\project\detect\yolov5\tensorrt\yolo-tensorrt_dll_trt8\sln\x64\Release\16_1.jpg')

    for i in range(10):
        labels_max ,trt_outputs =face.detect(image_src)
    print(trt_outputs)
    print(labels_max)

reason , The data is not formatted as float32 type ,

resolvent ：

img_in = ((img_in - mean) / std).astype(np.float32)

The answers of netizens can also be referred to ：

My personal feeling is that the input data does not match the address applied for by the model data entry :

Enter picture data shape incorrect , Maybe not (N, C, H, W)
Input image data dtype incorrect This is the case with me , Because I am pytorch turn ONNX Re turn tensorRT Of , stay ONNX The input in is not supported float64 by , Only single precision data formats are supported , And I am tensorRT The input in does not turn like this , Input float64 Pictures of the , So wrong reporting , Change it to float32 It's stable .

Link to the original text ：https://blog.csdn.net/GungnirsPledge/article/details/108428651

The article also has solutions .

原网站

版权声明
本文为[AI vision netqi]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/202/202207210426279451.html

当前位置：网站首页>cuMemcpyHtoDAsync failed: invalid argument

cuMemcpyHtoDAsync failed: invalid argument

边栏推荐

猜你喜欢

随机推荐