• 3D Sparse Convolution: Dissecting the spconv Source Code (Part 4)


    Building the RuleBook for regular 3D sparse convolution

    We continue with how the RuleBook is built for regular sparse convolution. Going back to src/spconv/spconv_ops.cc, look at the regular 3D sparse convolution branch of the getIndicePairs function:

        // torch.numel() counts elements: indicePairs.numel() / 2 + 1 = 2*27*N / 2 + 1 = 27*N + 1
        auto indicePairUnique = torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),torch::dtype(torch::kInt32).device(indices.device()));
        // [N*27,4]
        torch::Tensor outInds = torch::zeros({numAct * kernelVolume, coorDim + 1},torch::dtype(torch::kInt32).device(indices.device()));
        if (indices.device().type() == torch::kCPU) { // CPU
          numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,padding, dilation, outSpatialShape, transpose, false, useHash);
        }
    #ifdef TV_CUDA
        else if (indices.device().type() == torch::kCUDA) { // GPU
      numActOut = create_conv_indice_pair_p1_cuda(indices,          // torch.Size([N, 4]) voxel spatial indices
                                                  indicePairs,      // torch.Size([2,27,N]), filled with -1; stores the rulebook
                                                  indiceNum,        // torch.Size([27]) number of computations at each kernel position
                                                  indicePairUnique, // N*27+1
                                                  kernelSize,       // [3,3,3]
                                                  stride,           // [2,2,2]
                                                  padding,          // [1,1,1]
                                                  dilation,         // [1,1,1]
                                                  outSpatialShape,  // [21, 720, 720]
                                                  transpose         // False
                                                      );
          if (numActOut > 0) {
            auto res = torch::_unique(indicePairUnique);
            indicePairUnique = std::get<0>(res);
            numActOut = create_conv_indice_pair_p2_cuda(indices, 
                                                        outInds, 
                                                        gridOut, 
                                                        indicePairs, 
                                                        indiceNum, 
                                                        indicePairUnique,
                                                        outSpatialShape, 
                                                        transpose, 
                                                        false, 
                                                        useHash);
            if (numActOut == -1) {
              auto device = indices.device();
              outInds = outInds.to({torch::kCPU});
              indicePairs = indicePairs.to({torch::kCPU});
              indiceNum = indiceNum.to({torch::kCPU});
              indices = indices.to({torch::kCPU});
              numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,stride, padding, dilation, outSpatialShape, transpose, false,useHash);
    
              return {outInds.to(device).slice(0, 0, numActOut),indicePairs.to(device), indiceNum.to(device)};
            }
          }
        }
    
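    Before moving on, note the size bookkeeping in the snippet above. Below is a minimal host-side sketch (assuming libtorch; the toy sizes and variable names are mine) of why the unique buffer holds indicePairs.numel() / 2 + 1 = 27*N + 1 entries: one slot per possible (kernel offset, input) pair, plus one, all initialized to INT_MAX so that unused slots collapse into a single trailing sentinel after torch::_unique.

    #include <torch/torch.h>
    #include <limits>

    int main() {
      int64_t N = 1000, kernelVolume = 27;
      // the rulebook: 2 halves (input rows / output indices) x 27 offsets x N inputs
      auto indicePairs = torch::full({2, kernelVolume, N}, -1, torch::dtype(torch::kInt32));
      // numel() == 2*27*N, so numel() / 2 + 1 == 27*N + 1
      auto indicePairUnique = torch::full({indicePairs.numel() / 2 + 1},
                                          std::numeric_limits<int>::max(),
                                          torch::dtype(torch::kInt32));
      // after phase 1 writes flat output indices into indicePairUnique,
      // torch::_unique sorts and deduplicates them; INT_MAX lands at the end
      auto res = torch::_unique(indicePairUnique);
      auto uniqueOut = std::get<0>(res);
      // phase 2 then receives uniqueOut.size(0) - 1 real output sites
      return 0;
    }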

    For regular 3D sparse convolution the work is split across create_conv_indice_pair_p1_cuda and create_conv_indice_pair_p2_cuda. We start with create_conv_indice_pair_p1_cuda, located in src/spconv/indice.cu:

    int create_conv_indice_pair_p1_cuda(
        torch::Tensor indicesIn,                // torch.Size([N, 4])
        torch::Tensor indicePairs,              // torch.Size([2,27,N]), filled with -1; stores the rulebook
        torch::Tensor indiceNum,                // torch.Size([27]) number of computations at each kernel position
        torch::Tensor indicePairUnique,         // N*27+1
        std::vector<int64_t> kernelSize,        // [3,3,3]
        std::vector<int64_t> stride,            // [2,2,2]
        std::vector<int64_t> padding,           // [1,1,1]
        std::vector<int64_t> dilation,          // [1,1,1]
        std::vector<int64_t> outSpatialShape,   // [21, 720, 720]
        bool transpose                          // False
        ) {
      auto stream = at::cuda::getCurrentCUDAStream();
      auto ndim = kernelSize.size();         // 3
      auto numActIn = indicesIn.size(0);     // N
      auto kernelVolume = indiceNum.size(0); // 27
      if (numActIn == 0)
        return 0;
      tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
        using Index = TV_DECLTYPE(IndexValue);
        using IndexGrid = int32_t;
        tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
          constexpr int NDim = TV_DECLTYPE(I)::value;
          // copy the parameters into tv::SimpleVector variables
          tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
          tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
          tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
          tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
          tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
          tv::DispatchInt<max_kernel_vol_t>()(kernelVolume, std::less_equal<int>(), [&](auto I2) {
                constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
                if (transpose) { // False
                  prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
                      <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
                         0, stream>>>(tv::torch2tv<Index>(indicesIn),
                                      tv::torch2tv<Index>(indicePairs),
                                      tv::torch2tv<Index>(indiceNum),
                                      tv::torch2tv<Index>(indicePairUnique), ks, st,pa, di, ou);
                  TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
                } else {
                  prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
                      <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,0, stream>>>(
                                        tv::torch2tv<Index>(indicesIn),         // torch.Size([N, 4])
                                        tv::torch2tv<Index>(indicePairs),       // torch.Size([2,27,N]), filled with -1; stores the rulebook
                                        tv::torch2tv<Index>(indiceNum),         // torch.Size([27]) number of computations at each kernel position
                                        tv::torch2tv<Index>(indicePairUnique), 
                                        ks,                                     // kernel size
                                        st,                                     // stride
                                        pa,                                     // padding
                                        di,                                     // dilation
                                        ou                                      // output spatial shape
                                        );
                  TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
                }
    #ifdef TV_LOG_KERNEL_INFO
                cudaFuncAttributes attr;
                checkCudaErrors(cudaFuncGetAttributes(
                    &attr,
                    prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
                tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
                            MaxKernelVolume, ">", attr.numRegs);
    #endif
              });
        });
      });
      return 1;
    }
    
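    Before the kernel itself, note the dispatch machinery: tv::dispatch_torch, tv::dispatch_int and tv::DispatchInt turn runtime values (dtype, ndim, kernel volume) into compile-time template parameters by matching them against a fixed candidate list. A minimal sketch of the pattern (my own simplified version, not tv's actual implementation):

    #include <cstdio>
    #include <type_traits>

    // try each candidate in Is...; on a match, call f with the value wrapped
    // in an integral_constant so it becomes a compile-time constant inside f
    template <int... Is, typename F>
    void dispatch_int(int v, F &&f) {
      ((v == Is ? (f(std::integral_constant<int, Is>{}), 0) : 0), ...);
    }

    int main() {
      int ndim = 3; // runtime value
      dispatch_int<2, 3, 4>(ndim, [&](auto I) {
        constexpr int NDim = decltype(I)::value; // compile-time now
        std::printf("instantiated with NDim = %d\n", NDim);
      });
    }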

    Now look closely at the prepareIndicePairsKernel kernel:

    template <typename Index, unsigned NDim, int KernelMaxVolume = 256,typename Index1D = int>
    __global__ void prepareIndicePairsKernel(
        tv::TensorView<const Index> indicesIn,  // torch.Size([N, 4])
        tv::TensorView<Index> indicePairs,      // torch.Size([2,27,N]), filled with -1; stores the rulebook
        tv::TensorView<Index> indiceNum,        // torch.Size([27]) number of computations at each kernel position
        tv::TensorView<Index1D> indicePairUnique,
        const tv::SimpleVector<Index, NDim> kernelSize,     // kernel size
        const tv::SimpleVector<Index, NDim> stride,         // stride
        const tv::SimpleVector<Index, NDim> padding,        // padding
        const tv::SimpleVector<Index, NDim> dilation,       // dilation
        const tv::SimpleVector<Index, NDim> outSpatialShape // output spatial shape [21, 720, 720]
        ) {
      auto numActIn = indicesIn.dim(0); // N
      Index spatialVolume = 1; // output spatial volume; used below as the per-batch offset
    #pragma unroll
      for (int i = 0; i < NDim; ++i) {
        spatialVolume *= outSpatialShape[i]; // 21*720*720
      }
      Index kernelVolume = 1;
    #pragma unroll
      for (int i = 0; i < NDim; ++i) {
        kernelVolume *= kernelSize[i];      // 3*3*3
      }
      Index numValidPoints = 0;
      Index validPoints[KernelMaxVolume * (NDim + 1)]; // 27*4
      Index *pointPtr = nullptr;
      auto indicePairsDim2 = indicePairs.dim(2); // N
      Index index;
      for (int ix : tv::KernelLoopX<int>(numActIn)) {
        numValidPoints = getValidOutPos<Index, NDim>(
            indicesIn.data() + ix * (NDim + 1) + 1, 
            kernelSize.data(),      // kernel size 3,3,3
            stride.data(),          // stride      2,2,2
            padding.data(),         // padding     1,1,1     
            dilation.data(),        // dilation    1,1,1
            outSpatialShape.data(), // output spatial shape [21, 720, 720]
            validPoints             // buffer of valid output positions
            );
        // use the validPoints array computed by getValidOutPos to build the rulebook
        for (Index i = 0; i < numValidPoints; ++i) {
          pointPtr = validPoints + i * (NDim + 1);
          auto offset = pointPtr[NDim]; // which kernel weight produces this output
          Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); // running count for this rulebook row
          // input side: row index of the input
          indicePairs(0, offset, oldNum) = ix;
          // flat output index
          index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
            pointPtr, outSpatialShape.data(), 0) +
            spatialVolume * indicesIn(ix, 0);
          // output side: flat output index
          indicePairs(1, offset, oldNum) = index;
          indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
        }
      }
    }
    
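    The flat output index written into indicePairs(1, ...) and indicePairUnique is a row-major flatten of the output coordinates plus a per-batch offset of spatialVolume. A standalone sketch (my own helper, mirroring tv::ArrayIndexRowMajor plus the batch term):

    #include <cstdio>

    // row-major flatten of (z, y, x), then shift by batch * spatialVolume;
    // mirrors ArrayIndexRowMajor<3, 3>::runPtrs(pos, shape, 0) + spatialVolume * batch
    int flattenOut(int batch, const int pos[3], const int shape[3]) {
      int idx = 0;
      for (int i = 0; i < 3; ++i)
        idx = idx * shape[i] + pos[i];
      int spatialVolume = shape[0] * shape[1] * shape[2];
      return idx + spatialVolume * batch;
    }

    int main() {
      int shape[3] = {21, 720, 720};
      int pos[3] = {3, 10, 20};
      std::printf("%d\n", flattenOut(1, pos, shape)); // voxel (3,10,20) of batch 1
    }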

    getValidOutPos computes, for a single input point, all valid output positions together with the kernel-weight offset each one uses, and returns the number of valid outputs.

    The code below is annotated in detail:

    template <typename Index, unsigned NDim>
    TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,  // coordinates of one active input point
                                        const Index *kernelSize, // [3,3,3]
                                        const Index *stride,     // [2,2,2] 
                                        const Index *padding,    // [1,1,1] 
                                        const Index *dilation,   // [1,1,1]
                                        const Index *outSpatialShape, // [21, 720, 720]
                                        Index *out   // buffer of valid output positions
                                        ) {
      Index lowers[NDim];     // lower bounds of the output coordinates reachable from this input
      Index uppers[NDim];     // upper bounds of the output coordinates reachable from this input
      Index counter[NDim];
      Index counterSize[NDim];// number of candidate output positions per dimension
      Index pointCounter = 0; // number of valid output points
      Index val;              // output coordinate of the current dimension
      Index numPoints = 1;
      Index m, offset;
      bool valid = false;
    #pragma unroll
      // per dimension, compute the lower and upper bounds of the output positions
      for (int i = 0; i < NDim; ++i) {
        lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + stride[i] + padding[i]) / stride[i];
        uppers[i] = (input_pos[i] + padding[i]) / stride[i];
      }
    
    #pragma unroll
      // compute numPoints, the number of candidate output points for this input
      for (unsigned i = 0; i < NDim; ++i) {
        counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
        numPoints *= counterSize[i];
      }
    
    #pragma unroll
      // initialize the counters
      for (int i = 0; i < NDim; ++i) {
        counter[i] = 0;
      }
    
      // fill the out array; think of out as a [numPoints][NDim+1] 2-D array:
      // row i stores the coordinates of the i-th candidate output in out[i][0..NDim-1]
      for (int i = 0; i < numPoints; ++i) {
        valid = true;
        m = 1;
        offset = 0;
    #pragma unroll
        // iterate over the dimensions, storing each coordinate of the i-th candidate output
        for (int j = NDim - 1; j >= 0; --j) {
          val = uppers[j] - counter[j] * dilation[j];// output coordinate in dimension j
          // candidate output position for this input
          out[pointCounter * (NDim + 1) + j] = val;
          // out of bounds
          if (val < 0 || (val > outSpatialShape[j] - 1)) {
            valid = false;
            // break;
          }
          // accumulate the offset of the kernel weight linking this input to this output
          offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
          m *= kernelSize[j]; // m *= 3
        }
        // out[i][NDim] stores the kernel offset, i.e. which kernel weight is used
        out[pointCounter * (NDim + 1) + NDim] = offset;
        if (valid)
          ++pointCounter;
        // advance the last digit: counter[NDim-1] cycles through [0, counterSize[NDim-1])
        counter[NDim - 1] += 1; 
    #pragma unroll
        // the loop below propagates carries through the per-dimension counters:
        // when counter[2] reaches counterSize[2]: counter[1] += 1, counter[2] = 0
        // when counter[1] reaches counterSize[1]: counter[0] += 1, counter[1] = 0
        // when counter[0] reaches counterSize[0], all numPoints combinations have been visited
        for (int c = NDim - 1; c >= 0; --c) {
          if (counter[c] == counterSize[c] && c > 0) {
            counter[c - 1] += 1;
            counter[c] = 0;
          }
        }
      }
      return pointCounter;
    }
    

    How the output bounds are derived:

    Take 1-D convolution as an example. The output positions a given input contributes to depend on the kernel size k, stride s, dilation d and padding p.

    For an input position x, its distance to the (padded) feature-map border is x + p.
    Let n be the smallest output position that still covers x; then
    s*(n-1) + k' = x + p
    where k' is the effective kernel size, which depends on the kernel size and the dilation: k' = (k-1)*(d-1) + k.

    Substituting k', the equation becomes
    s*(n-1) + (k-1)*(d-1) + k = x + p
    Rearranging gives lowers:
    n = (x - d*(k-1) - 1 + s + p) / s
    Similarly, let n be the largest output position that covers x; then
    s*n = x + p
    so uppers is:
    n = (x + p) / s
    Reference: https://github.com/traveller59/spconv/issues/224
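
    To sanity-check these formulas, here is a toy standalone 1-D example (my own code, not from spconv) with k=3, s=2, p=1, d=1, an output length of 5, and active inputs at x=2 and x=5; it builds the miniature rulebook the same way prepareIndicePairsKernel does:

    #include <cstdio>
    #include <vector>

    int main() {
      const int k = 3, s = 2, p = 1, d = 1, outLen = 5;
      std::vector<int> inputs = {2, 5};      // active input positions
      const int N = (int)inputs.size();
      std::vector<int> indiceNum(k, 0);      // per-weight pair count
      std::vector<int> pairs(2 * k * N, -1); // [2][k][N] rulebook, -1-filled
      for (int ix = 0; ix < N; ++ix) {
        int x = inputs[ix];
        int lower = (x - d * (k - 1) - 1 + s + p) / s; // the lowers formula
        int upper = (x + p) / s;                       // the uppers formula
        for (int n = lower; n <= upper; ++n) {
          if (n < 0 || n >= outLen) continue;          // bounds check
          int offset = (x - n * s + p) / d; // which kernel weight links x to n
          int slot = indiceNum[offset]++;   // atomicAdd in the CUDA kernel
          pairs[(0 * k + offset) * N + slot] = ix; // input row
          pairs[(1 * k + offset) * N + slot] = n;  // output position
        }
      }
      for (int off = 0; off < k; ++off)
        for (int c = 0; c < indiceNum[off]; ++c)
          std::printf("weight %d: in %d -> out %d\n", off,
                      pairs[(0 * k + off) * N + c], pairs[(1 * k + off) * N + c]);
      // prints: weight 0: in 1 -> out 3
      //         weight 1: in 0 -> out 1
      //         weight 2: in 1 -> out 2
    }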

    The meaning of the counter variables can be read from the annotated code above; if anything here is misunderstood, corrections are welcome.
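
    To see exactly which combinations the counter loop enumerates, here is a standalone sketch (my own code) of the same carry logic with counterSize = {2, 1, 2}:

    #include <cstdio>

    int main() {
      const int NDim = 3;
      int counter[NDim] = {0, 0, 0};
      int counterSize[NDim] = {2, 1, 2}; // 2*1*2 = 4 combinations
      int numPoints = counterSize[0] * counterSize[1] * counterSize[2];
      for (int i = 0; i < numPoints; ++i) {
        std::printf("(%d,%d,%d)\n", counter[0], counter[1], counter[2]);
        counter[NDim - 1] += 1; // bump the last digit
        for (int c = NDim - 1; c >= 0; --c) {
          if (counter[c] == counterSize[c] && c > 0) { // carry into digit c-1
            counter[c - 1] += 1;
            counter[c] = 0;
          }
        }
      }
      // prints (0,0,0) (0,0,1) (1,0,0) (1,0,1)
    }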

    create_conv_indice_pair_p2_cuda is located in src/spconv/indice.cu:

    int create_conv_indice_pair_p2_cuda(
        torch::Tensor indicesIn,                // torch.Size([N, 4]) indices
        torch::Tensor indicesOut,               // torch.Size([N*27, 4])
        torch::Tensor gridsOut,                 // [4, 21*720*720]
        torch::Tensor indicePairs,              // torch.Size([2,27,N])
        torch::Tensor indiceNum,                // torch.Size([27]) number of computations at each kernel position
        torch::Tensor indicePairUnique,         // N*27+1
        std::vector<int64_t> outSpatialShape,   // [21, 720, 720]
        bool transpose,                         // False
        bool resetGrid,                         // False
        bool useHash                            // False
        ) {
      auto stream = at::cuda::getCurrentCUDAStream();
      auto ndim = outSpatialShape.size(); // 3
      auto numActIn = indicesIn.size(0);  // N
      int batchSize = gridsOut.size(0);   // 4
      int numAct = indicePairUnique.size(0) - 1;// number of distinct output indices, minus the INT_MAX sentinel
    
      auto kernelVolume = indiceNum.size(0);
      if (numActIn == 0)
        return 0;
      bool failed = false;
      tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
        using Index = TV_DECLTYPE(IndexValue);
        using IndexGrid = int32_t;
        tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
          constexpr int NDim = TV_DECLTYPE(I)::value;
          using IndexGrid = int32_t;
          tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
          if (useHash) { // False
              ...... // omitted
          } else {   // this branch is taken
            assignGridAndIndiceOutKernel
                <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                    tv::torch2tv<Index>(indicesOut),        // torch.Size([N*27, 4])
                    tv::torch2tv<IndexGrid>(gridsOut),      // [4, 21*720*720]
                    numAct,                                 // number of distinct output indices
                    tv::torch2tv<Index>(indicePairs),       // torch.Size([2,27,N])
                    tv::torch2tv<Index>(indicePairUnique),  // distinct output indices
                    ou,                                     // output spatial shape
                    batchSize                               // 4
                    );
            TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
            assignIndicePairsKernel
                <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                             tv::torch2tv<Index>(indicesOut),
                             tv::torch2tv<IndexGrid>(gridsOut), numActIn,
                             tv::torch2tv<Index>(indicePairs),
                             tv::torch2tv<Index>(indicePairUnique), ou);
            TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
    #ifdef TV_LOG_KERNEL_INFO
              ...... // logging omitted
    #endif
          }
    
          if (resetGrid && (!useHash)) { // False
            resetGridKernel
                <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                    indicePairUnique.data_ptr<Index>(), tv::torch2tv<IndexGrid>(gridsOut), numAct);
            TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
          }
        });
      });
      if (failed){
        return -1;
      }
      return numAct;
    }
    

    assignGridAndIndiceOutKernel is located in include/spconv/indice.cu.h:

    template <typename Index, typename IndexGrid, unsigned NDim>
    __global__ void assignGridAndIndiceOutKernel(
        tv::TensorView<Index> indicesOut,                     // torch.Size([N*27, 4]), to be filled
        tv::TensorView<IndexGrid> gridsOut,                   // [4, 21*720*720], to be filled
        int numAct,                                           // number of distinct output indices
        tv::TensorView<Index> indicePairs,                    // torch.Size([2,27,N])
        tv::TensorView<Index> indicePairUnique,               // distinct output indices
        const tv::SimpleVector<Index, NDim> outSpatialShape,  // output spatial shape
        int batchSize                                         // 4
        ) {
    
      Index index;
      auto indicesOutPtr = indicesOut.data();
      for (int ix : tv::KernelLoopX<int>(numAct)) {
        index = indicePairUnique[ix];
        gridsOut[index] = ix; // scatter: flat output index -> compact output row id
    
        // recover the spatial coordinates from the flat index; the leftover quotient is the batch index
        index = tv::rowArrayIdxInv<Index, NDim>(index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
        indicesOut[ix * (NDim + 1)] = index % batchSize;
      }
    }
    

    rowArrayIdxInv is located in include/tensorview/tensorview.h:

    template <typename Index, unsigned NDim>
    TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,const Index *shape) {
    #pragma unroll
      for (int i = NDim - 1; i >= 0; --i) {
        output[i] = index % shape[i];
        index -= output[i];
        index /= shape[i];
      }
      return index;
    }
    
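    A quick worked example (standalone, my own code): decomposing a flat index built over a toy spatial shape [4, 5] with a batch offset. The loop body is identical to rowArrayIdxInv, and the leftover quotient is the batch index, which assignGridAndIndiceOutKernel then stores via % batchSize:

    #include <cstdio>

    int main() {
      const int NDim = 2;
      int shape[NDim] = {4, 5};          // toy spatial shape
      int coords[NDim];
      int index = 1 * 4 * 5 + 2 * 5 + 3; // batch 1, position (2, 3)
      for (int i = NDim - 1; i >= 0; --i) {
        coords[i] = index % shape[i];
        index = (index - coords[i]) / shape[i];
      }
      std::printf("batch=%d pos=(%d,%d)\n", index, coords[0], coords[1]); // batch=1 pos=(2,3)
    }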

    Next, look at assignIndicePairsKernel:

    template <typename Index, typename IndexGrid, unsigned NDim>
    __global__ void
    assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
                            tv::TensorView<IndexGrid> gridsOut, int numActIn,
                            tv::TensorView<Index> indicePairs,
                            tv::TensorView<Index> indicePairUnique,
                            const tv::SimpleVector<Index, NDim> outSpatialShape) {
    
      Index index;
      int kernelVolume = indicePairs.dim(1);
      auto indicePairsOut = indicePairs.subview(1); // the output half of the rulebook (flat output indices)
    
      for (int ix : tv::KernelLoopX<int>(numActIn)) {
        for (int i = 0; i < kernelVolume; ++i) {
          index = indicePairsOut(i, ix);
          if (index > -1) {
            indicePairsOut(i, ix) = gridsOut[index]; // gather: flat index -> compact output row id
          }
        }
      }
    }
    
    
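    Putting the two kernels together, here is a minimal host-side sketch (my own simplification) of the scatter/gather that turns the rulebook's flat output indices into compact output row ids:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> uniqueOut = {7, 12, 40}; // sorted distinct flat output indices
      std::vector<int> gridsOut(64, -1);        // batchSize * spatialVolume cells
      for (int ix = 0; ix < (int)uniqueOut.size(); ++ix)
        gridsOut[uniqueOut[ix]] = ix;           // assignGridAndIndiceOutKernel: scatter
      std::vector<int> outHalf = {12, -1, 40, 7}; // some indicePairs(1, ...) entries
      for (int &v : outHalf)
        if (v > -1) v = gridsOut[v];            // assignIndicePairsKernel: gather
      for (int v : outHalf)
        std::printf("%d ", v);                  // prints: 1 -1 2 0
    }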

    subview is located in include/tensorview/tensorview.h; it returns a view of the sub-tensor obtained by fixing the leading indices (here, indicePairs.subview(1) is the output half of the rulebook):

      TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
      subview(SimpleVector<int> ids) const {
        Shape start = ids;
        for (int i = ids.size(); i < ndim(); ++i) {
          start.push_back(0);
        }
        return TensorView<T, Rank, PtrTraits, Tindex>(
            ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
      }
    
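    A plain-array sketch (my own illustration, not tv code) of what indicePairs.subview(1) yields for a [2, 27, N] rulebook: the data pointer advances past the input half, leaving a [27, N] view:

    #include <cstdio>

    int main() {
      const int K = 27, N = 4;
      int data[2 * K * N];
      for (int i = 0; i < 2 * K * N; ++i) data[i] = i;
      // ptr_ + rowArrayIdx(shape_, {1, 0, 0}) == data + 1 * K * N
      int *outHalf = data + 1 * K * N;
      // outHalf[i * N + ix] corresponds to indicePairs(1, i, ix)
      std::printf("%d\n", outHalf[0]); // 108, the first element of the output half
    }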
  • Original article: https://blog.csdn.net/weixin_42905141/article/details/127953063