• 3D Sparse Convolution: Dissecting the spconv Source Code (Part 4)


    Building the RuleBook for regular 3D sparse convolution

    We continue with how the RuleBook is built for regular sparse convolution. Going back to src/spconv/spconv_ops.cc, look at the regular 3D sparse convolution branch of the getIndicePairs function:

        // torch.numel() counts elements: indicePairs.numel() / 2 + 1 = 2*27*N / 2 + 1 = 27*N + 1
        auto indicePairUnique = torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),torch::dtype(torch::kInt32).device(indices.device()));
        // [N*27,4]
        torch::Tensor outInds = torch::zeros({numAct * kernelVolume, coorDim + 1},torch::dtype(torch::kInt32).device(indices.device()));
        if (indices.device().type() == torch::kCPU) { // CPU
          numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,padding, dilation, outSpatialShape, transpose, false, useHash);
        }
    #ifdef TV_CUDA
        else if (indices.device().type() == torch::kCUDA) { // GPU
      numActOut = create_conv_indice_pair_p1_cuda(indices,          // torch.Size([N, 4]) voxel spatial indices
                                                  indicePairs,      // torch.Size([2,27,N]), filled with -1; stores the rulebook
                                                  indiceNum,        // torch.Size([27]) number of computations at each kernel position
                                                  indicePairUnique, // N*27+1
                                                  kernelSize,       // [3,3,3]
                                                  stride,           // [2,2,2]
                                                  padding,          // [1,1,1]
                                                  dilation,         // [1,1,1]
                                                  outSpatialShape,  // [21, 720, 720]
                                                  transpose         // False
                                                      );
          if (numActOut > 0) {
            auto res = torch::_unique(indicePairUnique);
            indicePairUnique = std::get<0>(res);
            numActOut = create_conv_indice_pair_p2_cuda(indices, 
                                                        outInds, 
                                                        gridOut, 
                                                        indicePairs, 
                                                        indiceNum, 
                                                        indicePairUnique,
                                                        outSpatialShape, 
                                                        transpose, 
                                                        false, 
                                                        useHash);
            if (numActOut == -1) {
              auto device = indices.device();
              outInds = outInds.to({torch::kCPU});
              indicePairs = indicePairs.to({torch::kCPU});
              indiceNum = indiceNum.to({torch::kCPU});
              indices = indices.to({torch::kCPU});
              numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,stride, padding, dilation, outSpatialShape, transpose, false,useHash);
    
              return {outInds.to(device).slice(0, 0, numActOut),indicePairs.to(device), indiceNum.to(device)};
            }
          }
        }
    
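    Before moving on, note the size bookkeeping in the snippet above. Below is a minimal host-side sketch (assuming libtorch; the toy sizes and variable names are mine) of why the unique buffer holds indicePairs.numel() / 2 + 1 = 27*N + 1 entries: one slot per possible (kernel offset, input) pair, plus one, all initialized to INT_MAX so that unused slots collapse into a single trailing sentinel after torch::_unique.

    #include <torch/torch.h>
    #include <limits>

    int main() {
      int64_t N = 1000, kernelVolume = 27;
      // the rulebook: 2 halves (input rows / output indices) x 27 offsets x N inputs
      auto indicePairs = torch::full({2, kernelVolume, N}, -1, torch::dtype(torch::kInt32));
      // numel() == 2*27*N, so numel() / 2 + 1 == 27*N + 1
      auto indicePairUnique = torch::full({indicePairs.numel() / 2 + 1},
                                          std::numeric_limits<int>::max(),
                                          torch::dtype(torch::kInt32));
      // after phase 1 writes flat output indices into indicePairUnique,
      // torch::_unique sorts and deduplicates them; INT_MAX lands at the end
      auto res = torch::_unique(indicePairUnique);
      auto uniqueOut = std::get<0>(res);
      // phase 2 then receives uniqueOut.size(0) - 1 real output sites
      return 0;
    }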

    For regular 3D sparse convolution the work is split across create_conv_indice_pair_p1_cuda and create_conv_indice_pair_p2_cuda. We start with create_conv_indice_pair_p1_cuda, located in src/spconv/indice.cu:

    int create_conv_indice_pair_p1_cuda(
        torch::Tensor indicesIn,                // torch.Size([N, 4])
        torch::Tensor indicePairs,              // torch.Size([2,27,N]), filled with -1; stores the rulebook
        torch::Tensor indiceNum,                // torch.Size([27]) number of computations at each kernel position
        torch::Tensor indicePairUnique,         // N*27+1
        std::vector<int64_t> kernelSize,        // [3,3,3]
        std::vector<int64_t> stride,            // [2,2,2]
        std::vector<int64_t> padding,           // [1,1,1]
        std::vector<int64_t> dilation,          // [1,1,1]
        std::vector<int64_t> outSpatialShape,   // [21, 720, 720]
        bool transpose                          // False
        ) {
      auto stream = at::cuda::getCurrentCUDAStream();
      auto ndim = kernelSize.size();         // 3
      auto numActIn = indicesIn.size(0);     // N
      auto kernelVolume = indiceNum.size(0); // 27
      if (numActIn == 0)
        return 0;
      tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
        using Index = TV_DECLTYPE(IndexValue);
        using IndexGrid = int32_t;
        tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
          constexpr int NDim = TV_DECLTYPE(I)::value;
          // copy the parameters into tv::SimpleVector variables
          tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
          tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
          tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
          tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
          tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
          tv::DispatchInt<max_kernel_vol_t>()(kernelVolume, std::less_equal<int>(), [&](auto I2) {
                constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
                if (transpose) { // False
                  prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
                      <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
                         0, stream>>>(tv::torch2tv<Index>(indicesIn),
                                      tv::torch2tv<Index>(indicePairs),
                                      tv::torch2tv<Index>(indiceNum),
                                      tv::torch2tv<Index>(indicePairUnique), ks, st,pa, di, ou);
                  TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
                } else {
                  prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
                      <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,0, stream>>>(
                                        tv::torch2tv<Index>(indicesIn),         // torch.Size([N, 4])
                                        tv::torch2tv<Index>(indicePairs),       // torch.Size([2,27,N]), filled with -1; stores the rulebook
                                        tv::torch2tv<Index>(indiceNum),         // torch.Size([27]) number of computations at each kernel position
                                        tv::torch2tv<Index>(indicePairUnique), 
                                        ks,                                     // kernel size
                                        st,                                     // stride
                                        pa,                                     // padding
                                        di,                                     // dilation
                                        ou                                      // output spatial shape
                                        );
                  TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
                }
    #ifdef TV_LOG_KERNEL_INFO
                cudaFuncAttributes attr;
                checkCudaErrors(cudaFuncGetAttributes(
                    &attr,
                    prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
                tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
                            MaxKernelVolume, ">", attr.numRegs);
    #endif
              });
        });
      });
      return 1;
    }
    
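    Before the kernel itself, note the dispatch machinery: tv::dispatch_torch, tv::dispatch_int and tv::DispatchInt turn runtime values (dtype, ndim, kernel volume) into compile-time template parameters by matching them against a fixed candidate list. A minimal sketch of the pattern (my own simplified version, not tv's actual implementation):

    #include <cstdio>
    #include <type_traits>

    // try each candidate in Is...; on a match, call f with the value wrapped
    // in an integral_constant so it becomes a compile-time constant inside f
    template <int... Is, typename F>
    void dispatch_int(int v, F &&f) {
      ((v == Is ? (f(std::integral_constant<int, Is>{}), 0) : 0), ...);
    }

    int main() {
      int ndim = 3; // runtime value
      dispatch_int<2, 3, 4>(ndim, [&](auto I) {
        constexpr int NDim = decltype(I)::value; // compile-time now
        std::printf("instantiated with NDim = %d\n", NDim);
      });
    }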

    Now look closely at the prepareIndicePairsKernel kernel:

    template <typename Index, unsigned NDim, int KernelMaxVolume = 256,typename Index1D = int>
    __global__ void prepareIndicePairsKernel(
        tv::TensorView<const Index> indicesIn,  // torch.Size([N, 4])
        tv::TensorView<Index> indicePairs,      // torch.Size([2,27,N]), filled with -1; stores the rulebook
        tv::TensorView<Index> indiceNum,        // torch.Size([27]) number of computations at each kernel position
        tv::TensorView<Index1D> indicePairUnique,
        const tv::SimpleVector<Index, NDim> kernelSize,     // kernel size
        const tv::SimpleVector<Index, NDim> stride,         // stride
        const tv::SimpleVector<Index, NDim> padding,        // padding
        const tv::SimpleVector<Index, NDim> dilation,       // dilation
        const tv::SimpleVector<Index, NDim> outSpatialShape // output spatial shape [21, 720, 720]
        ) {
      auto numActIn = indicesIn.dim(0); // N
      Index spatialVolume = 1; // output spatial volume; used below as the per-batch offset
    #pragma unroll
      for (int i = 0; i < NDim; ++i) {
        spatialVolume *= outSpatialShape[i]; // 21*720*720
      }
      Index kernelVolume = 1;
    #pragma unroll
      for (int i = 0; i < NDim; ++i) {
        kernelVolume *= kernelSize[i];      // 3*3*3
      }
      Index numValidPoints = 0;
      Index validPoints[KernelMaxVolume * (NDim + 1)]; // 27*4
      Index *pointPtr = nullptr;
      auto indicePairsDim2 = indicePairs.dim(2); // N
      Index index;
      for (int ix : tv::KernelLoopX<int>(numActIn)) {
        numValidPoints = getValidOutPos<Index, NDim>(
            indicesIn.data() + ix * (NDim + 1) + 1, 
            kernelSize.data(),      // kernel size 3,3,3
            stride.data(),          // stride      2,2,2
            padding.data(),         // padding     1,1,1     
            dilation.data(),        // dilation    1,1,1
            outSpatialShape.data(), // output spatial shape [21, 720, 720]
            validPoints             // buffer of valid output positions
            );
        // use the validPoints array computed by getValidOutPos to build the rulebook
        for (Index i = 0; i < numValidPoints; ++i) {
          pointPtr = validPoints + i * (NDim + 1);
          auto offset = pointPtr[NDim]; // which kernel weight produces this output
          Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); // running count for this rulebook row
          // input side: row index of the input
          indicePairs(0, offset, oldNum) = ix;
          // flat output index
          index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
            pointPtr, outSpatialShape.data(), 0) +
            spatialVolume * indicesIn(ix, 0);
          // output side: flat output index
          indicePairs(1, offset, oldNum) = index;
          indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
        }
      }
    }
    
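    The flat output index written into indicePairs(1, ...) and indicePairUnique is a row-major flatten of the output coordinates plus a per-batch offset of spatialVolume. A standalone sketch (my own helper, mirroring tv::ArrayIndexRowMajor plus the batch term):

    #include <cstdio>

    // row-major flatten of (z, y, x), then shift by batch * spatialVolume;
    // mirrors ArrayIndexRowMajor<3, 3>::runPtrs(pos, shape, 0) + spatialVolume * batch
    int flattenOut(int batch, const int pos[3], const int shape[3]) {
      int idx = 0;
      for (int i = 0; i < 3; ++i)
        idx = idx * shape[i] + pos[i];
      int spatialVolume = shape[0] * shape[1] * shape[2];
      return idx + spatialVolume * batch;
    }

    int main() {
      int shape[3] = {21, 720, 720};
      int pos[3] = {3, 10, 20};
      std::printf("%d\n", flattenOut(1, pos, shape)); // voxel (3,10,20) of batch 1
    }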

    getValidOutPos computes, for a single input point, all valid output positions together with the kernel-weight offset each one uses, and returns the number of valid outputs.

    The code below is annotated in detail:

    template <typename Index, unsigned NDim>
    TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,  // coordinates of one active input point
                                        const Index *kernelSize, // [3,3,3]
                                        const Index *stride,     // [2,2,2] 
                                        const Index *padding,    // [1,1,1] 
                                        const Index *dilation,   // [1,1,1]
                                        const Index *outSpatialShape, // [21, 720, 720]
                                        Index *out   // buffer of valid output positions
                                        ) {
      Index lowers[NDim];     // lower bounds of the output coordinates reachable from this input
      Index uppers[NDim];     // upper bounds of the output coordinates reachable from this input
      Index counter[NDim];
      Index counterSize[NDim];// number of candidate output positions per dimension
      Index pointCounter = 0; // number of valid output points
      Index val;              // output coordinate of the current dimension
      Index numPoints = 1;
      Index m, offset;
      bool valid = false;
    #pragma unroll
      // per dimension, compute the lower and upper bounds of the output positions
      for (int i = 0; i < NDim; ++i) {
        lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + stride[i] + padding[i]) / stride[i];
        uppers[i] = (input_pos[i] + padding[i]) / stride[i];
      }
    
    #pragma unroll
      // compute numPoints, the number of candidate output points for this input
      for (unsigned i = 0; i < NDim; ++i) {
        counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
        numPoints *= counterSize[i];
      }
    
    #pragma unroll
      // initialize the counters
      for (int i = 0; i < NDim; ++i) {
        counter[i] = 0;
      }
    
      // fill the out array; think of out as a [numPoints][NDim+1] 2-D array:
      // row i stores the coordinates of the i-th candidate output in out[i][0..NDim-1]
      for (int i = 0; i < numPoints; ++i) {
        valid = true;
        m = 1;
        offset = 0;
    #pragma unroll
        // iterate over the dimensions, storing each coordinate of the i-th candidate output
        for (int j = NDim - 1; j >= 0; --j) {
          val = uppers[j] - counter[j] * dilation[j];// output coordinate in dimension j
          // candidate output position for this input
          out[pointCounter * (NDim + 1) + j] = val;
          // out of bounds
          if (val < 0 || (val > outSpatialShape[j] - 1)) {
            valid = false;
            // break;
          }
          // accumulate the offset of the kernel weight linking this input to this output
          offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
          m *= kernelSize[j]; // m *= 3
        }
        // out[i][NDim] stores the kernel offset, i.e. which kernel weight is used
        out[pointCounter * (NDim + 1) + NDim] = offset;
        if (valid)
          ++pointCounter;
        // advance the last digit: counter[NDim-1] cycles through [0, counterSize[NDim-1])
        counter[NDim - 1] += 1; 
    #pragma unroll
        // the loop below propagates carries through the per-dimension counters:
        // when counter[2] reaches counterSize[2]: counter[1] += 1, counter[2] = 0
        // when counter[1] reaches counterSize[1]: counter[0] += 1, counter[1] = 0
        // when counter[0] reaches counterSize[0], all numPoints combinations have been visited
        for (int c = NDim - 1; c >= 0; --c) {
          if (counter[c] == counterSize[c] && c > 0) {
            counter[c - 1] += 1;
            counter[c] = 0;
          }
        }
      }
      return pointCounter;
    }
    

    How the output bounds are derived:

    Take 1-D convolution as an example. The output positions a given input contributes to depend on the kernel size k, stride s, dilation d and padding p.

    For an input position x, its distance to the (padded) feature-map border is x + p.
    Let n be the smallest output position that still covers x; then
    s*(n-1) + k' = x + p
    where k' is the effective kernel size, which depends on the kernel size and the dilation: k' = (k-1)*(d-1) + k.

    Substituting k', the equation becomes
    s*(n-1) + (k-1)*(d-1) + k = x + p
    Rearranging gives lowers:
    n = (x - d*(k-1) - 1 + s + p) / s
    Similarly, let n be the largest output position that covers x; then
    s*n = x + p
    so uppers is:
    n = (x + p) / s
    Reference: https://github.com/traveller59/spconv/issues/224
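
    To sanity-check these formulas, here is a toy standalone 1-D example (my own code, not from spconv) with k=3, s=2, p=1, d=1, an output length of 5, and active inputs at x=2 and x=5; it builds the miniature rulebook the same way prepareIndicePairsKernel does:

    #include <cstdio>
    #include <vector>

    int main() {
      const int k = 3, s = 2, p = 1, d = 1, outLen = 5;
      std::vector<int> inputs = {2, 5};      // active input positions
      const int N = (int)inputs.size();
      std::vector<int> indiceNum(k, 0);      // per-weight pair count
      std::vector<int> pairs(2 * k * N, -1); // [2][k][N] rulebook, -1-filled
      for (int ix = 0; ix < N; ++ix) {
        int x = inputs[ix];
        int lower = (x - d * (k - 1) - 1 + s + p) / s; // the lowers formula
        int upper = (x + p) / s;                       // the uppers formula
        for (int n = lower; n <= upper; ++n) {
          if (n < 0 || n >= outLen) continue;          // bounds check
          int offset = (x - n * s + p) / d; // which kernel weight links x to n
          int slot = indiceNum[offset]++;   // atomicAdd in the CUDA kernel
          pairs[(0 * k + offset) * N + slot] = ix; // input row
          pairs[(1 * k + offset) * N + slot] = n;  // output position
        }
      }
      for (int off = 0; off < k; ++off)
        for (int c = 0; c < indiceNum[off]; ++c)
          std::printf("weight %d: in %d -> out %d\n", off,
                      pairs[(0 * k + off) * N + c], pairs[(1 * k + off) * N + c]);
      // prints: weight 0: in 1 -> out 3
      //         weight 1: in 0 -> out 1
      //         weight 2: in 1 -> out 2
    }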

    The meaning of the counter variables can be read from the annotated code above; if anything here is misunderstood, corrections are welcome.
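
    To see exactly which combinations the counter loop enumerates, here is a standalone sketch (my own code) of the same carry logic with counterSize = {2, 1, 2}:

    #include <cstdio>

    int main() {
      const int NDim = 3;
      int counter[NDim] = {0, 0, 0};
      int counterSize[NDim] = {2, 1, 2}; // 2*1*2 = 4 combinations
      int numPoints = counterSize[0] * counterSize[1] * counterSize[2];
      for (int i = 0; i < numPoints; ++i) {
        std::printf("(%d,%d,%d)\n", counter[0], counter[1], counter[2]);
        counter[NDim - 1] += 1; // bump the last digit
        for (int c = NDim - 1; c >= 0; --c) {
          if (counter[c] == counterSize[c] && c > 0) { // carry into digit c-1
            counter[c - 1] += 1;
            counter[c] = 0;
          }
        }
      }
      // prints (0,0,0) (0,0,1) (1,0,0) (1,0,1)
    }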

    create_conv_indice_pair_p2_cuda is located in src/spconv/indice.cu:

    int create_conv_indice_pair_p2_cuda(
        torch::Tensor indicesIn,                // torch.Size([N, 4]) indices
        torch::Tensor indicesOut,               // torch.Size([N*27, 4])
        torch::Tensor gridsOut,                 // [4, 21*720*720]
        torch::Tensor indicePairs,              // torch.Size([2,27,N])
        torch::Tensor indiceNum,                // torch.Size([27]) number of computations at each kernel position
        torch::Tensor indicePairUnique,         // N*27+1
        std::vector<int64_t> outSpatialShape,   // [21, 720, 720]
        bool transpose,                         // False
        bool resetGrid,                         // False
        bool useHash                            // False
        ) {
      auto stream = at::cuda::getCurrentCUDAStream();
      auto ndim = outSpatialShape.size(); // 3
      auto numActIn = indicesIn.size(0);  // N
      int batchSize = gridsOut.size(0);   // 4
      int numAct = indicePairUnique.size(0) - 1;// number of distinct output indices, minus the INT_MAX sentinel
    
      auto kernelVolume = indiceNum.size(0);
      if (numActIn == 0)
        return 0;
      bool failed = false;
      tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
        using Index = TV_DECLTYPE(IndexValue);
        using IndexGrid = int32_t;
        tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
          constexpr int NDim = TV_DECLTYPE(I)::value;
          using IndexGrid = int32_t;
          tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
          if (useHash) { // False
              ...... // omitted
          } else {   // this branch is taken
            assignGridAndIndiceOutKernel
                <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                    tv::torch2tv<Index>(indicesOut),        // torch.Size([N*27, 4])
                    tv::torch2tv<IndexGrid>(gridsOut),      // [4, 21*720*720]
                    numAct,                                 // number of distinct output indices
                    tv::torch2tv<Index>(indicePairs),       // torch.Size([2,27,N])
                    tv::torch2tv<Index>(indicePairUnique),  // distinct output indices
                    ou,                                     // output spatial shape
                    batchSize                               // 4
                    );
            TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
            assignIndicePairsKernel
                <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                             tv::torch2tv<Index>(indicesOut),
                             tv::torch2tv<IndexGrid>(gridsOut), numActIn,
                             tv::torch2tv<Index>(indicePairs),
                             tv::torch2tv<Index>(indicePairUnique), ou);
            TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
    #ifdef TV_LOG_KERNEL_INFO
              ...... // logging omitted
    #endif
          }
    
          if (resetGrid && (!useHash)) { // False
            resetGridKernel
                <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                    indicePairUnique.data_ptr<Index>(), tv::torch2tv<IndexGrid>(gridsOut), numAct);
            TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
          }
        });
      });
      if (failed){
        return -1;
      }
      return numAct;
    }
    

    assignGridAndIndiceOutKernel is located in include/spconv/indice.cu.h:

    template <typename Index, typename IndexGrid, unsigned NDim>
    __global__ void assignGridAndIndiceOutKernel(
        tv::TensorView<Index> indicesOut,                     // torch.Size([N*27, 4]), to be filled
        tv::TensorView<IndexGrid> gridsOut,                   // [4, 21*720*720], to be filled
        int numAct,                                           // number of distinct output indices
        tv::TensorView<Index> indicePairs,                    // torch.Size([2,27,N])
        tv::TensorView<Index> indicePairUnique,               // distinct output indices
        const tv::SimpleVector<Index, NDim> outSpatialShape,  // output spatial shape
        int batchSize                                         // 4
        ) {
    
      Index index;
      auto indicesOutPtr = indicesOut.data();
      for (int ix : tv::KernelLoopX<int>(numAct)) {
        index = indicePairUnique[ix];
        gridsOut[index] = ix; // scatter: flat output index -> compact output row id
    
        // recover the spatial coordinates from the flat index; the leftover quotient is the batch index
        index = tv::rowArrayIdxInv<Index, NDim>(index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
        indicesOut[ix * (NDim + 1)] = index % batchSize;
      }
    }
    

    rowArrayIdxInv is located in include/tensorview/tensorview.h:

    template <typename Index, unsigned NDim>
    TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,const Index *shape) {
    #pragma unroll
      for (int i = NDim - 1; i >= 0; --i) {
        output[i] = index % shape[i];
        index -= output[i];
        index /= shape[i];
      }
      return index;
    }
    
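    A quick worked example (standalone, my own code): decomposing a flat index built over a toy spatial shape [4, 5] with a batch offset. The loop body is identical to rowArrayIdxInv, and the leftover quotient is the batch index, which assignGridAndIndiceOutKernel then stores via % batchSize:

    #include <cstdio>

    int main() {
      const int NDim = 2;
      int shape[NDim] = {4, 5};          // toy spatial shape
      int coords[NDim];
      int index = 1 * 4 * 5 + 2 * 5 + 3; // batch 1, position (2, 3)
      for (int i = NDim - 1; i >= 0; --i) {
        coords[i] = index % shape[i];
        index = (index - coords[i]) / shape[i];
      }
      std::printf("batch=%d pos=(%d,%d)\n", index, coords[0], coords[1]); // batch=1 pos=(2,3)
    }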

    Next, look at assignIndicePairsKernel:

    template <typename Index, typename IndexGrid, unsigned NDim>
    __global__ void
    assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
                            tv::TensorView<IndexGrid> gridsOut, int numActIn,
                            tv::TensorView<Index> indicePairs,
                            tv::TensorView<Index> indicePairUnique,
                            const tv::SimpleVector<Index, NDim> outSpatialShape) {
    
      Index index;
      int kernelVolume = indicePairs.dim(1);
      auto indicePairsOut = indicePairs.subview(1); // the output half of the rulebook (flat output indices)
    
      for (int ix : tv::KernelLoopX<int>(numActIn)) {
        for (int i = 0; i < kernelVolume; ++i) {
          index = indicePairsOut(i, ix);
          if (index > -1) {
            indicePairsOut(i, ix) = gridsOut[index]; // gather: flat index -> compact output row id
          }
        }
      }
    }
    
    
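    Putting the two kernels together, here is a minimal host-side sketch (my own simplification) of the scatter/gather that turns the rulebook's flat output indices into compact output row ids:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> uniqueOut = {7, 12, 40}; // sorted distinct flat output indices
      std::vector<int> gridsOut(64, -1);        // batchSize * spatialVolume cells
      for (int ix = 0; ix < (int)uniqueOut.size(); ++ix)
        gridsOut[uniqueOut[ix]] = ix;           // assignGridAndIndiceOutKernel: scatter
      std::vector<int> outHalf = {12, -1, 40, 7}; // some indicePairs(1, ...) entries
      for (int &v : outHalf)
        if (v > -1) v = gridsOut[v];            // assignIndicePairsKernel: gather
      for (int v : outHalf)
        std::printf("%d ", v);                  // prints: 1 -1 2 0
    }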

    subview is located in include/tensorview/tensorview.h; it returns a view of the sub-tensor obtained by fixing the leading indices (here, indicePairs.subview(1) is the output half of the rulebook):

      TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
      subview(SimpleVector<int> ids) const {
        Shape start = ids;
        for (int i = ids.size(); i < ndim(); ++i) {
          start.push_back(0);
        }
        return TensorView<T, Rank, PtrTraits, Tindex>(
            ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
      }
    
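    A plain-array sketch (my own illustration, not tv code) of what indicePairs.subview(1) yields for a [2, 27, N] rulebook: the data pointer advances past the input half, leaving a [27, N] view:

    #include <cstdio>

    int main() {
      const int K = 27, N = 4;
      int data[2 * K * N];
      for (int i = 0; i < 2 * K * N; ++i) data[i] = i;
      // ptr_ + rowArrayIdx(shape_, {1, 0, 0}) == data + 1 * K * N
      int *outHalf = data + 1 * K * N;
      // outHalf[i * N + ix] corresponds to indicePairs(1, i, ix)
      std::printf("%d\n", outHalf[0]); // 108, the first element of the output half
    }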
  • Original article: https://blog.csdn.net/weixin_42905141/article/details/127953063