From f193f8d60156cb56c31bea0c5a950cd37ccc9dcb Mon Sep 17 00:00:00 2001 From: CptCaptain Date: Mon, 13 Dec 2021 17:34:28 +0100 Subject: [PATCH] Remove unnecessary code from torchext --- torchext/ext/co_types.h | 10 -- torchext/ext/common.h | 135 --------------- torchext/ext/common_cuda.h | 173 ------------------ torchext/ext/ext.h | 347 ------------------------------------- torchext/ext/ext_cpu.cpp | 198 --------------------- torchext/ext/ext_cuda.cpp | 135 --------------- torchext/ext/ext_kernel.cu | 112 ------------ torchext/functions.py | 122 ------------- 8 files changed, 1232 deletions(-) delete mode 100644 torchext/ext/co_types.h delete mode 100644 torchext/ext/common.h delete mode 100644 torchext/ext/common_cuda.h delete mode 100644 torchext/ext/ext.h delete mode 100644 torchext/ext/ext_cpu.cpp delete mode 100644 torchext/ext/ext_cuda.cpp delete mode 100644 torchext/ext/ext_kernel.cu diff --git a/torchext/ext/co_types.h b/torchext/ext/co_types.h deleted file mode 100644 index 6a9b9bb..0000000 --- a/torchext/ext/co_types.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef TYPES_H -#define TYPES_H - -#ifdef __CUDA_ARCH__ -#define CPU_GPU_FUNCTION __host__ __device__ -#else -#define CPU_GPU_FUNCTION -#endif - -#endif diff --git a/torchext/ext/common.h b/torchext/ext/common.h deleted file mode 100644 index d94501f..0000000 --- a/torchext/ext/common.h +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef COMMON_H -#define COMMON_H - -#include "co_types.h" -#include -#include - -#if defined(_OPENMP) -#include -#endif - - -#define DISABLE_COPY_AND_ASSIGN(classname) \ -private:\ - classname(const classname&) = delete;\ - classname& operator=(const classname&) = delete; - - -template -CPU_GPU_FUNCTION -void fill(T* arr, int N, T val) { - for(int idx = 0; idx < N; ++idx) { - arr[idx] = val; - } -} - -template -CPU_GPU_FUNCTION -void fill_zero(T* arr, int N) { - for(int idx = 0; idx < N; ++idx) { - arr[idx] = 0; - } -} - -template -CPU_GPU_FUNCTION -inline T distance_euclidean(const T* q, const T* t, int N) { - T out = 0; - for(int idx = 0; idx < N; idx++) { - T diff = q[idx] - t[idx]; - out += diff * diff; - } - return out; -} - -template -CPU_GPU_FUNCTION -inline T distance_l2(const T* q, const T* t, int N) { - T out = distance_euclidean(q, t, N); - out = std::sqrt(out); - return out; -} - - - - -template -struct FillFunctor { - T* arr; - const T val; - - FillFunctor(T* arr, const T val) : arr(arr), val(val) {} - CPU_GPU_FUNCTION void operator()(const int idx) { - arr[idx] = val; - } -}; - -template -CPU_GPU_FUNCTION -T mmin(const T& a, const T& b) { -#ifdef __CUDA_ARCH__ - return min(a, b); -#else - return std::min(a, b); -#endif -} - -template -CPU_GPU_FUNCTION -T mmax(const T& a, const T& b) { -#ifdef __CUDA_ARCH__ - return max(a, b); -#else - return std::max(a, b); -#endif -} - -template -CPU_GPU_FUNCTION -T mround(const T& a) { -#ifdef __CUDA_ARCH__ - return round(a); -#else - return round(a); -#endif -} - - -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ < 600 -__device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif -#endif - - -template -CPU_GPU_FUNCTION -void matomic_add(T* addr, T val) { -#ifdef __CUDA_ARCH__ - atomicAdd(addr, val); -#else -#if defined(_OPENMP) -#pragma omp atomic -#endif - *addr += val; -#endif -} - -#endif diff --git a/torchext/ext/common_cuda.h b/torchext/ext/common_cuda.h deleted file mode 100644 index 4fe0b8a..0000000 --- a/torchext/ext/common_cuda.h +++ /dev/null @@ -1,173 +0,0 @@ -#ifndef COMMON_CUDA -#define COMMON_CUDA - -#include -#include - -#define DEBUG 0 -#define CUDA_DEBUG_DEVICE_SYNC 0 - -// cuda check for cudaMalloc and so on -#define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \ - cudaError_t error = condition; \ - if(error != cudaSuccess) { \ - printf("%s in %s at %d\n", cudaGetErrorString(error), __FILE__, __LINE__); \ - exit(-1); \ - } \ - } while (0) - -/// Get error string for error code. -/// @param error -inline const char* cublasGetErrorString(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - return "Unknown cublas status"; -} - -#define CUBLAS_CHECK(condition) \ - do { \ - if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \ - cublasStatus_t status = condition; \ - if(status != CUBLAS_STATUS_SUCCESS) { \ - printf("%s in %s at %d\n", cublasGetErrorString(status), __FILE__, __LINE__); \ - exit(-1); \ - } \ - } while (0) - -// check if there is a error after kernel execution -#define CUDA_POST_KERNEL_CHECK \ - CUDA_CHECK(cudaPeekAtLastError()); \ - CUDA_CHECK(cudaGetLastError()); - -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) - -const int CUDA_NUM_THREADS = 1024; - -inline int GET_BLOCKS(const int N, const int N_THREADS=CUDA_NUM_THREADS) { - return (N + N_THREADS - 1) / N_THREADS; -} - -template -T* device_malloc(long N) { - T* dptr; - CUDA_CHECK(cudaMalloc(&dptr, N * sizeof(T))); - if(DEBUG) { printf("[DEBUG] device_malloc %p, %ld\n", dptr, N); } - return dptr; -} - -template -void device_free(T* dptr) { - if(DEBUG) { printf("[DEBUG] device_free %p\n", dptr); } - CUDA_CHECK(cudaFree(dptr)); -} - -template -void host_to_device(const T* hptr, T* dptr, long N) { - if(DEBUG) { printf("[DEBUG] host_to_device %p => %p, %ld\n", hptr, dptr, N); } - CUDA_CHECK(cudaMemcpy(dptr, hptr, N * sizeof(T), cudaMemcpyHostToDevice)); -} - -template -T* host_to_device_malloc(const T* hptr, long N) { - T* dptr = device_malloc(N); - host_to_device(hptr, dptr, N); - return dptr; -} - -template -void device_to_host(const T* dptr, T* hptr, long N) { - if(DEBUG) { printf("[DEBUG] device_to_host %p => %p, %ld\n", dptr, hptr, N); } - CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToHost)); -} - -template -T* device_to_host_malloc(const T* dptr, long N) { - T* hptr = new T[N]; - device_to_host(dptr, hptr, N); - return hptr; -} - -template -void device_to_device(const T* dptr, T* hptr, long N) { - if(DEBUG) { printf("[DEBUG] device_to_device %p => %p, %ld\n", dptr, hptr, N); } - CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToDevice)); -} - -// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu -// https://github.com/treecode/Bonsai/blob/master/runtime/profiling/derived_atomic_functions.h -__device__ __forceinline__ void atomicMaxF(float * const address, const float value) { - if (*address >= value) { - return; - } - - int * const address_as_i = (int *)address; - int old = * address_as_i, assumed; - - do { - assumed = old; - if (__int_as_float(assumed) >= value) { - break; - } - - old = atomicCAS(address_as_i, assumed, __float_as_int(value)); - } while (assumed != old); -} - -__device__ __forceinline__ void atomicMinF(float * const address, const float value) { - if (*address <= value) { - return; - } - - int * const address_as_i = (int *)address; - int old = * address_as_i, assumed; - - do { - assumed = old; - if (__int_as_float(assumed) <= value) { - break; - } - - old = atomicCAS(address_as_i, assumed, __float_as_int(value)); - } while (assumed != old); -} - - -template -__global__ void iterate_kernel(FunctorT functor, int N) { - CUDA_KERNEL_LOOP(idx, N) { - functor(idx); - } -} - -template -void iterate_cuda(FunctorT functor, int N, int N_THREADS=CUDA_NUM_THREADS) { - iterate_kernel<<>>(functor, N); - CUDA_POST_KERNEL_CHECK; -} - - -#endif diff --git a/torchext/ext/ext.h b/torchext/ext/ext.h deleted file mode 100644 index 21285ec..0000000 --- a/torchext/ext/ext.h +++ /dev/null @@ -1,347 +0,0 @@ -#pragma once - -#include "common.h" - - -#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") - -#define CHECK_INPUT_CPU(x) CHECK_CONTIGUOUS(x) -#define CHECK_INPUT_CUDA(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - - -template -struct NNFunctor { - const T* in0; // nelem0 x dim - const T* in1; // nelem1 x dim - const long nelem0; - const long nelem1; - long* out; // nelem0 - - NNFunctor(const T* in0, const T* in1, long nelem0, long nelem1, long* out) : in0(in0), in1(in1), nelem0(nelem0), nelem1(nelem1), out(out) {} - - CPU_GPU_FUNCTION void operator()(long idx0) { - // idx0 \in [nelem0] - - const T* vec0 = in0 + idx0 * dim; - - T min_dist = 1e9; - long min_arg = -1; - for(long idx1 = 0; idx1 < nelem1; ++idx1) { - const T* vec1 = in1 + idx1 * dim; - T dist = 0; - for(long didx = 0; didx < dim; ++didx) { - T diff = vec0[didx] - vec1[didx]; - dist += diff * diff; - } - - if(dist < min_dist) { - min_dist = dist; - min_arg = idx1; - } - } - - out[idx0] = min_arg; - } -}; - -struct CrossCheckFunctor { - const long* in0; // nelem0 - const long* in1; // nelem1 - const long nelem0; - const long nelem1; - uint8_t* out; // nelem0 - - CrossCheckFunctor(const long* in0, const long* in1, long nelem0, long nelem1, uint8_t* out) : in0(in0), in1(in1), nelem0(nelem0), nelem1(nelem1), out(out) {} - - CPU_GPU_FUNCTION void operator()(long idx0) { - // idx0 \in [nelem0] - int idx1 = in0[idx0]; - out[idx0] = idx1 >=0 && in1[idx1] >= 0 && idx0 == in1[idx1]; - // out[idx0] = idx0 == in1[in0[idx0]]; - } -}; - -template -struct ProjNNFunctor { - // xyz0, xyz1 in coord sys of 1 - const T* xyz0; // bs x height x width x 3 - const T* xyz1; // bs x height x width x 3 - const T* K; // 3 x 3 - const long batch_size; - const long height; - const long width; - const long patch_size; - long* out; // bs x height x width - - ProjNNFunctor(const T* xyz0, const T* xyz1, const T* K, long batch_size, long height, long width, long patch_size, long* out) - : xyz0(xyz0), xyz1(xyz1), K(K), batch_size(batch_size), height(height), width(width), patch_size(patch_size), out(out) {} - - CPU_GPU_FUNCTION void operator()(long idx0) { - // idx0 \in [0, bs x height x width] - - const long bs = idx0 / (height * width); - - const T x = xyz0[idx0 * 3 + 0]; - const T y = xyz0[idx0 * 3 + 1]; - const T z = xyz0[idx0 * 3 + 2]; - const T d = K[6] * x + K[7] * y + K[8] * z; - const T u = (K[0] * x + K[1] * y + K[2] * z) / d; - const T v = (K[3] * x + K[4] * y + K[5] * z) / d; - - int u0 = u + 0.5; - int v0 = v + 0.5; - - long min_idx1 = -1; - T min_dist = 1e9; - for(int pidx = 0; pidx < patch_size*patch_size; ++pidx) { - int pu = pidx % patch_size; - int pv = pidx / patch_size; - - int u1 = u0 + pu - patch_size/2; - int v1 = v0 + pv - patch_size/2; - - if(u1 >= 0 && v1 >= 0 && u1 < width && v1 < height) { - const long idx1 = (bs * height + v1) * width + u1; - const T* xyz1n = xyz1 + idx1 * 3; - const T d = (x-xyz1n[0]) * (x-xyz1n[0]) + (y-xyz1n[1]) * (y-xyz1n[1]) + (z-xyz1n[2]) * (z-xyz1n[2]); - if(d < min_dist) { - min_dist = d; - min_idx1 = idx1; - } - } - } - - out[idx0] = min_idx1; - } -}; - - -template -struct XCorrVolFunctor { - const T* in0; // channels x height x width - const T* in1; // channels x height x width - const long channels; - const long height; - const long width; - const long n_disps; - const long block_size; - T* out; // nelem0 - - XCorrVolFunctor(const T* in0, const T* in1, long channels, long height, long width, long n_disps, long block_size, T* out) : in0(in0), in1(in1), channels(channels), height(height), width(width), n_disps(n_disps), block_size(block_size), out(out) {} - - CPU_GPU_FUNCTION void operator()(long oidx) { - // idx0 \in [n_disps x height x width] - - auto d = oidx / (height * width); - auto h = (oidx / width) % height; - auto w = oidx % width; - - long block_size2 = block_size * block_size; - - T val = 0; - for(int c = 0; c < channels; ++c) { - // compute means - T mu0 = 0; - T mu1 = 0; - for(int bh = 0; bh < block_size; ++bh) { - long h0 = h + bh - block_size / 2; - h0 = mmax(long(0), mmin(height-1, h0)); - for(int bw = 0; bw < block_size; ++bw) { - long w0 = w + bw - block_size / 2; - long w1 = w0 - d; - w0 = mmax(long(0), mmin(width-1, w0)); - w1 = mmax(long(0), mmin(width-1, w1)); - long idx0 = (c * height + h0) * width + w0; - long idx1 = (c * height + h0) * width + w1; - mu0 += in0[idx0] / block_size2; - mu1 += in1[idx1] / block_size2; - } - } - - // compute stds and dot product - T sigma0 = 0; - T sigma1 = 0; - T dot = 0; - for(int bh = 0; bh < block_size; ++bh) { - long h0 = h + bh - block_size / 2; - h0 = mmax(long(0), mmin(height-1, h0)); - for(int bw = 0; bw < block_size; ++bw) { - long w0 = w + bw - block_size / 2; - long w1 = w0 - d; - w0 = mmax(long(0), mmin(width-1, w0)); - w1 = mmax(long(0), mmin(width-1, w1)); - long idx0 = (c * height + h0) * width + w0; - long idx1 = (c * height + h0) * width + w1; - T v0 = in0[idx0] - mu0; - T v1 = in1[idx1] - mu1; - - dot += v0 * v1; - sigma0 += v0 * v0; - sigma1 += v1 * v1; - } - } - - T norm = sqrt(sigma0 * sigma1) + 1e-8; - val += dot / norm; - } - - out[oidx] = val; - } -}; - - - - -const int PHOTOMETRIC_LOSS_MSE = 0; -const int PHOTOMETRIC_LOSS_SAD = 1; -const int PHOTOMETRIC_LOSS_CENSUS_MSE = 2; -const int PHOTOMETRIC_LOSS_CENSUS_SAD = 3; - -template -struct PhotometricLossForward { - const T* es; // batch_size x channels x height x width; - const T* ta; - const int block_size; - const int block_size2; - const T eps; - const int batch_size; - const int channels; - const int height; - const int width; - T* out; // batch_size x channels x height x width; - - PhotometricLossForward(const T* es, const T* ta, int block_size, T eps, int batch_size, int channels, int height, int width, T* out) : - es(es), ta(ta), block_size(block_size), block_size2(block_size*block_size), eps(eps), batch_size(batch_size), channels(channels), height(height), width(width), out(out) {} - - CPU_GPU_FUNCTION void operator()(int outidx) { - // outidx \in [0, batch_size x height x width] - - int w = outidx % width; - int h = (outidx / width) % height; - int n = outidx / (height * width); - - T loss = 0; - for(int bidx = 0; bidx < block_size2; ++bidx) { - int bh = bidx / block_size; - int bw = bidx % block_size; - int h0 = h + bh - block_size / 2; - int w0 = w + bw - block_size / 2; - - h0 = mmin(height-1, mmax(0, h0)); - w0 = mmin(width-1, mmax(0, w0)); - - for(int c = 0; c < channels; ++c) { - int inidx = ((n * channels + c) * height + h0) * width + w0; - if(type == PHOTOMETRIC_LOSS_SAD || type == PHOTOMETRIC_LOSS_MSE) { - T diff = es[inidx] - ta[inidx]; - if(type == PHOTOMETRIC_LOSS_MSE) { - loss += diff * diff / block_size2; - } - else if(type == PHOTOMETRIC_LOSS_SAD) { - loss += fabs(diff) / block_size2; - } - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD || type == PHOTOMETRIC_LOSS_CENSUS_MSE) { - int inidxc = ((n * channels + c) * height + h) * width + w; - T des = es[inidx] - es[inidxc]; - T dta = ta[inidx] - ta[inidxc]; - T h_des = 0.5 * (1 + des / sqrt(des * des + eps)); - T h_dta = 0.5 * (1 + dta / sqrt(dta * dta + eps)); - T diff = h_des - h_dta; - // printf("%d,%d %d,%d: des=%f, dta=%f, h_des=%f, h_dta=%f, diff=%f\n", h,w, h0,w0, des,dta, h_des,h_dta, diff); - // printf("%d,%d %d,%d: h_des=%f = 0.5 * (1 + %f / %f); %f, %f, %f\n", h,w, h0,w0, h_des, des, sqrt(des * des + eps), des*des, des*des+eps, eps); - if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) { - loss += diff * diff / block_size2; - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) { - loss += fabs(diff) / block_size2; - } - } - } - } - - out[outidx] = loss; - } -}; - -template -struct PhotometricLossBackward { - const T* es; // batch_size x channels x height x width; - const T* ta; - const T* grad_out; - const int block_size; - const int block_size2; - const T eps; - const int batch_size; - const int channels; - const int height; - const int width; - T* grad_in; // batch_size x channels x height x width; - - PhotometricLossBackward(const T* es, const T* ta, const T* grad_out, int block_size, T eps, int batch_size, int channels, int height, int width, T* grad_in) : - es(es), ta(ta), grad_out(grad_out), block_size(block_size), block_size2(block_size*block_size), eps(eps), batch_size(batch_size), channels(channels), height(height), width(width), grad_in(grad_in) {} - - CPU_GPU_FUNCTION void operator()(int outidx) { - // outidx \in [0, batch_size x height x width] - - int w = outidx % width; - int h = (outidx / width) % height; - int n = outidx / (height * width); - - for(int bidx = 0; bidx < block_size2; ++bidx) { - int bh = bidx / block_size; - int bw = bidx % block_size; - int h0 = h + bh - block_size / 2; - int w0 = w + bw - block_size / 2; - - h0 = mmin(height-1, mmax(0, h0)); - w0 = mmin(width-1, mmax(0, w0)); - - const T go = grad_out[outidx]; - - for(int c = 0; c < channels; ++c) { - int inidx = ((n * channels + c) * height + h0) * width + w0; - if(type == PHOTOMETRIC_LOSS_SAD || type == PHOTOMETRIC_LOSS_MSE) { - T diff = es[inidx] - ta[inidx]; - T grad = 0; - if(type == PHOTOMETRIC_LOSS_MSE) { - grad = 2 * diff; - } - else if(type == PHOTOMETRIC_LOSS_SAD) { - grad = diff < 0 ? -1 : (diff > 0 ? 1 : 0); - } - grad = grad / block_size2 * go; - matomic_add(grad_in + inidx, grad); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD || type == PHOTOMETRIC_LOSS_CENSUS_MSE) { - int inidxc = ((n * channels + c) * height + h) * width + w; - T des = es[inidx] - es[inidxc]; - T dta = ta[inidx] - ta[inidxc]; - T h_des = 0.5 * (1 + des / sqrt(des * des + eps)); - T h_dta = 0.5 * (1 + dta / sqrt(dta * dta + eps)); - T diff = h_des - h_dta; - - T grad_loss = 0; - if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) { - grad_loss = 2 * diff; - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) { - grad_loss = diff < 0 ? -1 : (diff > 0 ? 1 : 0); - } - grad_loss = grad_loss / block_size2; - - T tmp = des * des + eps; - T grad_heaviside = 0.5 * eps / sqrt(tmp * tmp * tmp); - - T grad = go * grad_loss * grad_heaviside; - matomic_add(grad_in + inidx, grad); - matomic_add(grad_in + inidxc, -grad); - } - } - } - } -}; - - - diff --git a/torchext/ext/ext_cpu.cpp b/torchext/ext/ext_cpu.cpp deleted file mode 100644 index 436f2a2..0000000 --- a/torchext/ext/ext_cpu.cpp +++ /dev/null @@ -1,198 +0,0 @@ -#include - -#include - -#include "ext.h" - -template -void iterate_cpu(FunctorT functor, int N) { - for(int idx = 0; idx < N; ++idx) { - functor(idx); - } -} - -at::Tensor nn_cpu(at::Tensor in0, at::Tensor in1) { - CHECK_INPUT_CPU(in0); - CHECK_INPUT_CPU(in1); - - auto nelem0 = in0.size(0); - auto nelem1 = in1.size(0); - auto dim = in0.size(1); - - AT_ASSERTM(dim == in1.size(1), "in0 and in1 have to be the same shape"); - AT_ASSERTM(dim == 3, "dim hast to be 3"); - AT_ASSERTM(in0.dim() == 2, "in0 has to be N0 x 3"); - AT_ASSERTM(in1.dim() == 2, "in1 has to be N1 x 3"); - - auto out = at::empty({nelem0}, torch::CPU(at::kLong)); - - AT_DISPATCH_FLOATING_TYPES(in0.scalar_type(), "nn", ([&] { - iterate_cpu( - NNFunctor(in0.data(), in1.data(), nelem0, nelem1, out.data()), - nelem0); - })); - - return out; -} - - -at::Tensor crosscheck_cpu(at::Tensor in0, at::Tensor in1) { - CHECK_INPUT_CPU(in0); - CHECK_INPUT_CPU(in1); - - AT_ASSERTM(in0.dim() == 1, ""); - AT_ASSERTM(in1.dim() == 1, ""); - - auto nelem0 = in0.size(0); - auto nelem1 = in1.size(0); - - auto out = at::empty({nelem0}, torch::CPU(at::kByte)); - - iterate_cpu( - CrossCheckFunctor(in0.data(), in1.data(), nelem0, nelem1, out.data()), - nelem0); - - return out; -} - - -at::Tensor proj_nn_cpu(at::Tensor xyz0, at::Tensor xyz1, at::Tensor K, int patch_size) { - CHECK_INPUT_CPU(xyz0); - CHECK_INPUT_CPU(xyz1); - CHECK_INPUT_CPU(K); - - auto batch_size = xyz0.size(0); - auto height = xyz0.size(1); - auto width = xyz0.size(2); - - AT_ASSERTM(xyz0.size(0) == xyz1.size(0), ""); - AT_ASSERTM(xyz0.size(1) == xyz1.size(1), ""); - AT_ASSERTM(xyz0.size(2) == xyz1.size(2), ""); - AT_ASSERTM(xyz0.size(3) == xyz1.size(3), ""); - AT_ASSERTM(xyz0.size(3) == 3, ""); - AT_ASSERTM(xyz0.dim() == 4, ""); - AT_ASSERTM(xyz1.dim() == 4, ""); - - auto out = at::empty({batch_size, height, width}, torch::CPU(at::kLong)); - - AT_DISPATCH_FLOATING_TYPES(xyz0.scalar_type(), "proj_nn", ([&] { - iterate_cpu( - ProjNNFunctor(xyz0.data(), xyz1.data(), K.data(), batch_size, height, width, patch_size, out.data()), - batch_size * height * width); - })); - - return out; -} - - -at::Tensor xcorrvol_cpu(at::Tensor in0, at::Tensor in1, int n_disps, int block_size) { - CHECK_INPUT_CPU(in0); - CHECK_INPUT_CPU(in1); - - auto channels = in0.size(0); - auto height = in0.size(1); - auto width = in0.size(2); - - auto out = at::empty({n_disps, height, width}, in0.options()); - - AT_DISPATCH_FLOATING_TYPES(in0.scalar_type(), "xcorrvol", ([&] { - iterate_cpu( - XCorrVolFunctor(in0.data(), in1.data(), channels, height, width, n_disps, block_size, out.data()), - n_disps * height * width); - })); - - return out; -} - - - - -at::Tensor photometric_loss_forward(at::Tensor es, at::Tensor ta, int block_size, int type, float eps) { - CHECK_INPUT_CPU(es); - CHECK_INPUT_CPU(ta); - - auto batch_size = es.size(0); - auto channels = es.size(1); - auto height = es.size(2); - auto width = es.size(3); - - auto out = at::empty({batch_size, 1, height, width}, es.options()); - - AT_DISPATCH_FLOATING_TYPES(es.scalar_type(), "photometric_loss_forward_cpu", ([&] { - if(type == PHOTOMETRIC_LOSS_MSE) { - iterate_cpu( - PhotometricLossForward(es.data(), ta.data(), block_size, eps, batch_size, channels, height, width, out.data()), - out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_SAD) { - iterate_cpu( - PhotometricLossForward(es.data(), ta.data(), block_size, eps, batch_size, channels, height, width, out.data()), - out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) { - iterate_cpu( - PhotometricLossForward(es.data(), ta.data(), block_size, eps, batch_size, channels, height, width, out.data()), - out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) { - iterate_cpu( - PhotometricLossForward(es.data(), ta.data(), block_size, eps, batch_size, channels, height, width, out.data()), - out.numel()); - } - })); - - return out; -} - -at::Tensor photometric_loss_backward(at::Tensor es, at::Tensor ta, at::Tensor grad_out, int block_size, int type, float eps) { - CHECK_INPUT_CPU(es); - CHECK_INPUT_CPU(ta); - CHECK_INPUT_CPU(grad_out); - - auto batch_size = es.size(0); - auto channels = es.size(1); - auto height = es.size(2); - auto width = es.size(3); - - CHECK_INPUT_CPU(ta); - auto grad_in = at::zeros({batch_size, channels, height, width}, grad_out.options()); - - AT_DISPATCH_FLOATING_TYPES(es.scalar_type(), "photometric_loss_backward_cpu", ([&] { - if(type == PHOTOMETRIC_LOSS_MSE) { - iterate_cpu( - PhotometricLossBackward(es.data(), ta.data(), grad_out.data(), block_size, eps, batch_size, channels, height, width, grad_in.data()), - grad_out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_SAD) { - iterate_cpu( - PhotometricLossBackward(es.data(), ta.data(), grad_out.data(), block_size, eps, batch_size, channels, height, width, grad_in.data()), - grad_out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) { - iterate_cpu( - PhotometricLossBackward(es.data(), ta.data(), grad_out.data(), block_size, eps, batch_size, channels, height, width, grad_in.data()), - grad_out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) { - iterate_cpu( - PhotometricLossBackward(es.data(), ta.data(), grad_out.data(), block_size, eps, batch_size, channels, height, width, grad_in.data()), - grad_out.numel()); - } - })); - - return grad_in; -} - - - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("nn_cpu", &nn_cpu, "nn_cpu"); - m.def("crosscheck_cpu", &crosscheck_cpu, "crosscheck_cpu"); - m.def("proj_nn_cpu", &proj_nn_cpu, "proj_nn_cpu"); - - m.def("xcorrvol_cpu", &xcorrvol_cpu, "xcorrvol_cpu"); - - m.def("photometric_loss_forward", &photometric_loss_forward); - m.def("photometric_loss_backward", &photometric_loss_backward); -} diff --git a/torchext/ext/ext_cuda.cpp b/torchext/ext/ext_cuda.cpp deleted file mode 100644 index d6c93ca..0000000 --- a/torchext/ext/ext_cuda.cpp +++ /dev/null @@ -1,135 +0,0 @@ -#include - -#include - -#include "ext.h" - -void nn_kernel(at::Tensor in0, at::Tensor in1, at::Tensor out); - -at::Tensor nn_cuda(at::Tensor in0, at::Tensor in1) { - CHECK_INPUT_CUDA(in0); - CHECK_INPUT_CUDA(in1); - - auto nelem0 = in0.size(0); - auto dim = in0.size(1); - - AT_ASSERTM(dim == in1.size(1), "in0 and in1 have to be the same shape"); - AT_ASSERTM(dim == 3, "dim hast to be 3"); - AT_ASSERTM(in0.dim() == 2, "in0 has to be N0 x 3"); - AT_ASSERTM(in1.dim() == 2, "in1 has to be N1 x 3"); - - auto out = at::empty({nelem0}, torch::CUDA(at::kLong)); - - nn_kernel(in0, in1, out); - - return out; -} - - -void crosscheck_kernel(at::Tensor in0, at::Tensor in1, at::Tensor out); - -at::Tensor crosscheck_cuda(at::Tensor in0, at::Tensor in1) { - CHECK_INPUT_CUDA(in0); - CHECK_INPUT_CUDA(in1); - - AT_ASSERTM(in0.dim() == 1, ""); - AT_ASSERTM(in1.dim() == 1, ""); - - auto nelem0 = in0.size(0); - auto out = at::empty({nelem0}, torch::CUDA(at::kByte)); - crosscheck_kernel(in0, in1, out); - - return out; -} - -void proj_nn_kernel(at::Tensor xyz0, at::Tensor xyz1, at::Tensor K, int patch_size, at::Tensor out); - -at::Tensor proj_nn_cuda(at::Tensor xyz0, at::Tensor xyz1, at::Tensor K, int patch_size) { - CHECK_INPUT_CUDA(xyz0); - CHECK_INPUT_CUDA(xyz1); - CHECK_INPUT_CUDA(K); - - auto batch_size = xyz0.size(0); - auto height = xyz0.size(1); - auto width = xyz0.size(2); - - AT_ASSERTM(xyz0.size(0) == xyz1.size(0), ""); - AT_ASSERTM(xyz0.size(1) == xyz1.size(1), ""); - AT_ASSERTM(xyz0.size(2) == xyz1.size(2), ""); - AT_ASSERTM(xyz0.size(3) == xyz1.size(3), ""); - AT_ASSERTM(xyz0.size(3) == 3, ""); - AT_ASSERTM(xyz0.dim() == 4, ""); - AT_ASSERTM(xyz1.dim() == 4, ""); - - auto out = at::empty({batch_size, height, width}, torch::CUDA(at::kLong)); - - proj_nn_kernel(xyz0, xyz1, K, patch_size, out); - - return out; -} - -void xcorrvol_kernel(at::Tensor in0, at::Tensor in1, int n_disps, int block_size, at::Tensor out); - -at::Tensor xcorrvol_cuda(at::Tensor in0, at::Tensor in1, int n_disps, int block_size) { - CHECK_INPUT_CUDA(in0); - CHECK_INPUT_CUDA(in1); - - // auto channels = in0.size(0); - auto height = in0.size(1); - auto width = in0.size(2); - - auto out = at::empty({n_disps, height, width}, in0.options()); - - xcorrvol_kernel(in0, in1, n_disps, block_size, out); - - return out; -} - - - -void photometric_loss_forward_kernel(at::Tensor es, at::Tensor ta, int block_size, int type, float eps, at::Tensor out); - -at::Tensor photometric_loss_forward(at::Tensor es, at::Tensor ta, int block_size, int type, float eps) { - CHECK_INPUT_CUDA(es); - CHECK_INPUT_CUDA(ta); - - auto batch_size = es.size(0); - auto height = es.size(2); - auto width = es.size(3); - - auto out = at::empty({batch_size, 1, height, width}, es.options()); - photometric_loss_forward_kernel(es, ta, block_size, type, eps, out); - - return out; -} - - -void photometric_loss_backward_kernel(at::Tensor es, at::Tensor ta, at::Tensor grad_out, int block_size, int type, float eps, at::Tensor grad_in); - -at::Tensor photometric_loss_backward(at::Tensor es, at::Tensor ta, at::Tensor grad_out, int block_size, int type, float eps) { - CHECK_INPUT_CUDA(es); - CHECK_INPUT_CUDA(ta); - CHECK_INPUT_CUDA(grad_out); - - auto batch_size = es.size(0); - auto channels = es.size(1); - auto height = es.size(2); - auto width = es.size(3); - - auto grad_in = at::zeros({batch_size, channels, height, width}, grad_out.options()); - photometric_loss_backward_kernel(es, ta, grad_out, block_size, type, eps, grad_in); - - return grad_in; -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("nn_cuda", &nn_cuda, "nn_cuda"); - m.def("crosscheck_cuda", &crosscheck_cuda, "crosscheck_cuda"); - m.def("proj_nn_cuda", &proj_nn_cuda, "proj_nn_cuda"); - - m.def("xcorrvol_cuda", &xcorrvol_cuda, "xcorrvol_cuda"); - - m.def("photometric_loss_forward", &photometric_loss_forward); - m.def("photometric_loss_backward", &photometric_loss_backward); -} diff --git a/torchext/ext/ext_kernel.cu b/torchext/ext/ext_kernel.cu deleted file mode 100644 index 46df268..0000000 --- a/torchext/ext/ext_kernel.cu +++ /dev/null @@ -1,112 +0,0 @@ -#include - -#include "ext.h" -#include "common_cuda.h" - -void nn_kernel(at::Tensor in0, at::Tensor in1, at::Tensor out) { - auto nelem0 = in0.size(0); - auto nelem1 = in1.size(0); - auto dim = in0.size(1); - - AT_DISPATCH_FLOATING_TYPES(in0.scalar_type(), "nn", ([&] { - iterate_cuda( - NNFunctor(in0.data(), in1.data(), nelem0, nelem1, out.data()), - nelem0); - })); -} - - -void crosscheck_kernel(at::Tensor in0, at::Tensor in1, at::Tensor out) { - auto nelem0 = in0.size(0); - auto nelem1 = in1.size(0); - - iterate_cuda( - CrossCheckFunctor(in0.data(), in1.data(), nelem0, nelem1, out.data()), - nelem0); -} - -void proj_nn_kernel(at::Tensor xyz0, at::Tensor xyz1, at::Tensor K, int patch_size, at::Tensor out) { - auto batch_size = xyz0.size(0); - auto height = xyz0.size(1); - auto width = xyz0.size(2); - - AT_DISPATCH_FLOATING_TYPES(xyz0.scalar_type(), "proj_nn", ([&] { - iterate_cuda( - ProjNNFunctor(xyz0.data(), xyz1.data(), K.data(), batch_size, height, width, patch_size, out.data()), - batch_size * height * width); - })); -} - -void xcorrvol_kernel(at::Tensor in0, at::Tensor in1, int n_disps, int block_size, at::Tensor out) { - auto channels = in0.size(0); - auto height = in0.size(1); - auto width = in0.size(2); - - AT_DISPATCH_FLOATING_TYPES(in0.scalar_type(), "xcorrvol", ([&] { - iterate_cuda( - XCorrVolFunctor(in0.data(), in1.data(), channels, height, width, n_disps, block_size, out.data()), - n_disps * height * width, 512); - })); -} - - - -void photometric_loss_forward_kernel(at::Tensor es, at::Tensor ta, int block_size, int type, float eps, at::Tensor out) { - auto batch_size = es.size(0); - auto channels = es.size(1); - auto height = es.size(2); - auto width = es.size(3); - - AT_DISPATCH_FLOATING_TYPES(es.scalar_type(), "photometric_loss_forward_cuda", ([&] { - if(type == PHOTOMETRIC_LOSS_MSE) { - iterate_cuda( - PhotometricLossForward(es.data(), ta.data(), block_size, eps, batch_size, channels, height, width, out.data()), - out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_SAD) { - iterate_cuda( - PhotometricLossForward(es.data(), ta.data(), block_size, eps, batch_size, channels, height, width, out.data()), - out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) { - iterate_cuda( - PhotometricLossForward(es.data(), ta.data(), block_size, eps, batch_size, channels, height, width, out.data()), - out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) { - iterate_cuda( - PhotometricLossForward(es.data(), ta.data(), block_size, eps, batch_size, channels, height, width, out.data()), - out.numel()); - } - })); -} - -void photometric_loss_backward_kernel(at::Tensor es, at::Tensor ta, at::Tensor grad_out, int block_size, int type, float eps, at::Tensor grad_in) { - auto batch_size = es.size(0); - auto channels = es.size(1); - auto height = es.size(2); - auto width = es.size(3); - - AT_DISPATCH_FLOATING_TYPES(es.scalar_type(), "photometric_loss_backward_cuda", ([&] { - if(type == PHOTOMETRIC_LOSS_MSE) { - iterate_cuda( - PhotometricLossBackward(es.data(), ta.data(), grad_out.data(), block_size, eps, batch_size, channels, height, width, grad_in.data()), - grad_out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_SAD) { - iterate_cuda( - PhotometricLossBackward(es.data(), ta.data(), grad_out.data(), block_size, eps, batch_size, channels, height, width, grad_in.data()), - grad_out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) { - iterate_cuda( - PhotometricLossBackward(es.data(), ta.data(), grad_out.data(), block_size, eps, batch_size, channels, height, width, grad_in.data()), - grad_out.numel()); - } - else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) { - iterate_cuda( - PhotometricLossBackward(es.data(), ta.data(), grad_out.data(), block_size, eps, batch_size, channels, height, width, grad_in.data()), - grad_out.numel()); - } - })); -} diff --git a/torchext/functions.py b/torchext/functions.py index 2885ae9..9ae1d4c 100644 --- a/torchext/functions.py +++ b/torchext/functions.py @@ -1,126 +1,4 @@ import torch -from . import ext_cpu -from . import ext_cuda - - -class NNFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, in0, in1): - args = (in0, in1) - if in0.is_cuda: - out = ext_cuda.nn_cuda(*args) - else: - out = ext_cpu.nn_cpu(*args) - return out - - @staticmethod - def backward(ctx, grad_out): - return None, None - - -def nn(in0, in1): - return NNFunction.apply(in0, in1) - - -class CrossCheckFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, in0, in1): - args = (in0, in1) - if in0.is_cuda: - out = ext_cuda.crosscheck_cuda(*args) - else: - out = ext_cpu.crosscheck_cpu(*args) - return out - - @staticmethod - def backward(ctx, grad_out): - return None, None - - -def crosscheck(in0, in1): - return CrossCheckFunction.apply(in0, in1) - - -class ProjNNFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, xyz0, xyz1, K, patch_size): - args = (xyz0, xyz1, K, patch_size) - if xyz0.is_cuda: - out = ext_cuda.proj_nn_cuda(*args) - else: - out = ext_cpu.proj_nn_cpu(*args) - return out - - @staticmethod - def backward(ctx, grad_out): - return None, None, None, None - - -def proj_nn(xyz0, xyz1, K, patch_size): - return ProjNNFunction.apply(xyz0, xyz1, K, patch_size) - - -class XCorrVolFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, in0, in1, n_disps, block_size): - args = (in0, in1, n_disps, block_size) - if in0.is_cuda: - out = ext_cuda.xcorrvol_cuda(*args) - else: - out = ext_cpu.xcorrvol_cpu(*args) - return out - - @staticmethod - def backward(ctx, grad_out): - return None, None, None, None - - -def xcorrvol(in0, in1, n_disps, block_size): - return XCorrVolFunction.apply(in0, in1, n_disps, block_size) - - -class PhotometricLossFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, es, ta, block_size, type, eps): - args = (es, ta, block_size, type, eps) - ctx.save_for_backward(es, ta) - ctx.block_size = block_size - ctx.type = type - ctx.eps = eps - if es.is_cuda: - out = ext_cuda.photometric_loss_forward(*args) - else: - out = ext_cpu.photometric_loss_forward(*args) - return out - - @staticmethod - def backward(ctx, grad_out): - es, ta = ctx.saved_tensors - block_size = ctx.block_size - type = ctx.type - eps = ctx.eps - args = (es, ta, grad_out.contiguous(), block_size, type, eps) - if grad_out.is_cuda: - grad_es = ext_cuda.photometric_loss_backward(*args) - else: - grad_es = ext_cpu.photometric_loss_backward(*args) - return grad_es, None, None, None, None - - -def photometric_loss(es, ta, block_size, type='mse', eps=0.1): - type = type.lower() - if type == 'mse': - type = 0 - elif type == 'sad': - type = 1 - elif type == 'census_mse': - type = 2 - elif type == 'census_sad': - type = 3 - else: - raise Exception('invalid loss type') - return PhotometricLossFunction.apply(es, ta, block_size, type, eps) - def photometric_loss_pytorch(es, ta, block_size, type='mse', eps=0.1): type = type.lower()