From f193f8d60156cb56c31bea0c5a950cd37ccc9dcb Mon Sep 17 00:00:00 2001
From: CptCaptain <nilskoch@posteo.de>
Date: Mon, 13 Dec 2021 17:34:28 +0100
Subject: [PATCH] Remove unnecessary code from torchext

---
 torchext/ext/co_types.h    |  10 --
 torchext/ext/common.h      | 135 ---------------
 torchext/ext/common_cuda.h | 173 ------------------
 torchext/ext/ext.h         | 347 -------------------------------------
 torchext/ext/ext_cpu.cpp   | 198 ---------------------
 torchext/ext/ext_cuda.cpp  | 135 ---------------
 torchext/ext/ext_kernel.cu | 112 ------------
 torchext/functions.py      | 122 -------------
 8 files changed, 1232 deletions(-)
 delete mode 100644 torchext/ext/co_types.h
 delete mode 100644 torchext/ext/common.h
 delete mode 100644 torchext/ext/common_cuda.h
 delete mode 100644 torchext/ext/ext.h
 delete mode 100644 torchext/ext/ext_cpu.cpp
 delete mode 100644 torchext/ext/ext_cuda.cpp
 delete mode 100644 torchext/ext/ext_kernel.cu
diff --git a/torchext/ext/co_types.h b/torchext/ext/co_types.h
deleted file mode 100644
index 6a9b9bb..0000000
--- a/torchext/ext/co_types.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef TYPES_H
-#define TYPES_H
-
-#ifdef __CUDA_ARCH__
-#define CPU_GPU_FUNCTION __host__ __device__
-#else
-#define CPU_GPU_FUNCTION
-#endif
-
-#endif
diff --git a/torchext/ext/common.h b/torchext/ext/common.h
deleted file mode 100644
index d94501f..0000000
--- a/torchext/ext/common.h
+++ /dev/null
@@ -1,135 +0,0 @@
-#ifndef COMMON_H
-#define COMMON_H
-
-#include "co_types.h"
-#include <cmath>
-#include <algorithm>
-
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
-
-#define DISABLE_COPY_AND_ASSIGN(classname) \
-private:\
-  classname(const classname&) = delete;\
-  classname& operator=(const classname&) = delete;
-
-
-template <typename T>
-CPU_GPU_FUNCTION
-void fill(T* arr, int N, T val) {
-  for(int idx = 0; idx < N; ++idx) {
-    arr[idx] = val;
-  }
-}
-
-template <typename T>
-CPU_GPU_FUNCTION
-void fill_zero(T* arr, int N) {
-  for(int idx = 0; idx < N; ++idx) {
-    arr[idx] = 0;
-  }
-}
-
-template <typename T>
-CPU_GPU_FUNCTION
-inline T distance_euclidean(const T* q, const T* t, int N) {
-  T out = 0;
-  for(int idx = 0; idx < N; idx++) {
-    T diff = q[idx] - t[idx];
-    out += diff * diff;
-  }
-  return out;
-}
-
-template <typename T>
-CPU_GPU_FUNCTION
-inline T distance_l2(const T* q, const T* t, int N) {
-  T out = distance_euclidean(q, t, N);
-  out = std::sqrt(out);
-  return out;
-}
-
-
-
-
-template <typename T>
-struct FillFunctor {
-  T* arr;
-  const T val;
-
-  FillFunctor(T* arr, const T val) : arr(arr), val(val) {}
-  CPU_GPU_FUNCTION void operator()(const int idx) {
-    arr[idx] = val;
-  }
-};
-
-template <typename T>
-CPU_GPU_FUNCTION
-T mmin(const T& a, const T& b) {
-#ifdef __CUDA_ARCH__
-  return min(a, b);
-#else
-  return std::min(a, b);
-#endif
-}
-
-template <typename T>
-CPU_GPU_FUNCTION
-T mmax(const T& a, const T& b) {
-#ifdef __CUDA_ARCH__
-  return max(a, b);
-#else
-  return std::max(a, b);
-#endif
-}
-
-template <typename T>
-CPU_GPU_FUNCTION
-T mround(const T& a) {
-#ifdef __CUDA_ARCH__
-  return round(a);
-#else
-  return round(a);
-#endif
-}
-
-
-#ifdef __CUDA_ARCH__
-#if __CUDA_ARCH__ < 600
-__device__ double atomicAdd(double* address, double val)
-{
-    unsigned long long int* address_as_ull =
-                              (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed;
-
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-                        __double_as_longlong(val +
-                               __longlong_as_double(assumed)));
-
-    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-    } while (assumed != old);
-
-    return __longlong_as_double(old);
-}
-#endif
-#endif
-
-
-template <typename T>
-CPU_GPU_FUNCTION
-void matomic_add(T* addr, T val) {
-#ifdef __CUDA_ARCH__
-  atomicAdd(addr, val);
-#else
-#if defined(_OPENMP)
-#pragma omp atomic
-#endif
-  *addr += val;
-#endif
-}
-
-#endif
diff --git a/torchext/ext/common_cuda.h b/torchext/ext/common_cuda.h
deleted file mode 100644
index 4fe0b8a..0000000
--- a/torchext/ext/common_cuda.h
+++ /dev/null
@@ -1,173 +0,0 @@
-#ifndef COMMON_CUDA
-#define COMMON_CUDA
-
-#include <cublas_v2.h>
-#include <stdio.h>
-
-#define DEBUG 0
-#define CUDA_DEBUG_DEVICE_SYNC 0
-
-// cuda check for cudaMalloc and so on
-#define CUDA_CHECK(condition) \
-  /* Code block avoids redefinition of cudaError_t error */ \
-  do { \
-    if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \
-    cudaError_t error = condition; \
-    if(error != cudaSuccess) { \
-      printf("%s in %s at %d\n", cudaGetErrorString(error), __FILE__, __LINE__); \
-      exit(-1); \
-    } \
-  } while (0)
-
-/// Get error string for error code.
-/// @param error
-inline const char* cublasGetErrorString(cublasStatus_t error) {
-  switch (error) {
-  case CUBLAS_STATUS_SUCCESS:
-    return "CUBLAS_STATUS_SUCCESS";
-  case CUBLAS_STATUS_NOT_INITIALIZED:
-    return "CUBLAS_STATUS_NOT_INITIALIZED";
-  case CUBLAS_STATUS_ALLOC_FAILED:
-    return "CUBLAS_STATUS_ALLOC_FAILED";
-  case CUBLAS_STATUS_INVALID_VALUE:
-    return "CUBLAS_STATUS_INVALID_VALUE";
-  case CUBLAS_STATUS_ARCH_MISMATCH:
-    return "CUBLAS_STATUS_ARCH_MISMATCH";
-  case CUBLAS_STATUS_MAPPING_ERROR:
-    return "CUBLAS_STATUS_MAPPING_ERROR";
-  case CUBLAS_STATUS_EXECUTION_FAILED:
-    return "CUBLAS_STATUS_EXECUTION_FAILED";
-  case CUBLAS_STATUS_INTERNAL_ERROR:
-    return "CUBLAS_STATUS_INTERNAL_ERROR";
-  case CUBLAS_STATUS_NOT_SUPPORTED:
-    return "CUBLAS_STATUS_NOT_SUPPORTED";
-  case CUBLAS_STATUS_LICENSE_ERROR:
-    return "CUBLAS_STATUS_LICENSE_ERROR";
-  }
-  return "Unknown cublas status";
-}
-
-#define CUBLAS_CHECK(condition) \
-  do { \
-    if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \
-    cublasStatus_t status = condition; \
-    if(status != CUBLAS_STATUS_SUCCESS) { \
-      printf("%s in %s at %d\n", cublasGetErrorString(status), __FILE__, __LINE__); \
-      exit(-1); \
-    } \
-  } while (0)
-
-// check if there is a error after kernel execution
-#define CUDA_POST_KERNEL_CHECK \
-  CUDA_CHECK(cudaPeekAtLastError()); \
-  CUDA_CHECK(cudaGetLastError()); 
-
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
-
-const int CUDA_NUM_THREADS = 1024;
-
-inline int GET_BLOCKS(const int N, const int N_THREADS=CUDA_NUM_THREADS) {
-  return (N + N_THREADS - 1) / N_THREADS;
-}
-
-template<typename T>
-T* device_malloc(long N) {
-  T* dptr;
-  CUDA_CHECK(cudaMalloc(&dptr, N * sizeof(T)));
-  if(DEBUG) { printf("[DEBUG] device_malloc %p, %ld\n", dptr, N); }
-  return dptr;
-}
-
-template<typename T>
-void device_free(T* dptr) {
-  if(DEBUG) { printf("[DEBUG] device_free %p\n", dptr); }
-  CUDA_CHECK(cudaFree(dptr));
-}
-
-template<typename T>
-void host_to_device(const T* hptr, T* dptr, long N) {
-  if(DEBUG) { printf("[DEBUG] host_to_device %p => %p, %ld\n", hptr, dptr, N); }
-  CUDA_CHECK(cudaMemcpy(dptr, hptr, N * sizeof(T), cudaMemcpyHostToDevice));
-}
-
-template<typename T>
-T* host_to_device_malloc(const T* hptr, long N) {
-  T* dptr = device_malloc<T>(N);
-  host_to_device(hptr, dptr, N);
-  return dptr;
-}
-
-template<typename T>
-void device_to_host(const T* dptr, T* hptr, long N) {
-  if(DEBUG) { printf("[DEBUG] device_to_host %p => %p, %ld\n", dptr, hptr, N); }
-  CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToHost));
-}
-
-template<typename T>
-T* device_to_host_malloc(const T* dptr, long N) {
-  T* hptr = new T[N];
-  device_to_host(dptr, hptr, N);
-  return hptr;
-}
-
-template<typename T>
-void device_to_device(const T* dptr, T* hptr, long N) {
-  if(DEBUG) { printf("[DEBUG] device_to_device %p => %p, %ld\n", dptr, hptr, N); }
-  CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToDevice));
-}
-
-// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu
-// https://github.com/treecode/Bonsai/blob/master/runtime/profiling/derived_atomic_functions.h
-__device__ __forceinline__  void atomicMaxF(float * const address, const float value) {
-  if (*address >= value) {
-    return;
-  }
-
-  int * const address_as_i = (int *)address;
-  int old = * address_as_i, assumed;
-
-  do {
-    assumed = old;
-    if (__int_as_float(assumed) >= value) {
-      break;
-    }
-
-    old = atomicCAS(address_as_i, assumed, __float_as_int(value));
-  } while (assumed != old);
-}
-
-__device__ __forceinline__  void atomicMinF(float * const address, const float value) {
-  if (*address <= value) {
-    return;
-  }
-
-  int * const address_as_i = (int *)address;
-  int old = * address_as_i, assumed;
-
-  do {
-    assumed = old;
-    if (__int_as_float(assumed) <= value) {
-      break;
-    }
-
-    old = atomicCAS(address_as_i, assumed, __float_as_int(value));
-  } while (assumed != old);
-}
-
-
-template <typename FunctorT>
-__global__ void iterate_kernel(FunctorT functor, int N) {
-  CUDA_KERNEL_LOOP(idx, N) {
-    functor(idx);
-  }
-}
-
-template <typename FunctorT>
-void iterate_cuda(FunctorT functor, int N, int N_THREADS=CUDA_NUM_THREADS) {
-  iterate_kernel<<<GET_BLOCKS(N, N_THREADS), N_THREADS>>>(functor, N);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-
-#endif
diff --git a/torchext/ext/ext.h b/torchext/ext/ext.h
deleted file mode 100644
index 21285ec..0000000
--- a/torchext/ext/ext.h
+++ /dev/null
@@ -1,347 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-
-#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-
-#define CHECK_INPUT_CPU(x) CHECK_CONTIGUOUS(x)
-#define CHECK_INPUT_CUDA(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
-
-
-template <typename T, int dim=3>
-struct NNFunctor {
-  const T* in0; // nelem0 x dim
-  const T* in1; // nelem1 x dim
-  const long nelem0;
-  const long nelem1;
-  long* out; // nelem0
-
-  NNFunctor(const T* in0, const T* in1, long nelem0, long nelem1, long* out) : in0(in0), in1(in1), nelem0(nelem0), nelem1(nelem1), out(out) {}
-
-  CPU_GPU_FUNCTION void operator()(long idx0) {
-    // idx0 \in [nelem0]
-
-    const T* vec0 = in0 + idx0 * dim;
-
-    T min_dist = 1e9;
-    long min_arg = -1;
-    for(long idx1 = 0; idx1 < nelem1; ++idx1) {
-      const T* vec1 = in1 + idx1 * dim;
-      T dist = 0;
-      for(long didx = 0; didx < dim; ++didx) {
-        T diff = vec0[didx] - vec1[didx];
-        dist += diff * diff;
-      }
-
-      if(dist < min_dist) {
-        min_dist = dist;
-        min_arg = idx1;
-      }
-    }
-
-    out[idx0] = min_arg;
-  }
-};
-
-struct CrossCheckFunctor {
-  const long* in0; // nelem0
-  const long* in1; // nelem1
-  const long nelem0;
-  const long nelem1;
-  uint8_t* out; // nelem0
-
-  CrossCheckFunctor(const long* in0, const long* in1, long nelem0, long nelem1, uint8_t* out) : in0(in0), in1(in1), nelem0(nelem0), nelem1(nelem1), out(out) {}
-
-  CPU_GPU_FUNCTION void operator()(long idx0) {
-    // idx0 \in [nelem0]
-    int idx1 = in0[idx0];
-    out[idx0] = idx1 >=0 && in1[idx1] >= 0 && idx0 == in1[idx1];
-    // out[idx0] = idx0 == in1[in0[idx0]];
-  }
-};
-
-template <typename T, int dim=3>
-struct ProjNNFunctor {
-  // xyz0, xyz1 in coord sys of 1
-  const T* xyz0; // bs x height x width x 3
-  const T* xyz1; // bs x height x width x 3
-  const T* K; // 3 x 3
-  const long batch_size;
-  const long height;
-  const long width;
-  const long patch_size;
-  long* out; // bs x height x width
-
-  ProjNNFunctor(const T* xyz0, const T* xyz1, const T* K, long batch_size, long height, long width, long patch_size, long* out)
-    : xyz0(xyz0), xyz1(xyz1), K(K), batch_size(batch_size), height(height), width(width), patch_size(patch_size), out(out) {}
-
-  CPU_GPU_FUNCTION void operator()(long idx0) {
-    // idx0 \in [0, bs x height x width]
-
-    const long bs = idx0 / (height * width);
-
-    const T x = xyz0[idx0 * 3 + 0];
-    const T y = xyz0[idx0 * 3 + 1];
-    const T z = xyz0[idx0 * 3 + 2];
-    const T d = K[6] * x + K[7] * y + K[8] * z;
-    const T u = (K[0] * x + K[1] * y + K[2] * z) / d;
-    const T v = (K[3] * x + K[4] * y + K[5] * z) / d;
-
-    int u0 = u + 0.5;
-    int v0 = v + 0.5;
-
-    long min_idx1 = -1;
-    T min_dist = 1e9;
-    for(int pidx = 0; pidx < patch_size*patch_size; ++pidx) {
-      int pu = pidx % patch_size;
-      int pv = pidx / patch_size;
-
-      int u1 = u0 + pu - patch_size/2;
-      int v1 = v0 + pv - patch_size/2;
-
-      if(u1 >= 0 && v1 >= 0 && u1 < width && v1 < height) {
-        const long idx1 = (bs * height + v1) * width + u1;
-        const T* xyz1n = xyz1 + idx1 * 3;
-        const T d = (x-xyz1n[0]) * (x-xyz1n[0]) + (y-xyz1n[1]) * (y-xyz1n[1]) + (z-xyz1n[2]) * (z-xyz1n[2]);
-        if(d < min_dist) {
-          min_dist = d;
-          min_idx1 = idx1;
-        }
-      }
-    }
-
-    out[idx0] = min_idx1;
-  }
-};
-
-
-template <typename T, int dim=3>
-struct XCorrVolFunctor {
-  const T* in0; // channels x height x width
-  const T* in1; // channels x height x width
-  const long channels;
-  const long height;
-  const long width;
-  const long n_disps;
-  const long block_size;
-  T* out; // nelem0
-
-  XCorrVolFunctor(const T* in0, const T* in1, long channels, long height, long width, long n_disps, long block_size, T* out) : in0(in0), in1(in1), channels(channels), height(height), width(width), n_disps(n_disps), block_size(block_size), out(out) {}
-
-  CPU_GPU_FUNCTION void operator()(long oidx) {
-    // idx0 \in [n_disps x height x width]
-
-    auto d = oidx / (height * width);
-    auto h = (oidx / width) % height;
-    auto w = oidx % width;
-
-    long block_size2 = block_size * block_size;
-
-    T val = 0;
-    for(int c = 0; c < channels; ++c) {
-      // compute means
-      T mu0 = 0;
-      T mu1 = 0;
-      for(int bh = 0; bh < block_size; ++bh) {
-        long h0 = h + bh - block_size / 2;
-        h0 = mmax(long(0), mmin(height-1, h0));
-        for(int bw = 0; bw < block_size; ++bw) {
-          long w0 = w + bw - block_size / 2;
-          long w1 = w0 - d;
-          w0 = mmax(long(0), mmin(width-1, w0));
-          w1 = mmax(long(0), mmin(width-1, w1));
-          long idx0 = (c * height + h0) * width + w0;
-          long idx1 = (c * height + h0) * width + w1;
-          mu0 += in0[idx0] / block_size2;
-          mu1 += in1[idx1] / block_size2;
-        }
-      }
-
-      // compute stds and dot product
-      T sigma0 = 0;
-      T sigma1 = 0;
-      T dot = 0;
-      for(int bh = 0; bh < block_size; ++bh) {
-        long h0 = h + bh - block_size / 2;
-        h0 = mmax(long(0), mmin(height-1, h0));
-        for(int bw = 0; bw < block_size; ++bw) {
-          long w0 = w + bw - block_size / 2;
-          long w1 = w0 - d;
-          w0 = mmax(long(0), mmin(width-1, w0));
-          w1 = mmax(long(0), mmin(width-1, w1));
-          long idx0 = (c * height + h0) * width + w0;
-          long idx1 = (c * height + h0) * width + w1;
-          T v0 = in0[idx0] - mu0;
-          T v1 = in1[idx1] - mu1;
-
-          dot += v0 * v1;
-          sigma0 += v0 * v0;
-          sigma1 += v1 * v1;
-        }
-      }
-
-      T norm = sqrt(sigma0 * sigma1) + 1e-8;
-      val += dot / norm;
-    }
-
-    out[oidx] = val;
-  }
-};
-
-
-
-
-const int PHOTOMETRIC_LOSS_MSE = 0;
-const int PHOTOMETRIC_LOSS_SAD = 1;
-const int PHOTOMETRIC_LOSS_CENSUS_MSE = 2;
-const int PHOTOMETRIC_LOSS_CENSUS_SAD = 3;
-
-template <typename T, int type>
-struct PhotometricLossForward {
-  const T* es;  // batch_size x channels x height x width;
-  const T* ta;
-  const int block_size;
-  const int block_size2;
-  const T eps;
-  const int batch_size;
-  const int channels;
-  const int height;
-  const int width;
-  T* out;  // batch_size x channels x height x width;
-
-  PhotometricLossForward(const T* es, const T* ta, int block_size, T eps, int batch_size, int channels, int height, int width, T* out) :
-    es(es), ta(ta), block_size(block_size), block_size2(block_size*block_size), eps(eps), batch_size(batch_size), channels(channels), height(height), width(width), out(out) {}
-
-  CPU_GPU_FUNCTION void operator()(int outidx) {
-    // outidx \in [0, batch_size x height x width]
-
-    int w = outidx % width;
-    int h = (outidx / width) % height;
-    int n = outidx / (height * width);
-
-    T loss = 0;
-    for(int bidx = 0; bidx < block_size2; ++bidx) {
-      int bh = bidx / block_size;
-      int bw = bidx % block_size;
-      int h0 = h + bh - block_size / 2;
-      int w0 = w + bw - block_size / 2;
-
-      h0 = mmin(height-1, mmax(0, h0));
-      w0 = mmin(width-1, mmax(0, w0));
-
-      for(int c = 0; c < channels; ++c) {
-        int inidx = ((n * channels + c) * height + h0) * width + w0;
-        if(type == PHOTOMETRIC_LOSS_SAD || type == PHOTOMETRIC_LOSS_MSE) {
-          T diff = es[inidx] - ta[inidx];
-          if(type == PHOTOMETRIC_LOSS_MSE) {
-            loss += diff * diff / block_size2;
-          }
-          else if(type == PHOTOMETRIC_LOSS_SAD) {
-            loss += fabs(diff) / block_size2;
-          }
-        }
-        else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD || type == PHOTOMETRIC_LOSS_CENSUS_MSE) {
-          int inidxc = ((n * channels + c) * height + h) * width + w;
-          T des = es[inidx] - es[inidxc];
-          T dta = ta[inidx] - ta[inidxc];
-          T h_des = 0.5 * (1 + des / sqrt(des * des + eps));
-          T h_dta = 0.5 * (1 + dta / sqrt(dta * dta + eps));
-          T diff = h_des - h_dta;
-          // printf("%d,%d %d,%d: des=%f, dta=%f, h_des=%f, h_dta=%f, diff=%f\n", h,w, h0,w0, des,dta, h_des,h_dta, diff);
-          // printf("%d,%d %d,%d: h_des=%f = 0.5 * (1 + %f / %f); %f, %f, %f\n", h,w, h0,w0, h_des, des, sqrt(des * des + eps), des*des, des*des+eps, eps);
-          if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) {
-            loss += diff * diff / block_size2;
-          }
-          else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) {
-            loss += fabs(diff) / block_size2;
-          }
-        }
-      }
-    }
-
-    out[outidx] = loss;
-  }
-};
-
-template <typename T, int type>
-struct PhotometricLossBackward {
-  const T* es;  // batch_size x channels x height x width;
-  const T* ta;
-  const T* grad_out;
-  const int block_size;
-  const int block_size2;
-  const T eps;
-  const int batch_size;
-  const int channels;
-  const int height;
-  const int width;
-  T* grad_in;  // batch_size x channels x height x width;
-
-  PhotometricLossBackward(const T* es, const T* ta, const T* grad_out, int block_size, T eps, int batch_size, int channels, int height, int width, T* grad_in) :
-    es(es), ta(ta), grad_out(grad_out), block_size(block_size), block_size2(block_size*block_size), eps(eps), batch_size(batch_size), channels(channels), height(height), width(width), grad_in(grad_in) {}
-
-  CPU_GPU_FUNCTION void operator()(int outidx) {
-    // outidx \in [0, batch_size x height x width]
-
-    int w = outidx % width;
-    int h = (outidx / width) % height;
-    int n = outidx / (height * width);
-
-    for(int bidx = 0; bidx < block_size2; ++bidx) {
-      int bh = bidx / block_size;
-      int bw = bidx % block_size;
-      int h0 = h + bh - block_size / 2;
-      int w0 = w + bw - block_size / 2;
-
-      h0 = mmin(height-1, mmax(0, h0));
-      w0 = mmin(width-1, mmax(0, w0));
-
-      const T go = grad_out[outidx];
-
-      for(int c = 0; c < channels; ++c) {
-        int inidx = ((n * channels + c) * height + h0) * width + w0;
-        if(type == PHOTOMETRIC_LOSS_SAD || type == PHOTOMETRIC_LOSS_MSE) {
-          T diff = es[inidx] - ta[inidx];
-          T grad = 0;
-          if(type == PHOTOMETRIC_LOSS_MSE) {
-            grad = 2 * diff;
-          }
-          else if(type == PHOTOMETRIC_LOSS_SAD) {
-            grad = diff < 0 ? -1 : (diff > 0 ? 1 : 0);
-          }
-          grad = grad / block_size2 * go;
-          matomic_add(grad_in + inidx, grad);
-        }
-        else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD || type == PHOTOMETRIC_LOSS_CENSUS_MSE) {
-          int inidxc = ((n * channels + c) * height + h) * width + w;
-          T des = es[inidx] - es[inidxc];
-          T dta = ta[inidx] - ta[inidxc];
-          T h_des = 0.5 * (1 + des / sqrt(des * des + eps));
-          T h_dta = 0.5 * (1 + dta / sqrt(dta * dta + eps));
-          T diff = h_des - h_dta;
-
-          T grad_loss = 0;
-          if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) {
-            grad_loss = 2 * diff;
-          }
-          else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) {
-            grad_loss = diff < 0 ? -1 : (diff > 0 ? 1 : 0);
-          }
-          grad_loss = grad_loss / block_size2;
-
-          T tmp = des * des + eps;
-          T grad_heaviside = 0.5 * eps / sqrt(tmp * tmp * tmp);
-
-          T grad = go * grad_loss * grad_heaviside;
-          matomic_add(grad_in + inidx, grad);
-          matomic_add(grad_in + inidxc, -grad);
-        }
-      }
-    }
-  }
-};
-
-
-
diff --git a/torchext/ext/ext_cpu.cpp b/torchext/ext/ext_cpu.cpp
deleted file mode 100644
index 436f2a2..0000000
--- a/torchext/ext/ext_cpu.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-#include <torch/extension.h>
-
-#include <iostream>
-
-#include "ext.h"
-
-template <typename FunctorT>
-void iterate_cpu(FunctorT functor, int N) {
-  for(int idx = 0; idx < N; ++idx) {
-    functor(idx);
-  }
-}
-
-at::Tensor nn_cpu(at::Tensor in0, at::Tensor in1) {
-  CHECK_INPUT_CPU(in0);
-  CHECK_INPUT_CPU(in1);
-
-  auto nelem0 = in0.size(0);
-  auto nelem1 = in1.size(0);
-  auto dim = in0.size(1);
-
-  AT_ASSERTM(dim == in1.size(1), "in0 and in1 have to be the same shape");
-  AT_ASSERTM(dim == 3, "dim hast to be 3");
-  AT_ASSERTM(in0.dim() == 2, "in0 has to be N0 x 3");
-  AT_ASSERTM(in1.dim() == 2, "in1 has to be N1 x 3");
-
-  auto out = at::empty({nelem0}, torch::CPU(at::kLong));
-
-  AT_DISPATCH_FLOATING_TYPES(in0.scalar_type(), "nn", ([&] {
-    iterate_cpu(
-      NNFunctor<scalar_t>(in0.data<scalar_t>(), in1.data<scalar_t>(), nelem0, nelem1, out.data<long>()),
-      nelem0);
-  }));
-
-  return out;
-}
-
-
-at::Tensor crosscheck_cpu(at::Tensor in0, at::Tensor in1) {
-  CHECK_INPUT_CPU(in0);
-  CHECK_INPUT_CPU(in1);
-
-  AT_ASSERTM(in0.dim() == 1, "");
-  AT_ASSERTM(in1.dim() == 1, "");
-
-  auto nelem0 = in0.size(0);
-  auto nelem1 = in1.size(0);
-
-  auto out = at::empty({nelem0}, torch::CPU(at::kByte));
-
-  iterate_cpu(
-    CrossCheckFunctor(in0.data<long>(), in1.data<long>(), nelem0, nelem1, out.data<uint8_t>()),
-    nelem0);
-
-  return out;
-}
-
-
-at::Tensor proj_nn_cpu(at::Tensor xyz0, at::Tensor xyz1, at::Tensor K, int patch_size) {
-  CHECK_INPUT_CPU(xyz0);
-  CHECK_INPUT_CPU(xyz1);
-  CHECK_INPUT_CPU(K);
-
-  auto batch_size = xyz0.size(0);
-  auto height = xyz0.size(1);
-  auto width = xyz0.size(2);
-
-  AT_ASSERTM(xyz0.size(0) == xyz1.size(0), "");
-  AT_ASSERTM(xyz0.size(1) == xyz1.size(1), "");
-  AT_ASSERTM(xyz0.size(2) == xyz1.size(2), "");
-  AT_ASSERTM(xyz0.size(3) == xyz1.size(3), "");
-  AT_ASSERTM(xyz0.size(3) == 3, "");
-  AT_ASSERTM(xyz0.dim() == 4, "");
-  AT_ASSERTM(xyz1.dim() == 4, "");
-
-  auto out = at::empty({batch_size, height, width}, torch::CPU(at::kLong));
-
-  AT_DISPATCH_FLOATING_TYPES(xyz0.scalar_type(), "proj_nn", ([&] {
-    iterate_cpu(
-      ProjNNFunctor<scalar_t>(xyz0.data<scalar_t>(), xyz1.data<scalar_t>(), K.data<scalar_t>(), batch_size, height, width, patch_size, out.data<long>()),
-      batch_size * height * width);
-  }));
-
-  return out;
-}
-
-
-at::Tensor xcorrvol_cpu(at::Tensor in0, at::Tensor in1, int n_disps, int block_size) {
-  CHECK_INPUT_CPU(in0);
-  CHECK_INPUT_CPU(in1);
-
-  auto channels = in0.size(0);
-  auto height = in0.size(1);
-  auto width = in0.size(2);
-
-  auto out = at::empty({n_disps, height, width}, in0.options());
-
-  AT_DISPATCH_FLOATING_TYPES(in0.scalar_type(), "xcorrvol", ([&] {
-    iterate_cpu(
-      XCorrVolFunctor<scalar_t>(in0.data<scalar_t>(), in1.data<scalar_t>(), channels, height, width, n_disps, block_size, out.data<scalar_t>()),
-      n_disps * height * width);
-  }));
-
-  return out;
-}
-
-
-
-
-at::Tensor photometric_loss_forward(at::Tensor es, at::Tensor ta, int block_size, int type, float eps) {
-  CHECK_INPUT_CPU(es);
-  CHECK_INPUT_CPU(ta);
-
-  auto batch_size = es.size(0);
-  auto channels = es.size(1);
-  auto height = es.size(2);
-  auto width = es.size(3);
-
-  auto out = at::empty({batch_size, 1, height, width}, es.options());
-
-  AT_DISPATCH_FLOATING_TYPES(es.scalar_type(), "photometric_loss_forward_cpu", ([&] {
-    if(type == PHOTOMETRIC_LOSS_MSE) {
-      iterate_cpu(
-          PhotometricLossForward<scalar_t, PHOTOMETRIC_LOSS_MSE>(es.data<scalar_t>(), ta.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, out.data<scalar_t>()),
-          out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_SAD) {
-      iterate_cpu(
-          PhotometricLossForward<scalar_t, PHOTOMETRIC_LOSS_SAD>(es.data<scalar_t>(), ta.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, out.data<scalar_t>()),
-          out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) {
-      iterate_cpu(
-          PhotometricLossForward<scalar_t, PHOTOMETRIC_LOSS_CENSUS_MSE>(es.data<scalar_t>(), ta.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, out.data<scalar_t>()),
-          out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) {
-      iterate_cpu(
-          PhotometricLossForward<scalar_t, PHOTOMETRIC_LOSS_CENSUS_SAD>(es.data<scalar_t>(), ta.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, out.data<scalar_t>()),
-          out.numel());
-    }
-  }));
-
-  return out;
-}
-
-at::Tensor photometric_loss_backward(at::Tensor es, at::Tensor ta, at::Tensor grad_out, int block_size, int type, float eps) {
-  CHECK_INPUT_CPU(es);
-  CHECK_INPUT_CPU(ta);
-  CHECK_INPUT_CPU(grad_out);
-
-  auto batch_size = es.size(0);
-  auto channels = es.size(1);
-  auto height = es.size(2);
-  auto width = es.size(3);
-
-  CHECK_INPUT_CPU(ta);
-  auto grad_in = at::zeros({batch_size, channels, height, width}, grad_out.options());
-
-  AT_DISPATCH_FLOATING_TYPES(es.scalar_type(), "photometric_loss_backward_cpu", ([&] {
-    if(type == PHOTOMETRIC_LOSS_MSE) {
-      iterate_cpu(
-          PhotometricLossBackward<scalar_t, PHOTOMETRIC_LOSS_MSE>(es.data<scalar_t>(), ta.data<scalar_t>(), grad_out.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, grad_in.data<scalar_t>()),
-          grad_out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_SAD) {
-      iterate_cpu(
-          PhotometricLossBackward<scalar_t, PHOTOMETRIC_LOSS_SAD>(es.data<scalar_t>(), ta.data<scalar_t>(), grad_out.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, grad_in.data<scalar_t>()),
-          grad_out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) {
-      iterate_cpu(
-          PhotometricLossBackward<scalar_t, PHOTOMETRIC_LOSS_CENSUS_MSE>(es.data<scalar_t>(), ta.data<scalar_t>(), grad_out.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, grad_in.data<scalar_t>()),
-          grad_out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) {
-      iterate_cpu(
-          PhotometricLossBackward<scalar_t, PHOTOMETRIC_LOSS_CENSUS_SAD>(es.data<scalar_t>(), ta.data<scalar_t>(), grad_out.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, grad_in.data<scalar_t>()),
-          grad_out.numel());
-    }
-  }));
-
-  return grad_in;
-}
-
-
-
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("nn_cpu", &nn_cpu, "nn_cpu");
-  m.def("crosscheck_cpu", &crosscheck_cpu, "crosscheck_cpu");
-  m.def("proj_nn_cpu", &proj_nn_cpu, "proj_nn_cpu");
-
-  m.def("xcorrvol_cpu", &xcorrvol_cpu, "xcorrvol_cpu");
-
-  m.def("photometric_loss_forward", &photometric_loss_forward);
-  m.def("photometric_loss_backward", &photometric_loss_backward);
-}
diff --git a/torchext/ext/ext_cuda.cpp b/torchext/ext/ext_cuda.cpp
deleted file mode 100644
index d6c93ca..0000000
--- a/torchext/ext/ext_cuda.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-#include <torch/extension.h>
-
-#include <iostream>
-
-#include "ext.h"
-
-void nn_kernel(at::Tensor in0, at::Tensor in1, at::Tensor out);
-
-at::Tensor nn_cuda(at::Tensor in0, at::Tensor in1) {
-  CHECK_INPUT_CUDA(in0);
-  CHECK_INPUT_CUDA(in1);
-
-  auto nelem0 = in0.size(0);
-  auto dim = in0.size(1);
-
-  AT_ASSERTM(dim == in1.size(1), "in0 and in1 have to be the same shape");
-  AT_ASSERTM(dim == 3, "dim hast to be 3");
-  AT_ASSERTM(in0.dim() == 2, "in0 has to be N0 x 3");
-  AT_ASSERTM(in1.dim() == 2, "in1 has to be N1 x 3");
-
-  auto out = at::empty({nelem0}, torch::CUDA(at::kLong));
-
-  nn_kernel(in0, in1, out);
-
-  return out;
-}
-
-
-void crosscheck_kernel(at::Tensor in0, at::Tensor in1, at::Tensor out);
-
-at::Tensor crosscheck_cuda(at::Tensor in0, at::Tensor in1) {
-  CHECK_INPUT_CUDA(in0);
-  CHECK_INPUT_CUDA(in1);
-
-  AT_ASSERTM(in0.dim() == 1, "");
-  AT_ASSERTM(in1.dim() == 1, "");
-
-  auto nelem0 = in0.size(0);
-  auto out = at::empty({nelem0}, torch::CUDA(at::kByte));
-  crosscheck_kernel(in0, in1, out);
-
-  return out;
-}
-
-void proj_nn_kernel(at::Tensor xyz0, at::Tensor xyz1, at::Tensor K, int patch_size, at::Tensor out);
-
-at::Tensor proj_nn_cuda(at::Tensor xyz0, at::Tensor xyz1, at::Tensor K, int patch_size) {
-  CHECK_INPUT_CUDA(xyz0);
-  CHECK_INPUT_CUDA(xyz1);
-  CHECK_INPUT_CUDA(K);
-
-  auto batch_size = xyz0.size(0);
-  auto height = xyz0.size(1);
-  auto width = xyz0.size(2);
-
-  AT_ASSERTM(xyz0.size(0) == xyz1.size(0), "");
-  AT_ASSERTM(xyz0.size(1) == xyz1.size(1), "");
-  AT_ASSERTM(xyz0.size(2) == xyz1.size(2), "");
-  AT_ASSERTM(xyz0.size(3) == xyz1.size(3), "");
-  AT_ASSERTM(xyz0.size(3) == 3, "");
-  AT_ASSERTM(xyz0.dim() == 4, "");
-  AT_ASSERTM(xyz1.dim() == 4, "");
-
-  auto out = at::empty({batch_size, height, width}, torch::CUDA(at::kLong));
-
-  proj_nn_kernel(xyz0, xyz1, K, patch_size, out);
-
-  return out;
-}
-
-void xcorrvol_kernel(at::Tensor in0, at::Tensor in1, int n_disps, int block_size, at::Tensor out);
-
-at::Tensor xcorrvol_cuda(at::Tensor in0, at::Tensor in1, int n_disps, int block_size) {
-  CHECK_INPUT_CUDA(in0);
-  CHECK_INPUT_CUDA(in1);
-
-  // auto channels = in0.size(0);
-  auto height = in0.size(1);
-  auto width = in0.size(2);
-
-  auto out = at::empty({n_disps, height, width}, in0.options());
-
-  xcorrvol_kernel(in0, in1, n_disps, block_size, out);
-
-  return out;
-}
-
-
-
-void photometric_loss_forward_kernel(at::Tensor es, at::Tensor ta, int block_size, int type, float eps, at::Tensor out);
-
-at::Tensor photometric_loss_forward(at::Tensor es, at::Tensor ta, int block_size, int type, float eps) {
-  CHECK_INPUT_CUDA(es);
-  CHECK_INPUT_CUDA(ta);
-
-  auto batch_size = es.size(0);
-  auto height = es.size(2);
-  auto width = es.size(3);
-
-  auto out = at::empty({batch_size, 1, height, width}, es.options());
-  photometric_loss_forward_kernel(es, ta, block_size, type, eps, out);
-
-  return out;
-}
-
-
-void photometric_loss_backward_kernel(at::Tensor es, at::Tensor ta, at::Tensor grad_out, int block_size, int type, float eps, at::Tensor grad_in);
-
-at::Tensor photometric_loss_backward(at::Tensor es, at::Tensor ta, at::Tensor grad_out, int block_size, int type, float eps) {
-  CHECK_INPUT_CUDA(es);
-  CHECK_INPUT_CUDA(ta);
-  CHECK_INPUT_CUDA(grad_out);
-
-  auto batch_size = es.size(0);
-  auto channels = es.size(1);
-  auto height = es.size(2);
-  auto width = es.size(3);
-
-  auto grad_in = at::zeros({batch_size, channels, height, width}, grad_out.options());
-  photometric_loss_backward_kernel(es, ta, grad_out, block_size, type, eps, grad_in);
-
-  return grad_in;
-}
-
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("nn_cuda", &nn_cuda, "nn_cuda");
-  m.def("crosscheck_cuda", &crosscheck_cuda, "crosscheck_cuda");
-  m.def("proj_nn_cuda", &proj_nn_cuda, "proj_nn_cuda");
-
-  m.def("xcorrvol_cuda", &xcorrvol_cuda, "xcorrvol_cuda");
-
-  m.def("photometric_loss_forward", &photometric_loss_forward);
-  m.def("photometric_loss_backward", &photometric_loss_backward);
-}
diff --git a/torchext/ext/ext_kernel.cu b/torchext/ext/ext_kernel.cu
deleted file mode 100644
index 46df268..0000000
--- a/torchext/ext/ext_kernel.cu
+++ /dev/null
@@ -1,112 +0,0 @@
-#include <ATen/ATen.h>
-
-#include "ext.h"
-#include "common_cuda.h"
-
-void nn_kernel(at::Tensor in0, at::Tensor in1, at::Tensor out) {
-  auto nelem0 = in0.size(0);
-  auto nelem1 = in1.size(0);
-  auto dim = in0.size(1);
-
-  AT_DISPATCH_FLOATING_TYPES(in0.scalar_type(), "nn", ([&] {
-    iterate_cuda(
-      NNFunctor<scalar_t>(in0.data<scalar_t>(), in1.data<scalar_t>(), nelem0, nelem1, out.data<long>()),
-      nelem0);
-  }));
-}
-
-
-void crosscheck_kernel(at::Tensor in0, at::Tensor in1, at::Tensor out) {
-  auto nelem0 = in0.size(0);
-  auto nelem1 = in1.size(0);
-
-  iterate_cuda(
-    CrossCheckFunctor(in0.data<long>(), in1.data<long>(), nelem0, nelem1, out.data<uint8_t>()),
-    nelem0);
-}
-
-void proj_nn_kernel(at::Tensor xyz0, at::Tensor xyz1, at::Tensor K, int patch_size, at::Tensor out) {
-  auto batch_size = xyz0.size(0);
-  auto height = xyz0.size(1);
-  auto width = xyz0.size(2);
-
-  AT_DISPATCH_FLOATING_TYPES(xyz0.scalar_type(), "proj_nn", ([&] {
-    iterate_cuda(
-      ProjNNFunctor<scalar_t>(xyz0.data<scalar_t>(), xyz1.data<scalar_t>(), K.data<scalar_t>(), batch_size, height, width, patch_size, out.data<long>()),
-      batch_size * height * width);
-  }));
-}
-
-void xcorrvol_kernel(at::Tensor in0, at::Tensor in1, int n_disps, int block_size, at::Tensor out) {
-  auto channels = in0.size(0);
-  auto height = in0.size(1);
-  auto width = in0.size(2);
-
-  AT_DISPATCH_FLOATING_TYPES(in0.scalar_type(), "xcorrvol", ([&] {
-    iterate_cuda(
-      XCorrVolFunctor<scalar_t>(in0.data<scalar_t>(), in1.data<scalar_t>(), channels, height, width, n_disps, block_size, out.data<scalar_t>()),
-      n_disps * height * width, 512);
-  }));
-}
-
-
-
-void photometric_loss_forward_kernel(at::Tensor es, at::Tensor ta, int block_size, int type, float eps, at::Tensor out) {
-  auto batch_size = es.size(0);
-  auto channels = es.size(1);
-  auto height = es.size(2);
-  auto width = es.size(3);
-
-  AT_DISPATCH_FLOATING_TYPES(es.scalar_type(), "photometric_loss_forward_cuda", ([&] {
-    if(type == PHOTOMETRIC_LOSS_MSE) {
-      iterate_cuda(
-          PhotometricLossForward<scalar_t, PHOTOMETRIC_LOSS_MSE>(es.data<scalar_t>(), ta.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, out.data<scalar_t>()),
-          out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_SAD) {
-      iterate_cuda(
-          PhotometricLossForward<scalar_t, PHOTOMETRIC_LOSS_SAD>(es.data<scalar_t>(), ta.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, out.data<scalar_t>()),
-          out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) {
-      iterate_cuda(
-          PhotometricLossForward<scalar_t, PHOTOMETRIC_LOSS_CENSUS_MSE>(es.data<scalar_t>(), ta.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, out.data<scalar_t>()),
-          out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) {
-      iterate_cuda(
-          PhotometricLossForward<scalar_t, PHOTOMETRIC_LOSS_CENSUS_SAD>(es.data<scalar_t>(), ta.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, out.data<scalar_t>()),
-          out.numel());
-    }
-  }));
-}
-
-void photometric_loss_backward_kernel(at::Tensor es, at::Tensor ta, at::Tensor grad_out, int block_size, int type, float eps, at::Tensor grad_in) {
-  auto batch_size = es.size(0);
-  auto channels = es.size(1);
-  auto height = es.size(2);
-  auto width = es.size(3);
-
-  AT_DISPATCH_FLOATING_TYPES(es.scalar_type(), "photometric_loss_backward_cuda", ([&] {
-    if(type == PHOTOMETRIC_LOSS_MSE) {
-      iterate_cuda(
-          PhotometricLossBackward<scalar_t, PHOTOMETRIC_LOSS_MSE>(es.data<scalar_t>(), ta.data<scalar_t>(), grad_out.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, grad_in.data<scalar_t>()),
-          grad_out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_SAD) {
-      iterate_cuda(
-          PhotometricLossBackward<scalar_t, PHOTOMETRIC_LOSS_SAD>(es.data<scalar_t>(), ta.data<scalar_t>(), grad_out.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, grad_in.data<scalar_t>()),
-          grad_out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_CENSUS_MSE) {
-      iterate_cuda(
-          PhotometricLossBackward<scalar_t, PHOTOMETRIC_LOSS_CENSUS_MSE>(es.data<scalar_t>(), ta.data<scalar_t>(), grad_out.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, grad_in.data<scalar_t>()),
-          grad_out.numel());
-    }
-    else if(type == PHOTOMETRIC_LOSS_CENSUS_SAD) {
-      iterate_cuda(
-          PhotometricLossBackward<scalar_t, PHOTOMETRIC_LOSS_CENSUS_SAD>(es.data<scalar_t>(), ta.data<scalar_t>(), grad_out.data<scalar_t>(), block_size, eps, batch_size, channels, height, width, grad_in.data<scalar_t>()),
-          grad_out.numel());
-    }
-  }));
-}
diff --git a/torchext/functions.py b/torchext/functions.py
index 2885ae9..9ae1d4c 100644
--- a/torchext/functions.py
+++ b/torchext/functions.py
@@ -1,126 +1,4 @@
 import torch
-from . import ext_cpu
-from . import ext_cuda
-
-
-class NNFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, in0, in1):
-        args = (in0, in1)
-        if in0.is_cuda:
-            out = ext_cuda.nn_cuda(*args)
-        else:
-            out = ext_cpu.nn_cpu(*args)
-        return out
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        return None, None
-
-
-def nn(in0, in1):
-    return NNFunction.apply(in0, in1)
-
-
-class CrossCheckFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, in0, in1):
-        args = (in0, in1)
-        if in0.is_cuda:
-            out = ext_cuda.crosscheck_cuda(*args)
-        else:
-            out = ext_cpu.crosscheck_cpu(*args)
-        return out
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        return None, None
-
-
-def crosscheck(in0, in1):
-    return CrossCheckFunction.apply(in0, in1)
-
-
-class ProjNNFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, xyz0, xyz1, K, patch_size):
-        args = (xyz0, xyz1, K, patch_size)
-        if xyz0.is_cuda:
-            out = ext_cuda.proj_nn_cuda(*args)
-        else:
-            out = ext_cpu.proj_nn_cpu(*args)
-        return out
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        return None, None, None, None
-
-
-def proj_nn(xyz0, xyz1, K, patch_size):
-    return ProjNNFunction.apply(xyz0, xyz1, K, patch_size)
-
-
-class XCorrVolFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, in0, in1, n_disps, block_size):
-        args = (in0, in1, n_disps, block_size)
-        if in0.is_cuda:
-            out = ext_cuda.xcorrvol_cuda(*args)
-        else:
-            out = ext_cpu.xcorrvol_cpu(*args)
-        return out
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        return None, None, None, None
-
-
-def xcorrvol(in0, in1, n_disps, block_size):
-    return XCorrVolFunction.apply(in0, in1, n_disps, block_size)
-
-
-class PhotometricLossFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, es, ta, block_size, type, eps):
-        args = (es, ta, block_size, type, eps)
-        ctx.save_for_backward(es, ta)
-        ctx.block_size = block_size
-        ctx.type = type
-        ctx.eps = eps
-        if es.is_cuda:
-            out = ext_cuda.photometric_loss_forward(*args)
-        else:
-            out = ext_cpu.photometric_loss_forward(*args)
-        return out
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        es, ta = ctx.saved_tensors
-        block_size = ctx.block_size
-        type = ctx.type
-        eps = ctx.eps
-        args = (es, ta, grad_out.contiguous(), block_size, type, eps)
-        if grad_out.is_cuda:
-            grad_es = ext_cuda.photometric_loss_backward(*args)
-        else:
-            grad_es = ext_cpu.photometric_loss_backward(*args)
-        return grad_es, None, None, None, None
-
-
-def photometric_loss(es, ta, block_size, type='mse', eps=0.1):
-    type = type.lower()
-    if type == 'mse':
-        type = 0
-    elif type == 'sad':
-        type = 1
-    elif type == 'census_mse':
-        type = 2
-    elif type == 'census_sad':
-        type = 3
-    else:
-        raise Exception('invalid loss type')
-    return PhotometricLossFunction.apply(es, ta, block_size, type, eps)
-
 
 def photometric_loss_pytorch(es, ta, block_size, type='mse', eps=0.1):
     type = type.lower()