Faiss causes errors in Pytorch with 3090Ti
Summary
I tried to use faiss inside a PyTorch model. It runs correctly on a GTX1080Ti with CUDA 10. However, when running on a machine with an RTX3090Ti GPU, it behaves abnormally: after performing a knn search with faiss, the masked_select operation in PyTorch returns wrong results. There is no problem when the two parts are used separately.
Platform
OS: Ubuntu 20.04.2 LTS GPU: Nvidia RTX3090Ti CUDA: V11.1.105 Pytorch version: 1.9.0
Faiss version: 1.7.0 Installed from: conda install faiss-gpu -c conda-forge
Running on:
- [ ] CPU
- [x] GPU
Interface:
- [ ] C++
- [x] Python
Reproduction instructions
The code for reproducing the bug:
import faiss
import torch
import faiss.contrib.torch_utils
class faiss_KNN:
    """Brute-force inner-product k-nearest-neighbour search on a Faiss GPU index.

    The index is rebuilt from scratch on every search, so one instance can be
    reused for any number of independent (query, database) pairs.

    NOTE(review): tensors are handed to Faiss as-is, which presumably relies on
    ``faiss.contrib.torch_utils`` having been imported so that ``add`` and
    ``search`` accept torch tensors -- confirm against the Faiss version in use.
    """

    def __init__(self, d):
        """Create a flat GPU index using the inner-product metric.

        Args:
            d: Dimensionality of the vectors that will be indexed.
        """
        res = faiss.StandardGpuResources()
        self.index = faiss.GpuIndexFlat(res, d, faiss.METRIC_INNER_PRODUCT)

    def search_raw_array_pytorch(self, qa, va, k):
        """Replace the index content with ``va`` and search it with ``qa``.

        Args:
            qa: Query vectors, one per row.
            va: Database vectors, one per row.
            k: Number of neighbours to return per query.

        Returns:
            ``(distances, indices)`` exactly as returned by the index search
            (one row per query, ``k`` columns).
        """
        # Drop whatever a previous call stored; only ``va`` must be searchable.
        self.index.reset()
        self.index.add(va)
        distances, indices = self.index.search(qa, k)
        return distances, indices

    def knn(self, fmap1, fmap2, k):
        """Find, for every position of ``fmap2``, its ``k`` best matches in ``fmap1``.

        Args:
            fmap1: Tensor of shape ``[B, C, N1]``; its positions form the database.
            fmap2: Tensor viewable as ``[B, C, N2]``; its positions are the queries.
            k: Number of neighbours per query position.

        Returns:
            ``(dist, indx)``, both of shape ``[B, k, N2]``. ``indx`` holds
            positions into the flattened last axis of ``fmap1``.
        """
        b, ch, _ = fmap1.shape
        # Faiss expects one vector per row: [B, C, N] -> [B, N, C].
        fmap1 = fmap1.view(b, ch, -1).permute(0, 2, 1).contiguous()
        fmap2 = fmap2.view(b, ch, -1).permute(0, 2, 1).contiguous()

        # The index holds a single database at a time, so batch elements are
        # searched one after another. A batch of one takes the same path.
        dist = []
        indx = []
        for i in range(b):
            dist_i, indx_i = self.search_raw_array_pytorch(fmap2[i], fmap1[i], k)
            # [N2, k] -> [1, k, N2]
            dist.append(dist_i.t().detach().unsqueeze(0).contiguous())
            indx.append(indx_i.t().detach().unsqueeze(0).contiguous())
        return torch.cat(dist, dim=0), torch.cat(indx, dim=0)
# Module-level finder used by compute_sparse_corr; 256 is the feature
# dimension passed to faiss_KNN (the channel count of the feature maps below).
knn_finder = faiss_KNN(256)
def coords_grid_y_first(batch, ht, wd):
    """Build a pixel-coordinate grid with the y channel before the x channel.

    Args:
        batch: Number of batch entries the grid is expanded to.
        ht: Grid height (number of rows).
        wd: Grid width (number of columns).

    Returns:
        An int32 tensor of shape ``[batch, 2, ht, wd]`` where channel 0 holds
        the row index (y) and channel 1 the column index (x) of each cell.
        The batch axis is an ``expand`` view, so all entries share storage.
    """
    # Explicit broadcasting instead of torch.meshgrid: yields the same
    # row-major ("ij") grid without depending on meshgrid's default indexing.
    ys = torch.arange(ht).view(ht, 1).expand(ht, wd)
    xs = torch.arange(wd).view(1, wd).expand(ht, wd)
    coords = torch.stack((ys, xs), dim=0).int()
    return coords[None].expand(batch, -1, -1, -1)
def compute_sparse_corr(fmap1, fmap2, k=32):
    """Compute the k best-matching positions for each pixel as coordinate offsets.

    Uses the module-level ``knn_finder`` for the nearest-neighbour search.

    NOTE(review): the shape comments below assume both feature maps have the
    same spatial size (H1*W1 == H2*W2), as in the reproduction -- confirm
    before using with differently sized inputs.

    Args:
        fmap1: Feature map of shape ``[B, C, H1, W1]``.
        fmap2: Feature map of shape ``[B, C, H2, W2]``.
        k: Number of nearest neighbours kept per pixel.

    Returns:
        ``(coords0, coords1, batch_index)``:
            coords0: ``[B, 2, k, H*W]`` grid coordinates (y first), repeated
                along the ``k`` axis.
            coords1: ``[B, 2, k, H*W]`` coordinates of the matched positions
                minus ``coords0``, i.e. the displacement to each match.
            batch_index: ``[B, 1, k, H*W]`` batch id of every entry, with the
                same dtype and device as ``coords1``.
    """
    B, C, H1, W1 = fmap1.shape
    H2, W2 = fmap2.shape[2:]
    N = H1 * W1

    fmap1, fmap2 = fmap1.view(B, C, -1), fmap2.view(B, C, -1)

    with torch.no_grad():
        _, indices = knn_finder.knn(fmap1, fmap2, k)  # [B, k, H*W]
        # Wait for all queued GPU work before the indices are consumed.
        torch.cuda.synchronize()
        # Faiss-free stand-in for `indices` (random positions), kept for debugging:
        # indices = torch.round(torch.rand((B,k,N))*H1*W1).clamp(0, H1*W1-1).long().to(fmap1.device)

        # Use the same neighbour index for the y and the x channel.
        indices_coord = indices.unsqueeze(1).expand(-1, 2, -1, -1)  # [B, 2, k, H*W]
        coords0 = (
            coords_grid_y_first(B, H2, W2)
            .view(B, 2, 1, -1)
            .expand(-1, -1, k, -1)
            .to(fmap1.device)
        )  # [B, 2, k, H*W]
        # Look up the grid coordinates of every matched position ...
        coords1 = coords0.gather(3, indices_coord)  # [B, 2, k, H*W]
        # ... and turn them into offsets relative to the source pixel.
        coords1 = coords1 - coords0

        batch_index = (
            torch.arange(B).view(B, 1, 1, 1).expand(-1, -1, k, N).type_as(coords1)
        )

    return coords0, coords1, batch_index  # coords: [B, 2, k, H*W]
# Two random feature maps moved to the GPU: batch 4, 256 channels, 28x28 grid.
fmap1 = torch.rand((4, 256, 28, 28)).cuda().float()
fmap2 = torch.rand((4, 256, 28, 28)).cuda().float()
coords0, coords1, batch_index = compute_sparse_corr(fmap1, fmap2)
# Keep only matches whose offset is within search_range on both coordinate channels.
search_range = 10
mask = (coords1[:, 0].abs() <= search_range) & (coords1[:, 1].abs() <= search_range)
# Take channel 0 of each tensor so both have the same shape as mask.
batch_index = batch_index[:,0]
coords0 = coords0[:,0]
print(batch_index.size(), coords0.size())
# Both tensors are indexed with the same boolean mask, so the two results
# are expected to have the same number of elements.
batch_index = batch_index[mask]
coords0 = coords0[mask]
print(batch_index.size(), coords0.size())
The batch_index and coords0 tensors are indexed with the same mask, yet the shapes of the results differ. For example, the output of one run is:
torch.Size([4, 32, 784]) torch.Size([4, 32, 784])
torch.Size([0]) torch.Size([43])
Would it be possible to simplify the test case? That would make it easier to investigate.
import faiss
import torch
import faiss.contrib.torch_utils
class faiss_KNN:
    """Minimal inner-product k-nearest-neighbour search on a Faiss GPU flat index.

    NOTE(review): tensors are handed to Faiss as-is, which presumably relies on
    ``faiss.contrib.torch_utils`` having been imported so that ``add`` and
    ``search`` accept torch tensors -- confirm against the Faiss version in use.
    """

    def __init__(self, d):
        """Create a flat GPU index of dimension ``d`` using the inner-product metric."""
        res = faiss.StandardGpuResources()
        self.index = faiss.GpuIndexFlat(res, d, faiss.METRIC_INNER_PRODUCT)

    def knn(self, f1, f2, k):
        """Search the columns of ``f2`` for the ``k`` best matches of each column of ``f1``.

        Args:
            f1: Query features of shape ``[d, N1]`` (one vector per column).
            f2: Database features of shape ``[d, N2]`` (one vector per column).
            k: Number of neighbours per query.

        Returns:
            ``(distances, indices)`` as returned by the index search, one row
            per query and ``k`` columns.
        """
        # Faiss expects one vector per row, stored contiguously.
        f1 = f1.t().contiguous()
        f2 = f2.t().contiguous()
        # Rebuild the index so only ``f2`` is searchable.
        self.index.reset()
        self.index.add(f2)
        distances, indices = self.index.search(f1, k)
        return distances, indices
# Finder for 256-dimensional vectors, matching the first axis of f1/f2 below.
knn_finder = faiss_KNN(256)
# Fixed seed so the run is repeatable.
torch.manual_seed(123)
# 100 random 256-dimensional vectors each, stored one per column, on the GPU.
f1 = torch.rand((256, 100)).cuda().float()
f2 = torch.rand((256, 100)).cuda().float()
_, index = knn_finder.knn(f1, f2, 1)
# k == 1, so keep the single neighbour column.
index = index[:,0]
original_index = torch.arange(100).cuda()
# Select the queries whose nearest-neighbour index is within 10 of their own position.
diff = (index-original_index).abs()
mask = (diff <= 10)
print(mask.size())
print(original_index.size())
# NOTE: with the following line commented out, the masked indexing below raises
# the RuntimeError shown in the report; enabling it makes the error disappear.
#print(mask.sum())
original_index = original_index[mask]
The error info of the above code:
Traceback (most recent call last):
File "reshow_simple.py", line 32, in <module>
original_index = original_index[mask]
RuntimeError: invalid shape dimension -50
However, the error disappears after adding a seemingly meaningless mask.sum() call before the masked_select operation. Why is there such unpredictable behavior?
I used Anaconda and installed the packages below, and it worked. If you use Docker, please install Miniconda.
conda install faiss-gpu cudatoolkit=11.1 -c pytorch-gpu
conda install -c anaconda pytorch-gpu
This issue is stale because it has been open for 7 days with no activity.