sourmash
sourmash copied to clipboard
pairwise jaccard - distance - GPU
Dear sourmash-bio team, this is a feature request. Unlike nt-genomes I am comparing protein based minhashes. Here, every single protein of one species and its corresponding minhash is compared to all proteins from a second species. The jaccard distance will than be used to do network-based clustering. Anyhow, the CPU based comparison is limited in parallelsation, so my feature request is related to transfer the pairwise jaccard distance calculation via CUDA on a GPU, to speed up species comparison. Could you provide this feature in sourmash?
Thank you in anticipation
Best regards
Kristian
Simple CPU serial code would like like this:
import screed
from sourmash import MinHash
species_a = 'species_a.fasta'
species_b = 'species_b.fasta'
K = [6, 7, 8, 9, 10, 11]
scale = [1, 10, 20, 100, 200]
def get_mh_array(fasta, K, scale):
k_scale_dict = {}
k_scale_dict['names'] = []
for k in K:
k_scale_dict[k] = {}
for s in scale:
k_scale_dict[k][s] = []
for record in screed.open(fasta):
k_scale_dict['names'] += [record.name.strip().split(' ')[0]]
for k in K:
for s in scale:
mh = MinHash(n=0, ksize=k, is_protein=True, scaled=s)
mh.add_protein(record.sequence)
k_scale_dict[k][s] += [mh]
return k_scale_dict
def pairwise_jaccard_from_array(a_names, a_mh_array, b_names, b_mh_array, outfile, min_jaccard=0.01, k='', s=''):
with open(outfile, 'w') as outhandle:
for i, i_mh in enumerate(a_mh_array):
for j, j_mh in enumerate(b_mh_array):
iname = a_names[i]
jname = b_names[j]
ij_jaccard = i_mh.jaccard(j_mh)
if ij_jaccard > min_jaccard:
outhandle.write(str(k)+'\t'+str(s)+'\t'+iname+'\t'+jname+'\t'+str(ij_jaccard)+'\n')
def pairwise_output(K, scale, a, b, ab_out):
for k in K:
for s in scale:
pairwise_jaccard_from_array(a['names'],
a[k][s],
b['names'],
b[k][s],
ab_out+str(k)+'-'+str(s)+'.out',
min_jaccard=0.01,
k=str(k),
s=str(s))
species_a_mh_array = get_mh_array(species_a, K, scale)
species_b_mh_array = get_mh_array(species_b, K, scale)
pairwise_output(K, scale, species_a_mh_array, species_b_mh_array, 'a-vs-b')