chanjo
chanjo copied to clipboard
Investigate GC content integration
This is a first very simple example:
def gc_content(self, query=None, gc_amount='high', gene_ids=None):
    """Generate a query to estimate coverage performance by GC content.

    Restricts a metrics query to a set of genes. By default this is a
    small built-in subset of genes with high/low GC content levels
    (BioMart).

    Args:
        query: Query to restrict. When ``None``, the result of
            ``self.average_metrics()`` is used.
        gc_amount: Either ``'high'`` or ``'low'``; selects which built-in
            gene list serves as the default.
        gene_ids: Identifiers to filter on instead of the built-in list.
            A falsy value (``None`` or an empty list) falls back to the
            built-in list for the chosen ``gc_amount``.

    Returns:
        The result of ``query.filter(...)``, limited to rows whose
        ``SuperblockData.parent_id`` is among the chosen identifiers.

    Raises:
        ValueError: If ``gc_amount`` is neither ``'high'`` nor ``'low'``.
    """
    # Validate first so a bad argument fails before any query is built.
    if gc_amount == 'high':
        # highest GC content supersets
        default_ids = ['UTF1', 'BHLHA9', 'C20orf201', 'LRRC26', 'HES4',
                       'BHLHE23', 'C9orf172', 'NKX6-2', 'CITED4']
    elif gc_amount == 'low':
        # lowest GC content supersets
        default_ids = ['DEFB114', 'NTS', 'ANGPTL3', 'CYLC2', 'GPR22',
                       'SI', 'CSN3', 'KLRC4', 'CSN1S1']
    else:
        raise ValueError("'gc_amount' must be either 'high' or 'low'")

    # Use the average metrics query unless one was explicitly supplied;
    # test identity rather than truthiness of the query object.
    if query is None:
        query = self.average_metrics()

    # Deliberately truthiness-based: an empty ``gene_ids`` means "use the
    # built-in genes" rather than "match nothing".
    identifiers = gene_ids or default_ids

    # build and return the query
    return query.filter(SuperblockData.parent_id.in_(identifiers))