repeatcraftp
repeatcraftp copied to clipboard
In rcstat if rowRaw.get(col[2]): IndexError: list index out of range
Hi,
I've been running repeatcraft successfully, but this error keeps flagging in some runs:
File "/repeatCraft/repeatcraft.py", line 187, in
rcStatm.rcstat(rclabelp=outputnamelabel,rmergep=outputnamemerge,outfile= statfname, ltrgroup = True)
File "/repeatCraft/helper/rcStatm.py", line 54, in rcstat
if rowRaw.get(col[2]):
IndexError: list index out of range
Is there a fix for this currently, or a reason that this is happening?
Many thanks!
I've narrowed this down to the fact that a gff-version 3
line is found at the bottom of both *.rclabel.gff and *.rmerge.gff, which is causing an error when iterating through the file
I have fixed this so far by removing the "gff-version 3" line at the bottom of rclabel.gff an rmerge.gff before the quantification is performed by editing https://github.com/niccw/repeatcraftp/blob/master/helper/rcStatm.py :
import sys
import re
def rcstat(rclabelp,rmergep,outfile,ltrgroup=True):
rlabel = rclabelp
rmerge = rmergep
# print track
stdout = sys.stdout
sys.stdout = open(outfile, 'w')
# Read rlabel
# Stat variables
telabel = 0
ltrlabel = 0
teltrlabel = 0
teD = {}
ltrD ={}
teltrD = {}
rowRaw = {}
rowMerge = {}
# flag
teflag = False
ltrflag = False
# Check number of row of header
cnt = 0
with open(rlabel, "r") as f:
for line in f:
cnt += 1
if not line.startswith("#"):
cnt -= 1
break
# delete shitty file headers
with open(rlabel, "r") as f:
lines = f.readlines()
with open(rlabel, "w") as f:
for line in lines:
if line.strip("\n") != "##gff-version 3":
f.write(line)
with open(rmerge, "r") as f:
lines = f.readlines()
with open(rmerge, "w") as f:
for line in lines:
if line.strip("\n") != "##gff-version 3":
f.write(line)
with open(rlabel,"r") as f:
for i in range(cnt):
next(f)
for line in f:
col = line.rstrip().split("\t")
if rowRaw.get(col[2]):
rowRaw[col[2]] += 1
else:
rowRaw[col[2]] = 1
if re.search("TEgroup=",col[8]):
teflag = True
telabel += 1
if teD.get(col[2]):
teD[col[2]] += 1
else:
teD[col[2]] = 1
if re.search("LTRgroup=",col[8]):
ltrflag = True
ltrlabel += 1
if ltrD.get(col[2]):
ltrD[col[2]] += 1
else:
ltrD[col[2]] = 1
if teflag and ltrflag:
teltrlabel += 1
if teltrD.get(col[2]):
teltrD[col[2]] += 1
else:
teltrD[col[2]] = 1
teflag = False
ltrflag = False
# Read rmerge
# Check number of row of header
cnt = 0
with open(rmerge, "r") as f:
for line in f:
cnt += 1
if not line.startswith("#"):
cnt -= 1
break
with open(rmerge,"r") as f:
for i in range(cnt):
next(f)
for line in f:
col = line.rstrip().split("\t")
if rowMerge.get(col[2]):
rowMerge[col[2]] += 1
else:
rowMerge[col[2]] = 1
print("#1. Number of repeats (by class) before and after merge")
print("=============================================================")
print(*["repeat class","no. before merge","no. after merge"], sep="\t")
for c in list(rowRaw.keys()):
print(*[c,rowRaw[c],rowMerge[c]],sep="\t")
print("\n")
print("#2. Number of repeats (by class) merged by TEgroup and LTRgroup")
print("=============================================================")
for c in list(teD.keys()):
if ltrgroup:
if re.search("LTR",c):
try:
print(*[c,teD[c],ltrD[c]],sep="\t")
except:
print(*[c,teD[c],""],sep="\t")
else:
print(*[c,teD[c],""],sep="\t")
else:
print(*[c, teD[c], ""], sep="\t")
sys.stdout.close()
sys.stdout = stdout
I'm having the same issue and this doesn't seem to be working, is there another way to generate the summary file?
I'm having the same issue and this doesn't seem to be working, is there another way to generate the summary file?
Try this as your repeatcraftp/helper/rcStatm.py, I made a little more modifications to avoid read any lines startswith "#", it works for me.
import sys
import re
def rcstat(rclabelp,rmergep,outfile,ltrgroup=True):
rlabel = rclabelp
rmerge = rmergep
# print track
stdout = sys.stdout
sys.stdout = open(outfile, 'w')
# Read rlabel
# Stat variables
telabel = 0
ltrlabel = 0
teltrlabel = 0
teD = {}
ltrD ={}
teltrD = {}
rowRaw = {}
rowMerge = {}
# flag
teflag = False
ltrflag = False
# Check number of row of header
cnt = 0
with open(rlabel, "r") as f:
for line in f:
cnt += 1
if not line.startswith("#"):
cnt -= 1
break
# delete shitty file headers
with open(rlabel, "r") as f:
lines = f.readlines()
with open(rlabel, "w") as f:
for line in lines:
if line[0] != "#":
f.write(line)
with open(rmerge, "r") as f:
lines = f.readlines()
with open(rmerge, "w") as f:
for line in lines:
if line[0] != "#":
f.write(line)
with open(rlabel,"r") as f:
for i in range(cnt):
next(f)
for line in f:
col = line.rstrip().split("\t")
if rowRaw.get(col[2]):
rowRaw[col[2]] += 1
else:
rowRaw[col[2]] = 1
if re.search("TEgroup=",col[8]):
teflag = True
telabel += 1
if teD.get(col[2]):
teD[col[2]] += 1
else:
teD[col[2]] = 1
if re.search("LTRgroup=",col[8]):
ltrflag = True
ltrlabel += 1
if ltrD.get(col[2]):
ltrD[col[2]] += 1
else:
ltrD[col[2]] = 1
if teflag and ltrflag:
teltrlabel += 1
if teltrD.get(col[2]):
teltrD[col[2]] += 1
else:
teltrD[col[2]] = 1
teflag = False
ltrflag = False
# Read rmerge
# Check number of row of header
cnt = 0
with open(rmerge, "r") as f:
for line in f:
cnt += 1
if not line.startswith("#"):
cnt -= 1
break
with open(rmerge,"r") as f:
for i in range(cnt):
next(f)
for line in f:
col = line.rstrip().split("\t")
if rowMerge.get(col[2]):
rowMerge[col[2]] += 1
else:
rowMerge[col[2]] = 1
print("#1. Number of repeats (by class) before and after merge")
print("=============================================================")
print(*["repeat class","no. before merge","no. after merge"], sep="\t")
for c in list(rowRaw.keys()):
print(*[c,rowRaw[c],rowMerge[c]],sep="\t")
print("\n")
print("#2. Number of repeats (by class) merged by TEgroup and LTRgroup")
print("=============================================================")
for c in list(teD.keys()):
if ltrgroup:
if re.search("LTR",c):
try:
print(*[c,teD[c],ltrD[c]],sep="\t")
except:
print(*[c,teD[c],""],sep="\t")
else:
print(*[c,teD[c],""],sep="\t")
else:
print(*[c, teD[c], ""], sep="\t")
sys.stdout.close()
sys.stdout = stdout
Hi, is there any reason the proposed solutions have not been implemented / merged into the codebase?