adam
                                
                                 adam copied to clipboard
                                
                                    adam copied to clipboard
                            
                            
                            
                        Bump htsjdk dependency version to 2.20.3
Expected to fail CI.
ADAM version 0.28.0 with htsjdk 2.19.0
$ adam-shell
scala> import org.bdgenomics.adam.util.ADAMShell._
import org.bdgenomics.adam.util.ADAMShell._
scala> import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMContext._
scala> val reads = sc.loadAlignments("adam-core/src/test/resources/small.sam")
reads: org.bdgenomics.adam.rdd.read.AlignmentRecordDataset = RDDBoundAlignmentRecordDataset with 2 reference sequences, 0 read groups, and 2 processing steps
scala> printAlignmentAttributes(reads, Seq(), 200)
Alignment Attributes
+----------------+-----------+-----------+---------------------------+--------+------------+
| Reference Name |   Start   |    End    |         Read Name         | Sample | Read Group |
+----------------+-----------+-----------+---------------------------+--------+------------+
|              1 |  26472783 |  26472858 |  simread:1:26472783:false |        |            |
|              1 | 240997787 | 240997862 |  simread:1:240997787:true |        |            |
|              1 | 189606653 | 189606728 |  simread:1:189606653:true |        |            |
|              1 | 207027738 | 207027813 |  simread:1:207027738:true |        |            |
|              1 |  14397233 |  14397308 |  simread:1:14397233:false |        |            |
|              1 | 240344442 | 240344517 |  simread:1:240344442:true |        |            |
|              1 | 153978724 | 153978799 | simread:1:153978724:false |        |            |
|              1 | 237728409 | 237728484 |  simread:1:237728409:true |        |            |
|              1 | 231911906 | 231911981 | simread:1:231911906:false |        |            |
|              1 |  50683371 |  50683446 |  simread:1:50683371:false |        |            |
|              1 |  37577445 |  37577520 |  simread:1:37577445:false |        |            |
|              1 | 195211965 | 195212040 | simread:1:195211965:false |        |            |
|              1 | 163841413 | 163841488 | simread:1:163841413:false |        |            |
|              1 | 101556378 | 101556453 | simread:1:101556378:false |        |            |
|              1 |  20101800 |  20101875 |   simread:1:20101800:true |        |            |
|              1 | 186794283 | 186794358 |  simread:1:186794283:true |        |            |
|              1 | 165341382 | 165341457 |  simread:1:165341382:true |        |            |
|              1 |   5469106 |   5469181 |    simread:1:5469106:true |        |            |
|              1 |  89554252 |  89554327 |  simread:1:89554252:false |        |            |
|              1 | 169801933 | 169802008 |  simread:1:169801933:true |        |            |
+----------------+-----------+-----------+---------------------------+--------+------------+
This pull request with htsjdk 2.20.0
$ adam-shell
scala> import org.bdgenomics.adam.util.ADAMShell._
import org.bdgenomics.adam.util.ADAMShell._
scala> import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMContext._
scala> val reads = sc.loadAlignments("adam-core/src/test/resources/small.sam")
reads: org.bdgenomics.adam.rdd.read.AlignmentRecordDataset = RDDBoundAlignmentRecordDataset with 2 reference sequences, 0 read groups, and 2 processing steps
scala> printAlignmentAttributes(reads, Seq(), 200)
Alignment Attributes
+----------------+-----------+-----------+---------------------------+--------+------------+
| Reference Name |   Start   |    End    |         Read Name         | Sample | Read Group |
+----------------+-----------+-----------+---------------------------+--------+------------+
|              1 |  26472783 |  26472858 |             6472783:false |        |            |
|              1 | 240997787 | 240997862 |  simread:1:240997787:true |        |            |
|              1 | 189606653 | 189606728 |  simread:1:189606653:true |        |            |
|              1 | 207027738 | 207027813 |  simread:1:207027738:true |        |            |
|              1 |  14397233 |  14397308 |  simread:1:14397233:false |        |            |
|              1 | 240344442 | 240344517 |  simread:1:240344442:true |        |            |
|              1 | 153978724 | 153978799 | simread:1:153978724:false |        |            |
|              1 | 237728409 | 237728484 |  simread:1:237728409:true |        |            |
|              1 | 231911906 | 231911981 | simread:1:231911906:false |        |            |
|              1 |  50683371 |  50683446 |  simread:1:50683371:false |        |            |
|              1 |  37577445 |  37577520 |  simread:1:37577445:false |        |            |
|              1 | 195211965 | 195212040 | simread:1:195211965:false |        |            |
|              1 | 163841413 | 163841488 | simread:1:163841413:false |        |            |
|              1 | 101556378 | 101556453 | simread:1:101556378:false |        |            |
|              1 |  20101800 |  20101875 |   simread:1:20101800:true |        |            |
|              1 | 186794283 | 186794358 |  simread:1:186794283:true |        |            |
|              1 | 165341382 | 165341457 |  simread:1:165341382:true |        |            |
|              1 |   5469106 |   5469181 |    simread:1:5469106:true |        |            |
|              1 |  89554252 |  89554327 |  simread:1:89554252:false |        |            |
|              1 | 169801933 | 169802008 |  simread:1:169801933:true |        |            |
+----------------+-----------+-----------+---------------------------+--------+------------+
For some reason, the name of the first read has been clipped from simread:1:26472783:false to 6472783:false.
Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/ADAM-prb/3039/
Build result: FAILURE
[...truncated 3 lines...]Building remotely on amp-jenkins-worker-05 (centos spark-test) in workspace /home/jenkins/workspace/ADAM-prbWiping out workspace first.Cloning the remote Git repositoryCloning repository https://github.com/bigdatagenomics/adam.git > git init /home/jenkins/workspace/ADAM-prb # timeout=10Fetching upstream changes from https://github.com/bigdatagenomics/adam.git > git --version # timeout=10 > git fetch --tags --progress https://github.com/bigdatagenomics/adam.git +refs/heads/:refs/remotes/origin/ # timeout=15 > git config remote.origin.url https://github.com/bigdatagenomics/adam.git # timeout=10 > git config --add remote.origin.fetch +refs/heads/:refs/remotes/origin/ # timeout=10 > git config remote.origin.url https://github.com/bigdatagenomics/adam.git # timeout=10Fetching upstream changes from https://github.com/bigdatagenomics/adam.git > git fetch --tags --progress https://github.com/bigdatagenomics/adam.git +refs/pull/:refs/remotes/origin/pr/ # timeout=15 > git rev-parse origin/pr/2195/merge^{commit} # timeout=10 > git branch -a -v --no-abbrev --contains f20a19859fbfbd38e2ff8549054f83a999f5a401 # timeout=10Checking out Revision f20a19859fbfbd38e2ff8549054f83a999f5a401 (origin/pr/2195/merge) > git config core.sparsecheckout # timeout=10 > git checkout -f f20a19859fbfbd38e2ff8549054f83a999f5a401First time build. Skipping changelog.Triggering ADAM-prb ? 2.7.5,2.12,2.4.3,ubuntuTriggering ADAM-prb ? 2.7.5,2.11,2.4.3,ubuntuADAM-prb ? 2.7.5,2.12,2.4.3,ubuntu completed with result FAILUREADAM-prb ? 2.7.5,2.11,2.4.3,ubuntu completed with result FAILURENotifying endpoint 'HTTP:https://webhooks.gitter.im/e/ac8bb6e9f53357bc8aa8' Test FAILed.
Same with htsjdk version 2.20.1.
Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/ADAM-prb/3042/
Build result: FAILURE
[...truncated 3 lines...]Building remotely on amp-jenkins-worker-05 (centos spark-test) in workspace /home/jenkins/workspace/ADAM-prbWiping out workspace first.Cloning the remote Git repositoryCloning repository https://github.com/bigdatagenomics/adam.git > git init /home/jenkins/workspace/ADAM-prb # timeout=10Fetching upstream changes from https://github.com/bigdatagenomics/adam.git > git --version # timeout=10 > git fetch --tags --progress https://github.com/bigdatagenomics/adam.git +refs/heads/:refs/remotes/origin/ # timeout=15 > git config remote.origin.url https://github.com/bigdatagenomics/adam.git # timeout=10 > git config --add remote.origin.fetch +refs/heads/:refs/remotes/origin/ # timeout=10 > git config remote.origin.url https://github.com/bigdatagenomics/adam.git # timeout=10Fetching upstream changes from https://github.com/bigdatagenomics/adam.git > git fetch --tags --progress https://github.com/bigdatagenomics/adam.git +refs/pull/:refs/remotes/origin/pr/ # timeout=15 > git rev-parse origin/pr/2195/merge^{commit} # timeout=10 > git branch -a -v --no-abbrev --contains 70fa095272d8c033cfbc4f9d55466c46baa60179 # timeout=10Checking out Revision 70fa095272d8c033cfbc4f9d55466c46baa60179 (origin/pr/2195/merge) > git config core.sparsecheckout # timeout=10 > git checkout -f 70fa095272d8c033cfbc4f9d55466c46baa60179First time build. Skipping changelog.Triggering ADAM-prb ? 2.7.5,2.12,2.4.3,ubuntuTriggering ADAM-prb ? 2.7.5,2.11,2.4.3,ubuntuADAM-prb ? 2.7.5,2.12,2.4.3,ubuntu completed with result FAILUREADAM-prb ? 2.7.5,2.11,2.4.3,ubuntu completed with result FAILURENotifying endpoint 'HTTP:https://webhooks.gitter.im/e/ac8bb6e9f53357bc8aa8' Test FAILed.
Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/ADAM-prb/3045/
Build result: FAILURE
[...truncated 3 lines...]Building remotely on amp-jenkins-worker-05 (centos spark-test) in workspace /home/jenkins/workspace/ADAM-prbWiping out workspace first.Cloning the remote Git repositoryCloning repository https://github.com/bigdatagenomics/adam.git > git init /home/jenkins/workspace/ADAM-prb # timeout=10Fetching upstream changes from https://github.com/bigdatagenomics/adam.git > git --version # timeout=10 > git fetch --tags --progress https://github.com/bigdatagenomics/adam.git +refs/heads/:refs/remotes/origin/ # timeout=15 > git config remote.origin.url https://github.com/bigdatagenomics/adam.git # timeout=10 > git config --add remote.origin.fetch +refs/heads/:refs/remotes/origin/ # timeout=10 > git config remote.origin.url https://github.com/bigdatagenomics/adam.git # timeout=10Fetching upstream changes from https://github.com/bigdatagenomics/adam.git > git fetch --tags --progress https://github.com/bigdatagenomics/adam.git +refs/pull/:refs/remotes/origin/pr/ # timeout=15 > git rev-parse origin/pr/2195/merge^{commit} # timeout=10 > git branch -a -v --no-abbrev --contains f52fc1e4de476c8d9a8ff314b8338f2045f5107c # timeout=10Checking out Revision f52fc1e4de476c8d9a8ff314b8338f2045f5107c (origin/pr/2195/merge) > git config core.sparsecheckout # timeout=10 > git checkout -f f52fc1e4de476c8d9a8ff314b8338f2045f5107cFirst time build. Skipping changelog.Triggering ADAM-prb ? 2.7.5,2.12,2.4.3,ubuntuTriggering ADAM-prb ? 2.7.5,2.11,2.4.3,ubuntuADAM-prb ? 2.7.5,2.12,2.4.3,ubuntu completed with result FAILUREADAM-prb ? 2.7.5,2.11,2.4.3,ubuntu completed with result FAILURENotifying endpoint 'HTTP:https://webhooks.gitter.im/e/ac8bb6e9f53357bc8aa8' Test FAILed.
Another test failure; it appears htsjdk 2.20.3 cannot read this test file correctly:
@SQ     SN:1    LN:249250621                                                                                                                                                                                                                                         
read:0  0       1       1       60      75M     *       0       0       GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA     *       NM:i:0  AS:i:75 XS:i:0                                                                               
read:4  4       *       0       0       *       *       0       0       GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA     *       NM:i:0  AS:i:75 XS:i:0 
...
htsjdk.samtools.SAMFormatException: Error parsing text SAM file. RNAME '75M' not found in any SQ record; Line 3
Line: 1	60	75M	*	0	0	GTATAAGAGCAGCCTTATTCCTATTTATAATCAGGGTGAAACACCTGTGCCAATGCCAAGACAGGGGTGCCAAGA	*	NM:i:0	AS:i:75	XS:i:0
	at htsjdk.samtools.SAMLineParser.reportErrorParsingLine(SAMLineParser.java:457)
	at htsjdk.samtools.SAMLineParser.validateReferenceName(SAMLineParser.java:199)
	at htsjdk.samtools.SAMLineParser.parseLine(SAMLineParser.java:255)
	at htsjdk.samtools.SAMTextReader$RecordIterator.parseLine(SAMTextReader.java:268)
	at htsjdk.samtools.SAMTextReader$RecordIterator.next(SAMTextReader.java:255)
	at htsjdk.samtools.SAMTextReader$RecordIterator.next(SAMTextReader.java:228)
	at htsjdk.samtools.SamReader$AssertingIterator.next(SamReader.java:574)
	at htsjdk.samtools.SamReader$AssertingIterator.next(SamReader.java:553)
	at org.seqdoop.hadoop_bam.SAMRecordReader.nextKeyValue(SAMRecordReader.java:175)
	at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:230)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/ADAM-prb/3056/
Build result: FAILURE
[...truncated 3 lines...]Building remotely on amp-jenkins-worker-05 (centos spark-test) in workspace /home/jenkins/workspace/ADAM-prbWiping out workspace first.Cloning the remote Git repositoryCloning repository https://github.com/bigdatagenomics/adam.git > git init /home/jenkins/workspace/ADAM-prb # timeout=10Fetching upstream changes from https://github.com/bigdatagenomics/adam.git > git --version # timeout=10 > git fetch --tags --progress https://github.com/bigdatagenomics/adam.git +refs/heads/:refs/remotes/origin/ # timeout=15 > git config remote.origin.url https://github.com/bigdatagenomics/adam.git # timeout=10 > git config --add remote.origin.fetch +refs/heads/:refs/remotes/origin/ # timeout=10 > git config remote.origin.url https://github.com/bigdatagenomics/adam.git # timeout=10Fetching upstream changes from https://github.com/bigdatagenomics/adam.git > git fetch --tags --progress https://github.com/bigdatagenomics/adam.git +refs/pull/:refs/remotes/origin/pr/ # timeout=15 > git rev-parse origin/pr/2195/merge^{commit} # timeout=10 > git branch -a -v --no-abbrev --contains 9472dd08b735e42dd7e03ebd391021fcdb7bbbda # timeout=10Checking out Revision 9472dd08b735e42dd7e03ebd391021fcdb7bbbda (origin/pr/2195/merge) > git config core.sparsecheckout # timeout=10 > git checkout -f 9472dd08b735e42dd7e03ebd391021fcdb7bbbdaFirst time build. Skipping changelog.Triggering ADAM-prb ? 2.7.5,2.11,2.4.4,ubuntuTriggering ADAM-prb ? 2.7.5,2.12,2.4.4,ubuntuADAM-prb ? 2.7.5,2.11,2.4.4,ubuntu completed with result FAILUREADAM-prb ? 2.7.5,2.12,2.4.4,ubuntu completed with result FAILURENotifying endpoint 'HTTP:https://webhooks.gitter.im/e/ac8bb6e9f53357bc8aa8' Test FAILed.
I am afraid the only resolution to this would be #2111, which involves a lot of code changes and possible performance implications. Hadoop-BAM is effectively no longer maintained.