Adam 학습 13의 Fasta/Fastq/SAM/BAM 파일 형식 데이터 읽기
package org.bdgenomics.adamLocal.algorithms.test
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.bdgenomics.adam.rdd.ADAMContext
import htsjdk.samtools.ValidationStringency
//import scala.collection.parallel.Foreach
object hs38DHL1Test1 {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("readFileFromFaFq").setMaster("local")
val sc = new SparkContext(conf)
val ac = new ADAMContext(sc)
val fileFa = "file/adam/hs38DH/hs38DHL1/hs38DHL1.fa"
// val fileFq = "file/adam/hs38DH/hs38DHSE1L100F1.fq"
val fileFq = "file/adam/hs38DH/hs38DHL1/hs38DHL1F1Len10.fq"
val fileSam = "file/adam/hs38DH/hs38DHL1/hs38DHL1F1Len10.sam"
val fq0 = sc.textFile(fileFa)
println("fq0.count:" + fq0.count);
//load Fasta
println(fileFa + ":");
val fa1 = ac.loadFasta(fileFa, 10000)
var fa1Segquence = fa1.map(_.getFragmentSequence()).collect
// fq1Segquence.foreach(println)
println("fa1Segquence.length:" + fa1Segquence.length);
for (i <- 0 until fa1Segquence.length) {
println(fa1Segquence(i));
}
println();
fa1.foreach(println)
//load Fastq
println();
println(fileFq + ":");
val fq1 = ac.loadFastq(fileFq, None, None, ValidationStringency.STRICT)
var fq1Segquence = fq1.map(_.getSequence()).collect
// fq1Segquence.foreach(println)
println("fq1Segquence.length:" + fq1Segquence.length);
for (i <- 0 until fq1Segquence.length) {
println(fq1Segquence(i));
}
println();
fq1.foreach(println)
println("SAM:");
val sam1=ac.loadAlignments(fileSam)
sam1.foreach(println)
sc.stop
}
}
1.Fasta:
ac.loadFasta(fileFa, 10000)
소스 파일:
>chrUn_KN707606v1_decoy AC:KN707606.1 gi:734691250 LN:2200 rl:unplaced M5:20c768ac79ca38077e5012ee0e5f8333 AS:hs38d1
ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtg
ttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgag
ccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTA
CCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGC
ACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGC
CCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGG
GGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggt
tatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaa
caattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccagg
agttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaa
aaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAA
GGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttct
tctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGT
GCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTG
CTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAA
CCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCT
ACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAA
ACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaat
agtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagat
tacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtc
tcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacC
AACTTCTCTTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGG
CTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcc
tttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCT
TCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTAT
CCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtct
cgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctc
ccgcctcagcctcccgagtagctgggactacaggcactcg
읽기 결과:
{"contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "description": "AC:KN707606.1 gi:734691250 LN:2200 rl:unplaced M5:20c768ac79ca38077e5012ee0e5f8333 AS:hs38d1", "fragmentSequence": "ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtgttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgagccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTACCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGCACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGCCCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGGGGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggttatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaacaattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccaggagttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaaaaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAAGGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttcttctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGTGCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTGCTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAACCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCTACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAAACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaatagtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagattacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtctcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacCAACTTCTCTTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGGCTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcctttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCTTCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTATCCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtctcgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctcccgcctcagcctcccgagtagctgggactacaggcactcg", "fragmentNumber": 0, "fragmentStartPosition": 0, "fragmentLength": 2200, "numberOfFragmentsInContig": 1}
2.Fastq:
val fq1 = ac.loadFastq(fileFq, None, None, ValidationStringency.STRICT)
원본 데이터:
@chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0/1
CTCCTCGCCA
+
2222222222
읽기 결과:
{"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
3.SAM:
println("SAM:");
val sam1=ac.loadAlignments(fileSam)
sam1.foreach(println)
소스 데이터:
@SQ SN:chrUn_KN707606v1_decoy LN:2200
@PG ID:bwa PN:bwa VN:0.7.12-r1039 CL:bwa samse -f hs38DHL1F1Len10.sam hs38DHL1.fa hs38DHL1F1Len10.sai hs38DHL1F1Len10.fq
chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0 0 chrUn_KN707606v1_decoy 1204 37 10M * 0 0 CTCCTCGCCA 2222222222 XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:10
읽기 결과:
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
4.BAM:
println("BAM:");
println("methods 1:");
val bam1 = ac.loadAlignments(fileBam)
bam1.foreach(println)
println("methods 2:");
val bam2 = ac.loadBam(fileBam)
BAM은 SAM의 바이너리로 바로 부호를 흐트러뜨린다
읽기 결과:
BAM:
methods 1:
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
methods 2:
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
BAM과SAM이 읽은 결과는 같고 본래 내용도 같다
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
다양한 언어의 JSONJSON은 Javascript 표기법을 사용하여 데이터 구조를 레이아웃하는 데이터 형식입니다. 그러나 Javascript가 코드에서 이러한 구조를 나타낼 수 있는 유일한 언어는 아닙니다. 저는 일반적으로 '객체'{}...
텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.