| |
Converters between equivalent formats: SAM/BAM and FASTA/TwoBit.
| ( ns cljam.algo.convert
( :require [ cljam.common :refer [ *n-threads* get-exec-n-threads ] ]
[ cljam.io.bam.encoder :as encoder ]
[ cljam.io.fastq :as fq ]
[ cljam.io.sam :as sam ]
[ cljam.io.sam.util.flag :as flag ]
[ cljam.io.sam.util.refs :as refs ]
[ cljam.io.sequence :as cseq ]
[ cljam.io.util :as io-util ]
[ cljam.util.sequence :as util-seq ]
[ clojure.string :as cstr ]
[ clojure.tools.logging :as logging ]
[ com.climate.claypoole :as cp ] )
( :import [ cljam.io.fastq FASTQRead ]
[ java.io ByteArrayOutputStream ] ) )
|
|
SAM <-> BAM
| |
| ( def ^ :private default-num-block 100000 )
|
|
| ( defn- sam-write-alignments [ rdr wtr hdr num-block ]
( when ( and ( pos? ( int *n-threads* ) ) ( > ( get-exec-n-threads ) 1 ) )
( logging/warn "Concurrent SAM writing is not supported." ) )
( doseq [ alns ( partition-all num-block ( sam/read-alignments rdr { } ) ) ]
( sam/write-alignments wtr alns hdr ) ) )
|
|
| ( defn- bam-write-alignments [ rdr wtr hdr num-block ]
( let [ refs ( refs/make-refs hdr )
n-threads ( get-exec-n-threads ) ]
( doseq [ blocks ( cp/pmap ( if ( = n-threads 1 ) :serial ( dec n-threads ) )
( fn [ chunk' ]
( mapv #( let [ baos ( ByteArrayOutputStream. ( encoder/get-block-size % ) ) ]
( encoder/encode-alignment baos % refs )
{ :data ( .toByteArray baos ) } )
chunk' ) )
( partition-all num-block ( sam/read-alignments rdr { } ) ) ) ]
( sam/write-blocks wtr blocks ) ) ) )
|
|
| ( defn- convert-sam*
[ rdr wtr num-block write-alignments-fn ]
( let [ hdr ( sam/read-header rdr ) ]
( sam/write-header wtr hdr )
( sam/write-refs wtr hdr )
( write-alignments-fn rdr wtr hdr num-block ) ) )
|
|
Converts file format between SAM and BAM based on the file extension.
| ( defn convert-sam
[ in out & { :keys [ n-threads num-block create-index? ]
:or { n-threads 0 , num-block default-num-block , create-index? false } } ]
( with-open [ rdr ( sam/reader in )
wtr ( sam/writer out create-index? ) ]
( binding [ *n-threads* n-threads ]
( cond
( io-util/sam-writer? wtr ) ( convert-sam* rdr wtr num-block sam-write-alignments )
( io-util/bam-writer? wtr ) ( convert-sam* rdr wtr num-block bam-write-alignments )
:else ( throw ( ex-info ( str "Unsupported output file format " out ) { } ) ) ) ) )
nil )
|
|
FASTA <-> TwoBit
| |
Converts file format between FASTA and TwoBit based on the file extension.
| ( defn convert-sequence
( [ in out ]
( convert-sequence identity in out ) )
( [ xf in out ]
( with-open [ rdr ( cseq/reader in )
wtr ( cseq/writer out { :index ( if ( cseq/indexed? rdr )
( cseq/read-indices rdr )
( logging/warn "Non-indexed sequence may use stupendous memory." ) ) } ) ]
( cseq/write-sequences wtr ( sequence xf ( cseq/read-all-sequences rdr { :mask? true } ) ) ) ) ) )
|
|
FASTQ -> FASTA or TwoBit
| |
Converts a FASTQ file to a FASTA or TwoBit sequence file.
| ( defn fq->seq
( [ in out ]
( fq->seq identity in out ) )
( [ xf in out ]
( with-open [ rdr ( fq/reader in )
wtr ( cseq/writer out ) ]
( ->> ( fq/read-sequences rdr { :decode-quality nil } )
( sequence xf )
( cseq/write-sequences wtr ) ) ) ) )
|
|
SAM -> FASTQ
| |
Append casava 1.8 style R1/R2 suffix to the query name.
| ( defn long-qname
[ aln ]
( let [ r ( flag/r1r2 ( :flag aln ) ) ]
( if ( zero? r )
aln
( update aln :qname str \space r ":N:0:1" ) ) ) )
|
|
Append _R1 or _R2 to the query name.
| ( defn medium-qname
[ aln ]
( let [ r ( flag/r1r2 ( :flag aln ) ) ]
( if ( zero? r )
aln
( update aln :qname str "_R" r ) ) ) )
|
|
Append /1 or /2 to the query name.
| ( defn short-qname
[ aln ]
( let [ r ( flag/r1r2 ( :flag aln ) ) ]
( if ( zero? r )
aln
( update aln :qname str \/ r ) ) ) )
|
|
| ( defn- null-or-star? [ ^ String qual ]
( or ( nil? qual )
( and ( = 1 ( .length qual ) )
( = \* ( .charAt qual 0 ) ) ) ) )
|
|
Converts a SAM alignment record to a FASTQ read.
| ( defn aln->read
[ { :keys [ flag ^ String qual qname ] ^ String seq' :seq } ]
( let [ reversed? ( flag/reversed? flag ) ]
( FASTQRead.
qname
( if reversed? ( util-seq/revcomp seq' ) seq' )
( if ( null-or-star? qual )
( cstr/join ( repeat ( .length seq' ) \" ) )
( if reversed?
( cstr/reverse qual )
qual ) ) ) ) )
|
|
| ( defn- sam->fq-rf
[ w0 w1 w2 _ aln ]
( when-let [ w ( case ( int ( flag/r1r2 ( :flag aln ) ) ) 0 w0 1 w1 2 w2 ) ]
( fq/write-sequences w [ ( aln->read aln ) ] { :encode-quality nil } ) ) )
|
|
Converts a SAM/BAM to a FASTQ file.
| ( defn sam->fq
( [ xf in out ]
( with-open [ rdr ( sam/reader in )
wtr ( fq/writer out ) ]
( fq/write-sequences
wtr
( sequence
( comp
xf
( filter ( comp flag/primary? :flag ) )
( map aln->read ) )
( sam/read-alignments rdr ) )
{ :encode-quality nil } ) ) )
( [ xf in out-r1 out-r2 ]
( with-open [ rdr ( sam/reader in )
wtr1 ( fq/writer out-r1 )
wtr2 ( fq/writer out-r2 ) ]
( ->> ( sam/read-alignments rdr )
( transduce
( comp
xf
( filter ( comp flag/primary? :flag ) ) )
( completing ( partial sam->fq-rf nil wtr1 wtr2 ) ) nil ) ) ) )
( [ xf in out-r0 out-r1 out-r2 ]
( with-open [ rdr ( sam/reader in )
wtr0 ( fq/writer out-r0 )
wtr1 ( fq/writer out-r1 )
wtr2 ( fq/writer out-r2 ) ]
( ->> ( sam/read-alignments rdr )
( transduce
( comp
xf
( filter ( comp flag/primary? :flag ) ) )
( completing ( partial sam->fq-rf wtr0 wtr1 wtr2 ) ) nil ) ) ) ) )
|
|
General converter
| |
| ( defn- file-type [ f ] ( when f ( io-util/file-type f ) ) )
|
|
| ( def ^ :private alignment-io? ( every-pred ( comp #{ :sam :bam } file-type ) ) )
|
|
| ( def ^ :private sequence-io? ( every-pred ( comp #{ :fasta :2bit } file-type ) ) )
|
|
| ( def ^ :private read-io? ( every-pred ( comp #{ :fastq } file-type ) ) )
|
|
Converts file format from input file to output file by the file extension.
| ( defn convert
[ in out & opts ]
( cond
( and ( alignment-io? in ) ( sequential? out ) ( apply read-io? out ) ) ( apply sam->fq ( map short-qname ) in out )
( and ( alignment-io? in ) ( not ( sequential? out ) ) ( read-io? out ) ) ( sam->fq ( map short-qname ) in out )
( alignment-io? in out ) ( apply convert-sam in out opts )
( sequence-io? in out ) ( convert-sequence in out )
( and ( read-io? in ) ( sequence-io? out ) ) ( fq->seq in out )
:else ( throw ( ex-info ( str "Unsupported I/O pair: " in " and " out ) { } ) ) ) )
|
|
| |