Converters between equivalent formats: SAM/BAM and FASTA/TwoBit. | (ns cljam.algo.convert
(:require [cljam.common :refer [*n-threads* get-exec-n-threads]]
[cljam.io.bam.encoder :as encoder]
[cljam.io.fastq :as fq]
[cljam.io.sam :as sam]
[cljam.io.sam.util.flag :as flag]
[cljam.io.sam.util.refs :as refs]
[cljam.io.sequence :as cseq]
[cljam.io.util :as io-util]
[cljam.util.sequence :as util-seq]
[clojure.string :as cstr]
[clojure.tools.logging :as logging]
[com.climate.claypoole :as cp])
(:import [cljam.io.fastq FASTQRead]
[java.io ByteArrayOutputStream])) |
SAM <-> BAM | |
(def ^:private default-num-block 100000) | |
(defn- sam-write-alignments [rdr wtr hdr num-block]
(when (and (pos? (int *n-threads*)) (> (get-exec-n-threads) 1))
(logging/warn "Concurrent SAM writing is not supported."))
(doseq [alns (partition-all num-block (sam/read-alignments rdr {}))]
(sam/write-alignments wtr alns hdr))) | |
(defn- bam-write-alignments [rdr wtr hdr num-block]
(let [refs (refs/make-refs hdr)
n-threads (get-exec-n-threads)]
(doseq [blocks (cp/pmap (if (= n-threads 1) :serial (dec n-threads))
(fn [chunk']
(mapv #(let [baos (ByteArrayOutputStream. (encoder/get-block-size %))]
(encoder/encode-alignment baos % refs)
{:data (.toByteArray baos)})
chunk'))
(partition-all num-block (sam/read-alignments rdr {})))]
(sam/write-blocks wtr blocks)))) | |
(defn- convert-sam*
[rdr wtr num-block write-alignments-fn]
(let [hdr (sam/read-header rdr)]
(sam/write-header wtr hdr)
(sam/write-refs wtr hdr)
(write-alignments-fn rdr wtr hdr num-block))) | |
Converts file format between SAM and BAM based on the file extension. | (defn convert-sam
[in out & {:keys [n-threads num-block create-index?]
:or {n-threads 0, num-block default-num-block, create-index? false}}]
(with-open [rdr (sam/reader in)
wtr (sam/writer out create-index?)]
(binding [*n-threads* n-threads]
(cond
(io-util/sam-writer? wtr) (convert-sam* rdr wtr num-block sam-write-alignments)
(io-util/bam-writer? wtr) (convert-sam* rdr wtr num-block bam-write-alignments)
:else (throw (ex-info (str "Unsupported output file format " out) {})))))
nil) |
FASTA <-> TwoBit | |
Converts file format between FASTA and TwoBit based on the file extension. | (defn convert-sequence
([in out]
(convert-sequence identity in out))
([xf in out]
(with-open [rdr (cseq/reader in)
wtr (cseq/writer out {:index (if (cseq/indexed? rdr)
(cseq/read-indices rdr)
(logging/warn "Non-indexed sequence may use stupendous memory."))})]
(cseq/write-sequences wtr (sequence xf (cseq/read-all-sequences rdr {:mask? true})))))) |
FASTQ -> FASTA or TwoBit | |
Converts a FASTQ file to a FASTA or TwoBit sequence file. | (defn fq->seq
([in out]
(fq->seq identity in out))
([xf in out]
(with-open [rdr (fq/reader in)
wtr (cseq/writer out)]
(->> (fq/read-sequences rdr {:decode-quality nil})
(sequence xf)
(cseq/write-sequences wtr))))) |
SAM -> FASTQ | |
Append casava 1.8 style R1/R2 suffix to the query name. | (defn long-qname
[aln]
(let [r (flag/r1r2 (:flag aln))]
(if (zero? r)
aln
(update aln :qname str \space r ":N:0:1")))) |
Append _R1 or _R2 to the query name. | (defn medium-qname
[aln]
(let [r (flag/r1r2 (:flag aln))]
(if (zero? r)
aln
(update aln :qname str "_R" r)))) |
Append /1 or /2 to the query name. | (defn short-qname
[aln]
(let [r (flag/r1r2 (:flag aln))]
(if (zero? r)
aln
(update aln :qname str \/ r)))) |
(defn- null-or-star? [^String qual]
(or (nil? qual)
(and (= 1 (.length qual))
(= \* (.charAt qual 0))))) | |
Converts a SAM alignment record to a FASTQ read. | (defn aln->read
[{:keys [flag ^String qual qname] ^String seq' :seq}]
(let [reversed? (flag/reversed? flag)]
(FASTQRead.
qname
(if reversed? (util-seq/revcomp seq') seq')
(if (null-or-star? qual)
(cstr/join (repeat (.length seq') \"))
(if reversed?
(cstr/reverse qual)
qual))))) |
(defn- sam->fq-rf
[w0 w1 w2 _ aln]
(when-let [w (case (int (flag/r1r2 (:flag aln))) 0 w0 1 w1 2 w2)]
(fq/write-sequences w [(aln->read aln)] {:encode-quality nil}))) | |
Converts a SAM/BAM to a FASTQ file. | (defn sam->fq
([xf in out]
(with-open [rdr (sam/reader in)
wtr (fq/writer out)]
(fq/write-sequences
wtr
(sequence
(comp
xf
(filter (comp flag/primary? :flag))
(map aln->read))
(sam/read-alignments rdr))
{:encode-quality nil})))
([xf in out-r1 out-r2]
(with-open [rdr (sam/reader in)
wtr1 (fq/writer out-r1)
wtr2 (fq/writer out-r2)]
(->> (sam/read-alignments rdr)
(transduce
(comp
xf
(filter (comp flag/primary? :flag)))
(completing (partial sam->fq-rf nil wtr1 wtr2)) nil))))
([xf in out-r0 out-r1 out-r2]
(with-open [rdr (sam/reader in)
wtr0 (fq/writer out-r0)
wtr1 (fq/writer out-r1)
wtr2 (fq/writer out-r2)]
(->> (sam/read-alignments rdr)
(transduce
(comp
xf
(filter (comp flag/primary? :flag)))
(completing (partial sam->fq-rf wtr0 wtr1 wtr2)) nil))))) |
General converter | |
(defn- file-type [f] (when f (io-util/file-type f))) | |
(def ^:private alignment-io? (every-pred (comp #{:sam :bam} file-type))) | |
(def ^:private sequence-io? (every-pred (comp #{:fasta :2bit} file-type))) | |
(def ^:private read-io? (every-pred (comp #{:fastq} file-type))) | |
Converts file format from input file to output file by the file extension. | (defn convert
[in out & opts]
(cond
(and (alignment-io? in) (sequential? out) (apply read-io? out)) (apply sam->fq (map short-qname) in out)
(and (alignment-io? in) (not (sequential? out)) (read-io? out)) (sam->fq (map short-qname) in out)
(alignment-io? in out) (apply convert-sam in out opts)
(sequence-io? in out) (convert-sequence in out)
(and (read-io? in) (sequence-io? out)) (fq->seq in out)
:else (throw (ex-info (str "Unsupported I/O pair: " in " and " out) {})))) |