Converters between equivalent formats: SAM/BAM and FASTA/TwoBit. | (ns cljam.algo.convert (:require [cljam.common :refer [*n-threads* get-exec-n-threads]] [cljam.io.bam.encoder :as encoder] [cljam.io.fastq :as fq] [cljam.io.sam :as sam] [cljam.io.sam.util.flag :as flag] [cljam.io.sam.util.refs :as refs] [cljam.io.sequence :as cseq] [cljam.io.util :as io-util] [cljam.util.sequence :as util-seq] [clojure.string :as cstr] [clojure.tools.logging :as logging] [com.climate.claypoole :as cp]) (:import [cljam.io.fastq FASTQRead] [java.io ByteArrayOutputStream])) |
SAM <-> BAM | |
(def ^:private default-num-block 100000) | |
(defn- sam-write-alignments [rdr wtr hdr num-block] (when (and (pos? (int *n-threads*)) (> (get-exec-n-threads) 1)) (logging/warn "Concurrent SAM writing is not supported.")) (doseq [alns (partition-all num-block (sam/read-alignments rdr {}))] (sam/write-alignments wtr alns hdr))) | |
(defn- bam-write-alignments [rdr wtr hdr num-block] (let [refs (refs/make-refs hdr) n-threads (get-exec-n-threads)] (doseq [blocks (cp/pmap (if (= n-threads 1) :serial (dec n-threads)) (fn [chunk'] (mapv #(let [baos (ByteArrayOutputStream. (encoder/get-block-size %))] (encoder/encode-alignment baos % refs) {:data (.toByteArray baos)}) chunk')) (partition-all num-block (sam/read-alignments rdr {})))] (sam/write-blocks wtr blocks)))) | |
(defn- convert-sam* [rdr wtr num-block write-alignments-fn] (let [hdr (sam/read-header rdr)] (sam/write-header wtr hdr) (sam/write-refs wtr hdr) (write-alignments-fn rdr wtr hdr num-block))) | |
Converts file format between SAM and BAM based on the file extension. | (defn convert-sam [in out & {:keys [n-threads num-block create-index?] :or {n-threads 0, num-block default-num-block, create-index? false}}] (with-open [rdr (sam/reader in) wtr (sam/writer out create-index?)] (binding [*n-threads* n-threads] (cond (io-util/sam-writer? wtr) (convert-sam* rdr wtr num-block sam-write-alignments) (io-util/bam-writer? wtr) (convert-sam* rdr wtr num-block bam-write-alignments) :else (throw (ex-info (str "Unsupported output file format " out) {}))))) nil) |
FASTA <-> TwoBit | |
Converts file format between FASTA and TwoBit based on the file extension. | (defn convert-sequence ([in out] (convert-sequence identity in out)) ([xf in out] (with-open [rdr (cseq/reader in) wtr (cseq/writer out {:index (if (cseq/indexed? rdr) (cseq/read-indices rdr) (logging/warn "Non-indexed sequence may use stupendous memory."))})] (cseq/write-sequences wtr (sequence xf (cseq/read-all-sequences rdr {:mask? true})))))) |
FASTQ -> FASTA or TwoBit | |
Converts a FASTQ file to a FASTA or TwoBit sequence file. | (defn fq->seq ([in out] (fq->seq identity in out)) ([xf in out] (with-open [rdr (fq/reader in) wtr (cseq/writer out)] (->> (fq/read-sequences rdr {:decode-quality nil}) (sequence xf) (cseq/write-sequences wtr))))) |
SAM -> FASTQ | |
Append casava 1.8 style R1/R2 suffix to the query name. | (defn long-qname [aln] (let [r (flag/r1r2 (:flag aln))] (if (zero? r) aln (update aln :qname str \space r ":N:0:1")))) |
Append _R1 or _R2 to the query name. | (defn medium-qname [aln] (let [r (flag/r1r2 (:flag aln))] (if (zero? r) aln (update aln :qname str "_R" r)))) |
Append /1 or /2 to the query name. | (defn short-qname [aln] (let [r (flag/r1r2 (:flag aln))] (if (zero? r) aln (update aln :qname str \/ r)))) |
(defn- null-or-star? [^String qual] (or (nil? qual) (and (= 1 (.length qual)) (= \* (.charAt qual 0))))) | |
Converts a SAM alignment record to a FASTQ read. | (defn aln->read [{:keys [flag ^String qual qname] ^String seq' :seq}] (let [reversed? (flag/reversed? flag)] (FASTQRead. qname (if reversed? (util-seq/revcomp seq') seq') (if (null-or-star? qual) (cstr/join (repeat (.length seq') \")) (if reversed? (cstr/reverse qual) qual))))) |
(defn- sam->fq-rf [w0 w1 w2 _ aln] (when-let [w (case (int (flag/r1r2 (:flag aln))) 0 w0 1 w1 2 w2)] (fq/write-sequences w [(aln->read aln)] {:encode-quality nil}))) | |
Converts a SAM/BAM to a FASTQ file. | (defn sam->fq ([xf in out] (with-open [rdr (sam/reader in) wtr (fq/writer out)] (fq/write-sequences wtr (sequence (comp xf (filter (comp flag/primary? :flag)) (map aln->read)) (sam/read-alignments rdr)) {:encode-quality nil}))) ([xf in out-r1 out-r2] (with-open [rdr (sam/reader in) wtr1 (fq/writer out-r1) wtr2 (fq/writer out-r2)] (->> (sam/read-alignments rdr) (transduce (comp xf (filter (comp flag/primary? :flag))) (completing (partial sam->fq-rf nil wtr1 wtr2)) nil)))) ([xf in out-r0 out-r1 out-r2] (with-open [rdr (sam/reader in) wtr0 (fq/writer out-r0) wtr1 (fq/writer out-r1) wtr2 (fq/writer out-r2)] (->> (sam/read-alignments rdr) (transduce (comp xf (filter (comp flag/primary? :flag))) (completing (partial sam->fq-rf wtr0 wtr1 wtr2)) nil))))) |
General converter | |
(defn- file-type [f] (when f (io-util/file-type f))) | |
(def ^:private alignment-io? (every-pred (comp #{:sam :bam} file-type))) | |
(def ^:private sequence-io? (every-pred (comp #{:fasta :2bit} file-type))) | |
(def ^:private read-io? (every-pred (comp #{:fastq} file-type))) | |
Converts file format from input file to output file by the file extension. | (defn convert [in out & opts] (cond (and (alignment-io? in) (sequential? out) (apply read-io? out)) (apply sam->fq (map short-qname) in out) (and (alignment-io? in) (not (sequential? out)) (read-io? out)) (sam->fq (map short-qname) in out) (alignment-io? in out) (apply convert-sam in out opts) (sequence-io? in out) (convert-sequence in out) (and (read-io? in) (sequence-io? out)) (fq->seq in out) :else (throw (ex-info (str "Unsupported I/O pair: " in " and " out) {})))) |