(ns cljam.io.cram.decode.record
(:require [cljam.io.cram.seq-resolver.protocol :as resolver]
[cljam.io.sam.util.cigar :as sam.cigar]
[cljam.io.sam.util.flag :as sam.flag]
[cljam.io.util.byte-buffer :as bb])) | |
(defn- ref-name [cram-header ^long ref-id]
(or (when (<= 0 ref-id)
(get-in cram-header [:SQ ref-id :SN]))
"*")) | |
(defn- read-group-id [cram-header ^long rg-index]
(when (<= 0 rg-index)
(get-in cram-header [:RG rg-index :ID]))) | |
(defn- build-positional-data-decoder
[cram-header {:keys [preservation-map]} slice-header {:keys [RI RL AP RG]}]
(let [AP' (if (:AP preservation-map)
(let [pos (volatile! nil)]
(fn []
(let [pos' (+ (long (or @pos (:start slice-header)))
(long (AP)))]
(vreset! pos pos')
pos')))
AP)
RI' (if (= (:ref-seq-id slice-header) -2)
RI
(constantly (:ref-seq-id slice-header)))]
(fn [record]
(let [rg (read-group-id cram-header (RG))]
(assoc record
:rname (ref-name cram-header (RI'))
::len (RL)
:pos (AP')
:options (cond-> []
rg
(conj {:RG {:type "Z" :value rg}}))))))) | |
(defn- build-read-name-decoder [{:keys [preservation-map]} {:keys [RN]}]
(if (:RN preservation-map)
#(assoc % :qname (String. ^bytes (RN)))
identity)) | |
(defn- build-mate-read-decoder [cram-header {:keys [MF NS RN NP TS NF]}]
(fn [{:keys [rname qname] :as record}]
(let [flag (long (::flag record))]
(if (pos? (bit-and flag 0x02))
(let [mate-flag (long (MF))
bam-flag (cond-> (long (:flag record))
(pos? (bit-and mate-flag 0x01))
(bit-or (sam.flag/encoded #{:next-reversed}))
(pos? (bit-and mate-flag 0x02))
(bit-or (sam.flag/encoded #{:next-unmapped})))
rnext (ref-name cram-header (NS))]
(assoc record
:qname (or qname (String. ^bytes (RN)))
:flag bam-flag
:rnext (if (and (= rname rnext) (not= rname "*")) "=" rnext)
:pnext (NP)
:tlen (TS)))
(cond-> record
(pos? (bit-and flag 0x04))
(assoc ::next-fragment (NF))))))) | |
(defn- build-auxiliary-tags-decoder [{:keys [preservation-map]} {:keys [TL]} tag-decoders]
(let [tag-decoder (fn [{tag-type :type :keys [tag]}]
(let [decoder (get-in tag-decoders [tag tag-type])]
(fn []
{tag (decoder)})))
decoders (mapv (fn [tags]
(let [decoders (mapv tag-decoder tags)]
(fn []
(into [] (map #(%)) decoders))))
(:TD preservation-map))]
(fn [record]
(let [tl (TL)
decoder (nth decoders tl)]
(update record :options into (decoder)))))) | |
(defn- record-end [record features]
(->> features
^long (reduce
(fn [^long ret f]
(case (:code f)
(:insertion :softclip)
(- ret (alength ^bytes (:bases f)))
(:deletion :ref-skip)
(+ ret (long (:len f)))
:insert-base
(dec ret)
ret))
(::len record))
dec
(+ (long (:pos record))))) | |
(defn- record-seq
[seq-resolver {:keys [preservation-map]} {:keys [rname pos end] :as record} features]
(let [^bytes ref-bases (resolver/resolve-sequence seq-resolver rname pos end)
len (long (::len record))
bs (byte-array len (byte (int \N)))
subst (:SM preservation-map)]
(loop [[f & more :as fs] features, rpos 0, spos 1]
(if f
(let [fpos (long (:pos f))
gap (- fpos spos)]
(if (pos? gap)
(do (System/arraycopy ref-bases rpos bs (dec spos) gap)
(recur fs (+ rpos gap) fpos))
(case (:code f)
:subst
(let [b (char (aget ref-bases rpos))
b' (get-in subst [b (:subst f)])]
(aset bs (dec fpos) (byte (int b')))
(recur more (inc rpos) (inc fpos)))
(:insertion :softclip)
(let [^bytes bs' (:bases f)
n (alength bs')]
(System/arraycopy bs' 0 bs (dec fpos) n)
(recur more rpos (+ fpos n)))
(:deletion :ref-skip)
(recur more (+ rpos (long (:len f))) fpos)
:insert-base
(do (aset bs (dec fpos) (byte (:base f)))
(recur more rpos (inc fpos)))
(recur more rpos spos))))
(let [gap (- len (dec spos))]
(when (pos? gap)
(System/arraycopy ref-bases rpos bs (dec spos) gap))
(run! (fn [f]
(case (:code f)
:read-base (aset bs (dec (long (:pos f))) (byte (:base f)))
:bases (let [^bytes bs' (:bases f)
n (alength bs')]
(System/arraycopy bs' 0 bs (dec (long (:pos f))) n))
nil))
features)
(String. bs)))))) | |
(defn- qual-score ^long [^long qual] (+ qual 33)) | |
(defn- decode-qual ^String [^long len qs-decoder]
(let [bb (bb/allocate-lsb-byte-buffer len)]
(loop [i len, miss 0]
(if (zero? i)
(if (= len miss)
"*"
(String. (.array bb)))
(let [q (long (qs-decoder))]
(.put bb (byte (qual-score q)))
(recur (dec i) (cond-> miss (= q -1) inc))))))) | |
(defn- record-qual [record features {:keys [QS]}]
(let [flag (long (::flag record))
len (long (::len record))]
(if (pos? (bit-and flag 0x01))
(decode-qual len QS)
(let [qs (byte-array len)
missing? (reduce
(fn [missing? {:keys [^long pos] :as f}]
(case (:code f)
(:read-base :score)
(do (aset qs (dec pos) (byte (:qual f)))
false)
:scores
(let [^bytes scores (:scores f)]
(System/arraycopy ^bytes scores 0 qs (dec pos) (alength scores))
false)
missing?))
true
features)]
(if missing?
"*"
(do (dotimes [i len]
(aset qs i (byte (qual-score (aget qs i)))))
(String. qs))))))) | |
(defn- record-cigar [record features]
(let [read-len (long (::len record))]
(loop [[f & more :as fs] features
pos 0
acc (transient [])]
(if f
(let [p (long (:pos f))
gap (- p (inc pos))]
(if (pos? gap)
(recur fs p (conj! acc [gap \M]))
(if-let [[op ^long len] (case (:code f)
(:read-base :subst) [\M 1]
:insertion [\I (count (:bases f))]
:softclip [\S (count (:bases f))]
:hardclip [\H (:len f)]
:padding [\P (:len f)]
:deletion [\D (:len f)]
:ref-skip [\N (:len f)]
:insert-base [\I 1]
nil)]
(let [pos' (if (#{\M \I \S} op)
(+ p (dec len))
(dec p))]
(recur more pos' (cond-> acc (pos? len) (conj! [len op]))))
(recur more p acc))))
(let [acc' (cond-> acc
(< pos read-len)
(conj! [(- read-len pos) \M]))]
(some->> (not-empty (persistent! acc')) sam.cigar/simplify)))))) | |
(defn- build-read-features-decoder [{:keys [FN FC FP BA QS BS IN SC HC PD DL RS BB QQ]}]
(fn []
(loop [i (long (FN))
prev-pos 0
fs (transient [])]
(if (zero? i)
(persistent! fs)
(let [code (char (FC))
pos (+ prev-pos (long (FP)))
data (case code
\B {:code :read-base :base (BA) :qual (QS)}
\X {:code :subst :subst (BS)}
\I {:code :insertion :bases (IN)}
\S {:code :softclip :bases (SC)}
\H {:code :hardclip :len (HC)}
\P {:code :padding :len (PD)}
\D {:code :deletion :len (DL)}
\N {:code :ref-skip :len (RS)}
\i {:code :insert-base :base (BA)}
\b {:code :bases :bases (BB)}
\q {:code :scores :scores (QQ)}
\Q {:code :score :score (QS)})]
(recur (dec i) pos (conj! fs (assoc data :pos pos)))))))) | |
(defn- build-mapped-read-decoder [seq-resolver compression-header {:keys [MQ] :as decoders}]
(let [features-decoder (build-read-features-decoder decoders)]
(fn [record]
(let [fs (features-decoder)
end (record-end record fs)
record' (assoc record :end end)]
(assoc record'
:mapq (MQ)
:seq (record-seq seq-resolver compression-header record' fs)
:qual (record-qual record' fs decoders)
:cigar (->> (record-cigar record' fs)
(map (fn [[n op]] (str n op)))
(apply str))))))) | |
(defn- build-unmapped-read-decoder [{:keys [BA QS]}]
(fn [record]
(let [len (long (::len record))
flag (long (::flag record))
bb (bb/allocate-lsb-byte-buffer len)
_ (dotimes [_ len]
(.put bb (byte (BA))))
record' (assoc record
:seq (String. (.array bb))
:mapq 0
:cigar "*"
:end (:pos record))]
(if (zero? (bit-and flag 0x01))
record'
(assoc record' :qual (decode-qual len QS)))))) | |
Builds a CRAM record decoder. Returns a function with no arguments that returns a map representing a decoded CRAM record upon each call. | (defn build-cram-record-decoder
[seq-resolver cram-header compression-header slice-header ds-decoders tag-decoders]
(let [{:keys [BF CF]} ds-decoders
pos-decoder (build-positional-data-decoder cram-header
compression-header
slice-header
ds-decoders)
name-decoder (build-read-name-decoder compression-header ds-decoders)
mate-decoder (build-mate-read-decoder cram-header ds-decoders)
tags-decoder (build-auxiliary-tags-decoder compression-header ds-decoders tag-decoders)
mapped-decoder (build-mapped-read-decoder seq-resolver compression-header ds-decoders)
unmapped-decoder (build-unmapped-read-decoder ds-decoders)]
(fn []
(let [bf (BF)
cf (CF)
record' (->> {:flag bf ::flag cf}
(pos-decoder)
(name-decoder)
(mate-decoder)
(tags-decoder))]
(if (sam.flag/unmapped? bf)
(unmapped-decoder record')
(mapped-decoder record')))))) |
(defn- update-next-mate
[{:keys [^long flag] :as record} {^long mate-flag :flag :as mate}]
(assoc record
:flag (cond-> flag
(sam.flag/reversed? mate-flag)
(bit-or (sam.flag/encoded #{:next-reversed}))
(sam.flag/unmapped? mate-flag)
(bit-or (sam.flag/encoded #{:next-unmapped})))
:rnext (if (= (:rname record) (:rname mate))
"="
(:rname mate))
:pnext (:pos mate))) | |
(defn- update-mate-records
[{^long s1 :pos ^long e1 :end :as r1} {^long s2 :pos ^long e2 :end :as r2}]
(let [r1' (update-next-mate r1 r2)
r2' (update-next-mate r2 r1)]
(if (or (sam.flag/unmapped? (:flag r1))
(sam.flag/unmapped? (:flag r2))
(not= (:rname r1') (:rname r2')))
[(assoc r1' :tlen 0) (assoc r2' :tlen 0)]
(if (<= s1 s2)
(let [tlen (inc (- e2 s1))]
[(assoc r1' :tlen tlen)
(assoc r2' :tlen (- tlen))])
(let [tlen (inc (- e1 s2))]
[(assoc r1' :tlen (- tlen))
(assoc r2' :tlen tlen)]))))) | |
Resolves mate records in the same slice by updating their flags, tlen, etc. | (defn resolve-mate-records
[^objects records]
(dotimes [i (alength records)]
(let [record (aget records i)]
(when-let [nf (::next-fragment record)]
(let [j (inc (+ i (long nf)))
mate (aget records j)
[record' mate'] (update-mate-records record mate)]
(aset records i record')
(aset records j mate')))))) |
(defn- assign-record-names [qname-generator slice-header ^objects records]
(let [counter-start (long (:counter slice-header))]
(dotimes [i (alength records)]
(let [record (aget records i)]
(when (nil? (:qname record))
;; detached records always have qname, so if the control comes in here,
;; it means this record is not detached and must have ::next-fragment
(let [j (inc (+ i (long (::next-fragment record))))
mate (aget records j)
qname (qname-generator (+ counter-start i))]
(aset records i (assoc record :qname qname))
(aset records j (assoc mate :qname qname)))))))) | |
Decodes CRAM records in a slice all at once and returns them as a sequence of maps. | (defn decode-slice-records
[seq-resolver qname-generator cram-header compression-header slice-header
ds-decoders tag-decoders]
(let [record-decoder (build-cram-record-decoder seq-resolver
cram-header
compression-header
slice-header
ds-decoders
tag-decoders)
n (:records slice-header)
records (object-array n)]
(dotimes [i n]
(aset records i (record-decoder)))
(resolve-mate-records records)
(when-not (get-in compression-header [:preservation-map :RN])
(assign-record-names qname-generator slice-header records))
(map #(dissoc % ::flag ::len ::next-fragment) records))) |