(ns cljam.io.cram.decode.record
  (:require [cljam.io.cram.seq-resolver.protocol :as resolver]
            [cljam.io.sam.util.cigar :as sam.cigar]
            [cljam.io.sam.util.flag :as sam.flag]
            [cljam.io.util.byte-buffer :as bb]))
(defn- ref-name [cram-header ^long ref-id]
  (or (when (<= 0 ref-id)
        (get-in cram-header [:SQ ref-id :SN]))
      "*"))
(defn- read-group-id [cram-header ^long rg-index]
  (when (<= 0 rg-index)
    (get-in cram-header [:RG rg-index :ID])))
(defn- build-positional-data-decoder
  [cram-header {:keys [preservation-map]} slice-header {:keys [RI RL AP RG]}]
  (let [AP' (if (:AP preservation-map)
              (let [pos (volatile! nil)]
                (fn []
                  (let [pos' (+ (long (or @pos (:start slice-header)))
                                (long (AP)))]
                    (vreset! pos pos')
                    pos')))
              AP)
        RI' (if (= (:ref-seq-id slice-header) -2)
              RI
              (constantly (:ref-seq-id slice-header)))]
    (fn [record]
      (let [rg (read-group-id cram-header (RG))]
        (assoc record
               :rname (ref-name cram-header (RI'))
               ::len (RL)
               :pos (AP')
               :options (cond-> []
                          rg
                          (conj {:RG {:type "Z" :value rg}})))))))
(defn- build-read-name-decoder [{:keys [preservation-map]} {:keys [RN]}]
  (if (:RN preservation-map)
    #(assoc % :qname (String. ^bytes (RN)))
    identity))
(defn- build-mate-read-decoder [cram-header {:keys [MF NS RN NP TS NF]}]
  (fn [{:keys [rname qname] :as record}]
    (let [flag (long (::flag record))]
      (if (pos? (bit-and flag 0x02))
        (let [mate-flag (long (MF))
              bam-flag (cond-> (long (:flag record))
                         (pos? (bit-and mate-flag 0x01))
                         (bit-or (sam.flag/encoded #{:next-reversed}))
                         (pos? (bit-and mate-flag 0x02))
                         (bit-or (sam.flag/encoded #{:next-unmapped})))
              rnext (ref-name cram-header (NS))]
          (assoc record
                 :qname (or qname (String. ^bytes (RN)))
                 :flag bam-flag
                 :rnext (if (and (= rname rnext) (not= rname "*")) "=" rnext)
                 :pnext (NP)
                 :tlen (TS)))
        (cond-> record
          (pos? (bit-and flag 0x04))
          (assoc ::next-fragment (NF)))))))
(defn- build-auxiliary-tags-decoder [{:keys [preservation-map]} {:keys [TL]} tag-decoders]
  (let [tag-decoder (fn [{tag-type :type :keys [tag]}]
                      (let [decoder (get-in tag-decoders [tag tag-type])]
                        (fn []
                          {tag (decoder)})))
        decoders (mapv (fn [tags]
                         (let [decoders (mapv tag-decoder tags)]
                           (fn []
                             (into [] (map #(%)) decoders))))
                       (:TD preservation-map))]
    (fn [record]
      (let [tl (TL)
            decoder (nth decoders tl)]
        (update record :options into (decoder))))))
(defn- record-end [record features]
  (->> features
       ^long (reduce
              (fn [^long ret f]
                (case (:code f)
                  (:insertion :softclip)
                  (- ret (alength ^bytes (:bases f)))
                  (:deletion :ref-skip)
                  (+ ret (long (:len f)))
                  :insert-base
                  (dec ret)
                  ret))
              (::len record))
       dec
       (+ (long (:pos record)))))
(defn- record-seq
  [seq-resolver {:keys [preservation-map]} {:keys [rname pos end] :as record} features]
  (let [^bytes ref-bases (resolver/resolve-sequence seq-resolver rname pos end)
        len (long (::len record))
        bs (byte-array len (byte (int \N)))
        subst (:SM preservation-map)]
    (loop [[f & more :as fs] features, rpos 0, spos 1]
      (if f
        (let [fpos (long (:pos f))
              gap (- fpos spos)]
          (if (pos? gap)
            (do (System/arraycopy ref-bases rpos bs (dec spos) gap)
                (recur fs (+ rpos gap) fpos))
            (case (:code f)
              :subst
              (let [b (char (aget ref-bases rpos))
                    b' (get-in subst [b (:subst f)])]
                (aset bs (dec fpos) (byte (int b')))
                (recur more (inc rpos) (inc fpos)))
              (:insertion :softclip)
              (let [^bytes bs' (:bases f)
                    n (alength bs')]
                (System/arraycopy bs' 0 bs (dec fpos) n)
                (recur more rpos (+ fpos n)))
              (:deletion :ref-skip)
              (recur more (+ rpos (long (:len f))) fpos)
              :insert-base
              (do (aset bs (dec fpos) (byte (:base f)))
                  (recur more rpos (inc fpos)))
              (recur more rpos spos))))
        (let [gap (- len (dec spos))]
          (when (pos? gap)
            (System/arraycopy ref-bases rpos bs (dec spos) gap))
          (run! (fn [f]
                  (case (:code f)
                    :read-base (aset bs (dec (long (:pos f))) (byte (:base f)))
                    :bases (let [^bytes bs' (:bases f)
                                 n (alength bs')]
                             (System/arraycopy bs' 0 bs (dec (long (:pos f))) n))
                    nil))
                features)
          (String. bs))))))
(defn- qual-score ^long [^long qual]
  (+ qual 33))
(defn- decode-qual ^String [^long len qs-decoder]
  (let [bb (bb/allocate-lsb-byte-buffer len)]
    (loop [i len, miss 0]
      (if (zero? i)
        (if (= len miss)
          "*"
          (String. (.array bb)))
        (let [q (long (qs-decoder))]
          (.put bb (byte (qual-score q)))
          (recur (dec i) (cond-> miss (= q -1) inc)))))))
(defn- record-qual [record features {:keys [QS]}]
  (let [flag (long (::flag record))
        len (long (::len record))]
    (if (pos? (bit-and flag 0x01))
      (decode-qual len QS)
      (let [qs (byte-array len)
            missing? (reduce
                      (fn [missing? {:keys [^long pos] :as f}]
                        (case (:code f)
                          (:read-base :score)
                          (do (aset qs (dec pos) (byte (:qual f)))
                              false)
                          :scores
                          (let [^bytes scores (:scores f)]
                            (System/arraycopy ^bytes scores 0 qs (dec pos) (alength scores))
                            false)
                          missing?))
                      true
                      features)]
        (if missing?
          "*"
          (do (dotimes [i len]
                (aset qs i (byte (qual-score (aget qs i)))))
              (String. qs)))))))
(defn- record-cigar [record features]
  (let [read-len (long (::len record))]
    (loop [[f & more :as fs] features
           pos 0
           acc (transient [])]
      (if f
        (let [p (long (:pos f))
              gap (- p (inc pos))]
          (if (pos? gap)
            (recur fs p (conj! acc [gap \M]))
            (if-let [[op ^long len] (case (:code f)
                                      (:read-base :subst) [\M 1]
                                      :insertion [\I (count (:bases f))]
                                      :softclip [\S (count (:bases f))]
                                      :hardclip [\H (:len f)]
                                      :padding [\P (:len f)]
                                      :deletion [\D (:len f)]
                                      :ref-skip [\N (:len f)]
                                      :insert-base [\I 1]
                                      nil)]
              (let [pos' (if (#{\M \I \S} op)
                           (+ p (dec len))
                           (dec p))]
                (recur more pos' (cond-> acc (pos? len) (conj! [len op]))))
              (recur more p acc))))
        (let [acc' (cond-> acc
                     (< pos read-len)
                     (conj! [(- read-len pos) \M]))]
          (some->> (not-empty (persistent! acc')) sam.cigar/simplify))))))
(defn- build-read-features-decoder [{:keys [FN FC FP BA QS BS IN SC HC PD DL RS BB QQ]}]
  (fn []
    (loop [i (long (FN))
           prev-pos 0
           fs (transient [])]
      (if (zero? i)
        (persistent! fs)
        (let [code (char (FC))
              pos (+ prev-pos (long (FP)))
              data (case code
                     \B {:code :read-base :base (BA) :qual (QS)}
                     \X {:code :subst :subst (BS)}
                     \I {:code :insertion :bases (IN)}
                     \S {:code :softclip :bases (SC)}
                     \H {:code :hardclip :len (HC)}
                     \P {:code :padding :len (PD)}
                     \D {:code :deletion :len (DL)}
                     \N {:code :ref-skip :len (RS)}
                     \i {:code :insert-base :base (BA)}
                     \b {:code :bases :bases (BB)}
                     \q {:code :scores :scores (QQ)}
                     \Q {:code :score :score (QS)})]
          (recur (dec i) pos (conj! fs (assoc data :pos pos))))))))
(defn- build-mapped-read-decoder [seq-resolver compression-header {:keys [MQ] :as decoders}]
  (let [features-decoder (build-read-features-decoder decoders)]
    (fn [record]
      (let [fs (features-decoder)
            end (record-end record fs)
            record' (assoc record :end end)]
        (assoc record'
               :mapq (MQ)
               :seq (record-seq seq-resolver compression-header record' fs)
               :qual (record-qual record' fs decoders)
               :cigar (->> (record-cigar record' fs)
                           (map (fn [[n op]] (str n op)))
                           (apply str)))))))
(defn- build-unmapped-read-decoder [{:keys [BA QS]}]
  (fn [record]
    (let [len (long (::len record))
          flag (long (::flag record))
          bb (bb/allocate-lsb-byte-buffer len)
          _ (dotimes [_ len]
              (.put bb (byte (BA))))
          record' (assoc record
                         :seq (String. (.array bb))
                         :mapq 0
                         :cigar "*"
                         :end (:pos record))]
      (if (zero? (bit-and flag 0x01))
        record'
        (assoc record' :qual (decode-qual len QS))))))

Builds a CRAM record decoder.

Returns a function with no arguments that returns a map representing a decoded CRAM record upon each call.

(defn build-cram-record-decoder
  [seq-resolver cram-header compression-header slice-header ds-decoders tag-decoders]
  (let [{:keys [BF CF]} ds-decoders
        pos-decoder (build-positional-data-decoder cram-header
                                                   compression-header
                                                   slice-header
                                                   ds-decoders)
        name-decoder (build-read-name-decoder compression-header ds-decoders)
        mate-decoder (build-mate-read-decoder cram-header ds-decoders)
        tags-decoder (build-auxiliary-tags-decoder compression-header ds-decoders tag-decoders)
        mapped-decoder (build-mapped-read-decoder seq-resolver compression-header ds-decoders)
        unmapped-decoder (build-unmapped-read-decoder ds-decoders)]
    (fn []
      (let [bf (BF)
            cf (CF)
            record' (->> {:flag bf ::flag cf}
                         (pos-decoder)
                         (name-decoder)
                         (mate-decoder)
                         (tags-decoder))]
        (if (sam.flag/unmapped? bf)
          (unmapped-decoder record')
          (mapped-decoder record'))))))
(defn- update-next-mate
  [{:keys [^long flag] :as record} {^long mate-flag :flag :as mate}]
  (assoc record
         :flag (cond-> flag
                 (sam.flag/reversed? mate-flag)
                 (bit-or (sam.flag/encoded #{:next-reversed}))
                 (sam.flag/unmapped? mate-flag)
                 (bit-or (sam.flag/encoded #{:next-unmapped})))
         :rnext (if (= (:rname record) (:rname mate))
                  "="
                  (:rname mate))
         :pnext (:pos mate)))
(defn- update-mate-records
  [{^long s1 :pos ^long e1 :end :as r1} {^long s2 :pos ^long e2 :end :as r2}]
  (let [r1' (update-next-mate r1 r2)
        r2' (update-next-mate r2 r1)]
    (if (or (sam.flag/unmapped? (:flag r1))
            (sam.flag/unmapped? (:flag r2))
            (not= (:rname r1') (:rname r2')))
      [(assoc r1' :tlen 0) (assoc r2' :tlen 0)]
      (if (<= s1 s2)
        (let [tlen (inc (- e2 s1))]
          [(assoc r1' :tlen tlen)
           (assoc r2' :tlen (- tlen))])
        (let [tlen (inc (- e1 s2))]
          [(assoc r1' :tlen (- tlen))
           (assoc r2' :tlen tlen)])))))

Resolves mate records in the same slice by updating their flags, tlen, etc.

(defn resolve-mate-records
  [^objects records]
  (dotimes [i (alength records)]
    (let [record (aget records i)]
      (when-let [nf (::next-fragment record)]
        (let [j (inc (+ i (long nf)))
              mate (aget records j)
              [record' mate'] (update-mate-records record mate)]
          (aset records i record')
          (aset records j mate'))))))
(defn- assign-record-names [qname-generator slice-header ^objects records]
  (let [counter-start (long (:counter slice-header))]
    (dotimes [i (alength records)]
      (let [record (aget records i)]
        (when (nil? (:qname record))
          ;; detached records always have qname, so if the control comes in here,
          ;; it means this record is not detached and must have ::next-fragment
          (let [j (inc (+ i (long (::next-fragment record))))
                mate (aget records j)
                qname (qname-generator (+ counter-start i))]
            (aset records i (assoc record :qname qname))
            (aset records j (assoc mate :qname qname))))))))

Decodes CRAM records in a slice all at once and returns them as a sequence of maps.

(defn decode-slice-records
  [seq-resolver qname-generator cram-header compression-header slice-header
   ds-decoders tag-decoders]
  (let [record-decoder (build-cram-record-decoder seq-resolver
                                                  cram-header
                                                  compression-header
                                                  slice-header
                                                  ds-decoders
                                                  tag-decoders)
        n (:records slice-header)
        records (object-array n)]
    (dotimes [i n]
      (aset records i (record-decoder)))
    (resolve-mate-records records)
    (when-not (get-in compression-header [:preservation-map :RN])
      (assign-record-names qname-generator slice-header records))
    (map #(dissoc % ::flag ::len ::next-fragment) records)))