Utility functions for SAM optional fields.

(ns cljam.io.sam.util.option
  (:require [clojure.string :as cstr]
            [proton.core :as p]))

parse

(defn- parse-tag-single
  "Converts `val`, the textual value of one optional field, according to the
  type character `val-type`:

    Z                 -> `val` unchanged
    A                 -> the first character of `val`
    c C s S i I       -> `(p/as-long val)`
    f                 -> `(p/as-double val)`
    H                 -> `(p/hex->bytes val)`

  Any other type character throws an Exception naming the type and value.
  Array fields (type B) are not handled here."
  [val-type val]
  (case val-type
    \A (first val)
    \Z val
    ;; All integer widths share one conversion.
    (\c \C \s \S \i \I) (p/as-long val)
    \f (p/as-double val)
    \H (p/hex->bytes val)
    (throw (Exception. (format "Unrecognized tag type: %s, for value: %s"
                               val-type
                               val)))))
(defn parse-optional-field
  "Parses one optional field written as TAG:TYPE:VALUE.

  `op` is split on colons into at most three parts, so colons inside the value
  are preserved. Returns a single-entry map `{:TAG {:type TYPE, :value v}}`
  where TYPE is kept as a string. When the type starts with B the value is left
  as the raw string; otherwise it is converted by `parse-tag-single`."
  [op]
  (let [[tag type-str raw] (cstr/split op #":" 3)
        type-char (first type-str)
        parsed (if (= \B type-char)
                 raw
                 (parse-tag-single type-char raw))]
    {(keyword tag) {:type type-str
                    :value parsed}}))

stringify

(defn- stringify-tag-value
  "Prepares an optional-field value for printing. A byte array is rendered as
  upper-case hexadecimal digits, two per byte; any other value is returned
  unchanged so the caller stringifies it as before."
  [value]
  (if (instance? (Class/forName "[B") value)
    ;; Mask each byte to 0-255 so negative bytes do not print sign-extended.
    (apply str (map #(format "%02X" (bit-and % 0xff)) value))
    value))

(defn stringify-optional-fields
  "Serializes a sequence of optional fields into a tab-separated string.

  `options` is a sequence of single-entry maps `{tag {:type t, :value v}}`.
  Each entry becomes `TAG:TYPE:VALUE` and the entries are joined with tabs.
  Byte-array values (presumably what `p/hex->bytes` yields for H-typed fields)
  are written as hexadecimal text instead of the array's object identity.
  Returns an empty string for an empty `options`."
  [options]
  (->> options
       (map
        (fn [op]
          (let [[tag {:keys [type value]}] (first (seq op))]
            (cstr/join \: [(name tag) type (stringify-tag-value value)]))))
       (cstr/join \tab)))

accessors

(defn value-for-tag
  "Returns the `:value` of the optional field `tag` in the alignment `aln`.

  `tag` is applied as a function to each element of `(:options aln)`, so it is
  expected to be a keyword such as `:NM`. The first element that yields a
  truthy result is used. Returns nil when no element carries the tag."
  [tag aln]
  (->> aln
       :options
       (some tag)
       :value))

CIGAR string of the mate alignment.

(def
  ^{:doc "Returns the value of the MC optional field of `aln`: the CIGAR string
  of the mate alignment. Returns nil when the alignment has no MC field."
    :arglists '([aln])}
  mate-cigar
  (partial value-for-tag :MC))

Score of the alignment.

(def
  ^{:doc "Returns the value of the AS optional field of `aln`: the score of the
  alignment. Returns nil when the alignment has no AS field."
    :arglists '([aln])}
  score
  (partial value-for-tag :AS))

Barcode sequence.

(def
  ^{:doc "Returns the value of the BC optional field of `aln`: the barcode
  sequence. Returns nil when the alignment has no BC field."
    :arglists '([aln])}
  barcode
  (partial value-for-tag :BC))

Edit distance from reference of the alignment.

(def
  ^{:doc "Returns the value of the NM optional field of `aln`: the edit distance
  of the alignment from the reference. Returns nil when the alignment has no
  NM field."
    :arglists '([aln])}
  edit-distance
  (partial value-for-tag :NM))

Parse mismatching positions in the SAM optional fields. Returns a sequence consisting of vectors which are one of [:match matching-length], [:mismatch a-reference-base-char] and [:deletion reference-bases-string].

(defn parse-mismatching-positions-str
  "Parses the mismatching-positions string `s` of a SAM optional field.

  Returns a sequence of vectors, each one of
    [:match matching-length]
    [:mismatch a-reference-base-char]
    [:deletion reference-bases-string]
  The sequence starts with the leading match length, followed by a
  mismatch-or-deletion entry and a match entry for every subsequent segment.
  Returns nil when `s` is nil."
  [s]
  (when s
    (let [[_ leading remainder] (re-matches #"(\d+)(.*)" s)
          leading-entry [:match (p/as-long leading)]
          ;; Each hit is [whole base-or-deletion base deleted-bases run-length].
          hits (re-seq #"(([A-Z])|\^([A-Z]+))(\d+)" remainder)
          segments (for [[_ _ base deleted run-length] hits
                         entry [(if base
                                  [:mismatch (first base)]
                                  [:deletion deleted])
                                [:match (p/as-long run-length)]]]
                     entry)]
      (cons leading-entry segments))))

Mismatching positions and bases of the alignment.

(def
  ^{:doc "Returns the mismatching positions and bases of `aln`, obtained by
  parsing the value of its MD optional field with
  `parse-mismatching-positions-str`. Returns nil when the alignment has no MD
  field."
    :arglists '([aln])}
  mismatching-positions
  (comp parse-mismatching-positions-str (partial value-for-tag :MD)))

Ratio of the primary alignment score and the alternative one.

(def
  ^{:doc "Returns the value of the pa optional field of `aln`: the ratio of the
  primary alignment score and the alternative one. Returns nil when the
  alignment has no pa field."
    :arglists '([aln])}
  primary-to-alt-score
  (partial value-for-tag :pa))

Parse serialized supplementary alignments.

(defn parse-supplementary-alignments-str
  "Parses the serialized supplementary alignments in `s`.

  Every `rname,pos,strand,cigar,mapq,edit-distance;` record found in `s`
  becomes a map with the keys :rname, :pos, :strand, :cigar, :mapq and
  :edit-distance. :pos, :mapq and :edit-distance are converted with
  `p/as-long`; :strand is :forward for + and :reverse for -; :rname and :cigar
  stay strings. Returns nil when `s` is nil."
  [s]
  (when s
    (let [strand->kw (fn [^String strand]
                       (case (.charAt strand 0)
                         \+ :forward
                         \- :reverse))]
      (for [[_ rname pos strand cigar mapq nm]
            (re-seq #"(\S+?),(\d+),([+-]),((?:\d+[MIDSH])+),(\d+),(\d+);" s)]
        {:rname rname
         :pos (p/as-long pos)
         :strand (strand->kw strand)
         :cigar cigar
         :mapq (p/as-long mapq)
         :edit-distance (p/as-long nm)}))))

List of supplementary alignments.

(def
  ^{:doc "Returns the list of supplementary alignments of `aln`, obtained by
  parsing the value of its SA optional field with
  `parse-supplementary-alignments-str`. Returns nil when the alignment has no
  SA field."
    :arglists '([aln])}
  supplementary-alignments
  (comp parse-supplementary-alignments-str (partial value-for-tag :SA)))

Parse serialized alternative hits.

(defn parse-alternative-hits-str
  "Parses the serialized alternative hits in `s`.

  Every `rname,pos,cigar,edit-distance;` record found in `s` becomes a map with
  the keys :rname, :pos, :cigar and :edit-distance. The position may carry a
  leading + or - sign; it and the edit distance are converted with `p/as-long`,
  while :rname and :cigar stay strings. Returns nil when `s` is nil."
  [s]
  (when s
    (for [[_ rname pos cigar nm]
          (re-seq #"(\S+?),([+-]?\d+),((?:\d+[MIDSH])+),(\d+);" s)]
      {:rname rname
       :pos (p/as-long pos)
       :cigar cigar
       :edit-distance (p/as-long nm)})))

List of alternative alignments.

(def
  ^{:doc "Returns the list of alternative alignments of `aln`, obtained by
  parsing the value of its XA optional field with
  `parse-alternative-hits-str`. Returns nil when the alignment has no XA
  field."
    :arglists '([aln])}
  alternative-hits
  (comp parse-alternative-hits-str (partial value-for-tag :XA)))

Suboptimal alignment score.

(def
  ^{:doc "Returns the value of the XS optional field of `aln`: the suboptimal
  alignment score. Returns nil when the alignment has no XS field."
    :arglists '([aln])}
  suboptimal-score
  (partial value-for-tag :XS))

Name of read group of the alignment.

(def
  ^{:doc "Returns the value of the RG optional field of `aln`: the name of the
  read group of the alignment. Returns nil when the alignment has no RG field."
    :arglists '([aln])}
  read-group
  (partial value-for-tag :RG))

Comment of reference sequence.

(def
  ^{:doc "Returns the value of the XR optional field of `aln`: the comment of
  the reference sequence. Returns nil when the alignment has no XR field."
    :arglists '([aln])}
  ref-comment
  (partial value-for-tag :XR))