Parser of CIGAR strings.  | (ns cljam.io.sam.util.cigar
  (:require [clojure.core.memoize :as memoize]
            [proton.core :as proton])
  (:import [java.nio ByteBuffer ByteOrder])) | 
(defn parse
  "Parses a CIGAR string, returning a lazy sequence of [length operation]
  pairs. The length is parsed with Integer/parseInt and the operation is
  one of the characters M, I, D, N, S, H, P, = or X."
  [^String s]
  (map (fn [[_ len op]]
         ;; `op` is a one-character string; keep it as a char.
         [(Integer/parseInt len) (first op)])
       (re-seq #"([0-9]*)([MIDNSHP=X])" s)))
(defn simplify
  "Merges contiguous entries of a parsed CIGAR that share the same
  operation by summing their lengths. Takes a sequence of
  [length operation] pairs and returns a vector of such pairs."
  [cigs]
  (loop [pending cigs
         merged (transient [])]
    (let [[^long len op :as head] (first pending)
          others (next pending)]
      (if-not (and len op)
        (persistent! merged)
        (let [[^long next-len next-op] (first others)]
          (if (= op next-op)
            ;; Fold the neighbour into the head and look at it again, so
            ;; runs of three or more entries collapse into one.
            (recur (cons [(+ len next-len) op] (next others)) merged)
            (recur others (conj! merged head))))))))
(defn- concat!
  "Appends every item of `coll` to the transient collection `v` with
  conj!, returning the resulting transient."
  [v coll]
  (reduce (fn [acc item] (conj! acc item)) v coll))
(defn- update-last!
  "Replaces the last entry of the transient vector `coll` with `(f x)`
  when that entry has the shape [:m x]. Returns `coll` unchanged when it
  is empty or when its last entry is not tagged :m."
  [coll f]
  (let [last-idx (dec (count coll))
        ;; An empty collection yields nil here, so `tag` is nil below.
        [tag value] (when-not (neg? last-idx) (get coll last-idx))]
    (if (= :m tag)
      (assoc! coll last-idx (f value))
      coll)))
(defn to-index*
  "Converts a CIGAR string to a vector of index entries.

  P and H operations are dropped and contiguous identical operations are
  merged before processing. `pos` tracks the current offset in the read.
  Each remaining operation of length l contributes as follows:

    M, =, X  appends [:m i] for every i in [pos, pos + l) and advances pos.
    D        rewrites a trailing [:m x] entry to [:d x l], then appends
             l entries of [:m \\*].
    N        appends l entries of [:m \\>].
    S        advances pos by l without emitting entries.
    I        rewrites a trailing [:m x] entry to [:i x [pos (+ l pos)]]
             and advances pos by l.

  For D and I, the trailing entry is left untouched when the vector is
  empty or its last entry is not tagged :m."
  [^String s]
  (let [cigs (simplify (remove (comp #{\P \H} second) (parse s)))]
    (loop [[[^long l op] & xs] cigs
           pos 0
           idx (transient [])]
      (if (and l op)
        (case op
          (\M \= \X) (recur xs
                            (+ l pos)
                            (concat! idx (map (fn [x] [:m x]) (range pos (+ l pos)))))
          \D (recur xs
                    pos
                    (concat! (update-last! idx (fn [x] [:d x l])) (repeat l [:m \*])))
          \N (recur xs pos (concat! idx (repeat l [:m \>])))
          \S (recur xs (+ pos l) idx)
          \I (recur xs
                    (+ pos l)
                    (update-last! idx (fn [x] [:i x [pos (+ l pos)]]))))
        (persistent! idx)))))
(def to-index
  "Converts a CIGAR string to a sequence of indices. This is `to-index*`
  memoized through `memoize/lu`; the cache threshold is read from the
  system property \"cljam.sam.cigar.cache-size\" and falls back to 1024
  when the property does not yield a value."
  (let [configured (proton/as-int (System/getProperty "cljam.sam.cigar.cache-size"))
        threshold (or configured 1024)]
    (memoize/lu to-index* :lu/threshold threshold)))
(defn count-op
  "Returns the number of operations in a CIGAR string."
  ^long [^String s]
  (let [ops (parse s)]
    (count ops)))
(defn- count-ref-str*
  "Sums the lengths of the M, D, N, = and X operations of a CIGAR
  string. Returns 0 when there are no such operations."
  [^String s]
  (reduce +
          (for [[len op] (parse s)
                :when (#{\M \D \N \= \X} op)]
            len)))

(def ^:private count-ref-str
  "Memoized form of `count-ref-str*`."
  (memoize count-ref-str*))
(defn count-ref-bytes
  "Counts the covering length in the reference from an encoded CIGAR
  byte-array. The array is read as little-endian 32-bit words; the low
  4 bits of each word are the operation code and the remaining bits are
  the length. Only operation codes 0, 2, 3, 7 and 8 add to the total."
  ^long
  [cigar-bytes]
  (let [buf (doto (ByteBuffer/wrap cigar-bytes)
              (.order ByteOrder/LITTLE_ENDIAN))]
    (loop [total 0]
      (if-not (.hasRemaining buf)
        total
        (let [word (Integer/toUnsignedLong (.getInt buf))
              len (unsigned-bit-shift-right word 4)]
          (recur (+ total
                    (case (bit-and word 0xF)
                      (0 2 3 7 8) len
                      0))))))))
(defn decode-cigar-and-ref-length
  "Decodes an encoded CIGAR byte-array into its string form and computes
  the length of the alignment in the reference. The array is read as
  little-endian 32-bit words whose low 4 bits are the operation code
  (0-8 map to M I D N S H P = X) and whose remaining bits are the
  length. Returns a vector of [cigar, ref-length]."
  [cigar-bytes]
  (let [buf (doto (ByteBuffer/wrap cigar-bytes)
              (.order ByteOrder/LITTLE_ENDIAN))
        sb (StringBuilder.)]
    (loop [total 0]
      (if-not (.hasRemaining buf)
        [(.toString sb) total]
        (let [word (Integer/toUnsignedLong (.getInt buf))
              code (bit-and word 0xF)
              len (unsigned-bit-shift-right word 4)]
          (.append sb len)
          (.append sb (case code 0 \M 1 \I 2 \D 3 \N 4 \S 5 \H 6 \P 7 \= 8 \X))
          ;; Only codes 0, 2, 3, 7 and 8 contribute to the reference length.
          (recur (+ total
                    (case code
                      (0 2 3 7 8) len
                      0))))))))
(defn placeholder?
  "Returns a boolean indicating whether an encoded CIGAR is in the
  format of kSmN: exactly 8 bytes, where the operation code in the low
  4 bits of byte 0 is 4 (S) and that of byte 4 is 3 (N)."
  [^bytes cigar-bytes]
  (if (= 8 (alength cigar-bytes))
    (let [first-op (bit-and 0xF (aget cigar-bytes 0))
          second-op (bit-and 0xF (aget cigar-bytes 4))]
      (and (= 4 first-op)     ;; S
           (= 3 second-op)))  ;; N
    false))
(defn encode-cigar
  "Encodes a CIGAR string into a vector of longs, one per operation.
  Each value holds the operation length shifted left by 4 bits, combined
  with the operation code (M I D N S H P = X map to 0-8) in the low
  4 bits."
  [cigar]
  (into []
        (map (fn [[^long len op]]
               (let [code (case op \M 0 \I 1 \D 2 \N 3 \S 4 \H 5 \P 6 \= 7 \X 8)]
                 (bit-or (bit-shift-left len 4) code))))
        (parse cigar)))
(defmulti count-ref
  "Returns length of reference bases. Dispatches on the class of the
  argument: a CIGAR String or an encoded CIGAR byte-array."
  class)

(defmethod count-ref String
  [cigar-str]
  (count-ref-str cigar-str))

(defmethod count-ref (class (byte-array 0))
  [cigar-bytes]
  (count-ref-bytes cigar-bytes))
(defn ->placeholder
  "Creates an encoded placeholder from a given CIGAR string. The
  placeholder is in the format of kSmN where k is the read length
  (sum of M, I, S, = and X lengths) and m is the reference length
  (sum of M, D, N, = and X lengths). Returns a vector of two encoded
  values: kS followed by mN."
  [cigar-str]
  (let [[^long ref-len ^long read-len]
        (reduce (fn [[^long r ^long q] [n op]]
                  [(+ r (case op (\M \D \N \= \X) (long n) 0))
                   (+ q (case op (\M \I \S \= \X) (long n) 0))])
                [0 0]
                (parse cigar-str))]
    ;; Low 4 bits carry the operation code: 4 is S, 3 is N.
    [(bit-or (bit-shift-left read-len 4) 4)
     (bit-or (bit-shift-left ref-len 4) 3)]))