Parser of CIGAR strings. | (ns cljam.io.sam.util.cigar
(:require [clojure.core.memoize :as memoize]
[proton.core :as proton])
(:import [java.nio ByteBuffer ByteOrder])) |
(defn parse
  "Splits the CIGAR string `s` into its operations.

  Returns a lazy sequence of `[length op]` vectors, one for each
  `<digits><op>` match found in `s`, where `length` is the count parsed with
  `Integer/parseInt` and `op` is the operation character (one of
  `M I D N S H P = X`). Text that does not match the pattern (e.g. `*`) is
  skipped."
  [^String s]
  (map (fn [[_ digits op-str]]
         [(Integer/parseInt digits) (first op-str)])
       (re-seq #"([0-9]*)([MIDNSHP=X])" s)))
(defn simplify
  "Merges runs of adjacent entries that share the same operation.

  `cigs` is a sequence of `[length op]` pairs as produced by `parse`. Adjacent
  pairs with an equal `op` are collapsed into a single pair whose length is
  the sum of the run. Returns a vector. Processing stops at the first entry
  whose length or op is missing."
  [cigs]
  (loop [pending (seq cigs)
         merged (transient [])]
    (let [[len op :as head] (first pending)]
      (if-not (and len op)
        (persistent! merged)
        (let [[next-len next-op] (second pending)]
          (if (= op next-op)
            ;; Fold the following entry into the current one and re-examine
            ;; the combined entry against whatever comes after it.
            (recur (cons [(+ (long len) (long next-len)) op] (nnext pending))
                   merged)
            (recur (next pending) (conj! merged head))))))))
(defn- concat!
  "Appends every element of `coll`, in order, to the transient collection `v`
  using `conj!` and returns the resulting transient."
  [v coll]
  (loop [acc v
         remaining (seq coll)]
    (if remaining
      (recur (conj! acc (first remaining)) (next remaining))
      acc)))
(defn- update-last!
  "Rewrites the last entry of the transient vector `coll` when it is tagged
  `:m`.

  Entries are `[tag payload ...]` vectors. If `coll` is non-empty and its last
  entry's tag is `:m`, that entry is replaced with `(f payload)`; otherwise
  `coll` is returned unchanged."
  [coll f]
  (let [last-idx (dec (count coll))
        ;; Only look at the final entry when there is one.
        [tag payload] (when-not (neg? last-idx) (get coll last-idx))]
    (if (= :m tag)
      (assoc! coll last-idx (f payload))
      coll)))
(defn to-index*
  "Converts the CIGAR string `s` to a vector of index entries.

  Padding (`P`) and hard-clip (`H`) operations are dropped and adjacent equal
  operations are merged before processing. Each remaining operation then
  contributes as follows:

  - `M`, `=`, `X`: one `[:m i]` entry per base, where `i` is the running
    position in the read.
  - `D`: the preceding `:m` entry (if any) becomes `[:d x len]`, followed by
    `len` entries of `[:m \\*]`.
  - `N`: `len` entries of `[:m \\>]`.
  - `S`: advances the read position without emitting entries.
  - `I`: the preceding `:m` entry (if any) becomes `[:i x [start end]]`, where
    `start`/`end` delimit the inserted read positions; the read position is
    advanced."
  [^String s]
  (let [ops (->> (parse s)
                 (remove (fn [[_ op]] (#{\P \H} op)))
                 simplify)]
    ;; `ref-pos` is advanced alongside `read-pos` but is not part of the
    ;; returned value.
    (loop [remaining (seq ops)
           ref-pos 0
           read-pos 0
           acc (transient [])]
      (let [[len op] (first remaining)
            more (next remaining)]
        (if-not (and len op)
          (persistent! acc)
          (case op
            (\M \= \X)
            (recur more
                   (+ len ref-pos)
                   (+ len read-pos)
                   (concat! acc (map (fn [i] [:m i])
                                     (range read-pos (+ len read-pos)))))

            \D
            (recur more
                   (+ ref-pos len)
                   read-pos
                   (concat! (update-last! acc (fn [x] [:d x len]))
                            (repeat len [:m \*])))

            \N
            (recur more
                   (+ ref-pos len)
                   read-pos
                   (concat! acc (repeat len [:m \>])))

            \S
            (recur more ref-pos (+ read-pos len) acc)

            \I
            (recur more
                   ref-pos
                   (+ read-pos len)
                   (update-last! acc
                                 (fn [x] [:i x [read-pos (+ len read-pos)]])))))))))
(def to-index
  "Converts from CIGAR string to sequence of indices. This is `to-index*`
  wrapped in a `clojure.core.memoize/lu` cache. The cache threshold is taken
  from the `cljam.sam.cigar.cache-size` system property when
  `proton/as-int` yields a value for it, and is 1024 otherwise."
  (let [configured (proton/as-int
                    (System/getProperty "cljam.sam.cigar.cache-size"))
        threshold (or configured 1024)]
    (memoize/lu to-index* :lu/threshold threshold)))
(defn count-op
  "Returns the number of operations contained in the CIGAR string `s`."
  ^long
  [^String s]
  (-> s parse count))
(defn- count-ref-str*
  "Sums the lengths of the operations in the CIGAR string `s` whose op is one
  of `M`, `D`, `N`, `=` or `X`. Returns 0 when there are none."
  [^String s]
  (apply +
         (for [cig (parse s)
               :when (#{\M \D \N \= \X} (peek cig))]
           (first cig))))
;; Memoized `count-ref-str*`. The cache is bounded in the same way as the one
;; behind `to-index` (same system property, same default of 1024): a plain
;; `memoize` keeps an entry for every distinct CIGAR string ever seen, so its
;; memory use grows without limit.
(def ^:private count-ref-str
  (memoize/lu count-ref-str* :lu/threshold
              (or (proton/as-int (System/getProperty "cljam.sam.cigar.cache-size"))
                  1024)))
(defn count-ref-bytes
  "Counts covering length in reference from encoded CIGAR byte-array.

  `cigar-bytes` is read as a sequence of little-endian 32-bit words. The low
  4 bits of each word are the operation code and the remaining upper bits are
  the (unsigned) length. Lengths of operations with code 0, 2, 3, 7 or 8
  (`M`, `D`, `N`, `=`, `X`) are summed; all other operations contribute 0.

  Returns the sum as a long; an empty array yields 0."
  ^long
  [^bytes cigar-bytes]
  ;; The type hints keep `ByteBuffer/wrap` and the buffer calls in the loop
  ;; from being resolved reflectively on every invocation.
  (let [^ByteBuffer buf (ByteBuffer/wrap cigar-bytes)]
    (.order buf ByteOrder/LITTLE_ENDIAN)
    (loop [ref-length 0]
      (if (.hasRemaining buf)
        (let [b (Integer/toUnsignedLong (.getInt buf))
              op (bit-and b 0xF)
              n (unsigned-bit-shift-right b 4)]
          (recur (+ ref-length (case op
                                 (0 2 3 7 8) n
                                 0))))
        ref-length))))
(defn decode-cigar-and-ref-length
  "Decodes CIGAR string and length of alignment in reference.

  `cigar-bytes` is read as a sequence of little-endian 32-bit words. The low
  4 bits of each word are the operation code (0-8 mapping to
  `M I D N S H P = X`) and the remaining upper bits are the (unsigned)
  length.

  Returns a vector of `[cigar ref-length]`, where `cigar` is the textual
  CIGAR and `ref-length` is the sum of the lengths of the `M`, `D`, `N`, `=`
  and `X` operations. An operation code outside 0-8 makes the `case` throw."
  [^bytes cigar-bytes]
  ;; The type hints keep `ByteBuffer/wrap` and the buffer calls in the loop
  ;; from being resolved reflectively on every invocation.
  (let [^ByteBuffer buf (ByteBuffer/wrap cigar-bytes)
        sb (StringBuilder.)]
    (.order buf ByteOrder/LITTLE_ENDIAN)
    (loop [ref-length 0]
      (if (.hasRemaining buf)
        (let [b (Integer/toUnsignedLong (.getInt buf))
              op (bit-and b 0xF)
              n (unsigned-bit-shift-right b 4)]
          (doto sb
            (.append n)
            ;; `char` selects the StringBuilder.append(char) overload.
            (.append (char (case op 0 \M 1 \I 2 \D 3 \N 4 \S 5 \H 6 \P 7 \= 8 \X))))
          (recur (+ ref-length (case op
                                 (0 2 3 7 8) n
                                 0))))
        [(.toString sb) ref-length]))))
(defn placeholder?
  "Returns a boolean indicating whether an encoded CIGAR is in the `kSmN`
  placeholder format.

  True only when `cigar-bytes` is exactly 8 bytes long, the low 4 bits of
  byte 0 equal 4 (`S`) and the low 4 bits of byte 4 equal 3 (`N`)."
  [^bytes cigar-bytes]
  (if (not= 8 (alength cigar-bytes))
    false
    (let [first-op (bit-and 0xF (aget cigar-bytes 0))
          second-op (bit-and 0xF (aget cigar-bytes 4))]
      (and (= 4 first-op)     ;; S
           (= 3 second-op))))) ;; N
(defn encode-cigar
  "Encodes the CIGAR string `cigar` into a vector of longs.

  Each operation becomes one number: the length shifted left by 4 bits,
  OR-ed with the operation code (`M`=0, `I`=1, `D`=2, `N`=3, `S`=4, `H`=5,
  `P`=6, `=`=7, `X`=8)."
  [cigar]
  (let [pack (fn [[len op]]
               (let [code (case op
                            \M 0 \I 1 \D 2 \N 3 \S 4 \H 5 \P 6 \= 7 \X 8)]
                 (bit-or (bit-shift-left (long len) 4) code)))]
    (into [] (map pack) (parse cigar))))
(defmulti count-ref
  "Returns length of reference bases. Dispatches on the class of the
  argument: a CIGAR `String` or an encoded CIGAR byte array."
  class)

(defmethod count-ref String
  [cigar-str]
  (count-ref-str cigar-str))

(defmethod count-ref (class (byte-array 0))
  [cigar-bytes]
  (count-ref-bytes cigar-bytes))
(defn ->placeholder
  "Creates an encoded placeholder from a given CIGAR string.

  The placeholder is in the format of `kSmN`, where `k` is the read length
  (sum of `M`, `I`, `S`, `=`, `X` lengths) and `m` is the reference length
  (sum of `M`, `D`, `N`, `=`, `X` lengths). Returns a two-element vector of
  integers: the encoded `kS` followed by the encoded `mN`, each being the
  length shifted left by 4 bits OR-ed with the op code (4 for `S`, 3 for
  `N`)."
  [cigar-str]
  (let [ref-ops #{\M \D \N \= \X}
        read-ops #{\M \I \S \= \X}
        accumulate (fn [[ref-len read-len] [n op]]
                     (let [len (long n)]
                       [(+ (long ref-len) (if (contains? ref-ops op) len 0))
                        (+ (long read-len) (if (contains? read-ops op) len 0))]))
        [ref-len read-len] (reduce accumulate [0 0] (parse cigar-str))]
    [(bit-or (bit-shift-left (long read-len) 4) 4)
     (bit-or (bit-shift-left (long ref-len) 4) 3)]))