Utility functions for base sequences. | (ns cljam.io.sam.util.sequence (:require [clojure.string :as cstr]) (:import [java.nio Buffer ByteBuffer CharBuffer])) |
(def ^:private ^:const nibble-to-base-table
  ;; Lookup string for 4-bit base codes.
  ;; Index: nibble of a compressed base (0-15).
  ;; Value: base character for that nibble.
  ;; Each comment sits on its own line so that it cannot swallow the literal
  ;; or the closing paren of the form.
  "=ACMGRSVTWYHKDBN")
(def ^:private two-bytes-to-compressed-bases-table
  ;; 2^14-entry byte table that packs two 7-bit ASCII bases into one byte.
  ;; Index: two bases (A,C) => ASCII (65,67) => 2r 1000001 1000011 => 8387
  ;; Value: two bases (A,C) => nibbles (1,2) => 2r 0001 0010 => 18
  (let [;; ASCII code => nibble; anything that is not a known base gets 15,
        ;; the nibble of N.
        nibble-of (byte-array (bit-shift-left 1 7) (byte 15))
        packed (byte-array (bit-shift-left 1 14))]
    ;; Register every base under both its upper- and lower-case ASCII code.
    (dotimes [nibble (count nibble-to-base-table)]
      (let [base (.charAt nibble-to-base-table nibble)]
        (aset nibble-of (int base) (byte nibble))
        (aset nibble-of (int (Character/toLowerCase base)) (byte nibble))))
    ;; Upper 7 bits of the index select the first base, lower 7 bits the second.
    (dotimes [idx (alength packed)]
      (let [hi (aget nibble-of (bit-and 0x7F (unsigned-bit-shift-right idx 7)))
            lo (aget nibble-of (bit-and 0x7F idx))]
        (aset packed idx (unchecked-byte (bit-or (bit-shift-left hi 4) lo)))))
    packed))
(defn str->compressed-bases
  "Creates a byte array of compressed bases from the ASCII base string `s`.

  Two bases are packed into each output byte, the first base in the upper
  nibble and the second in the lower nibble. When `s` holds an odd number of
  bases, the last lower nibble is filled with the code for `=`. The result has
  (quot (inc n) 2) bytes for n input characters; an empty string yields an
  empty array.

  The string is encoded as US-ASCII regardless of the platform default
  charset, so every character contributes exactly one byte. Characters outside
  ASCII are replaced by `?`, which the lookup table treats like any other
  unknown base (nibble 15, i.e. N)."
  ^bytes [^String s]
  (let [b (.getBytes s java.nio.charset.StandardCharsets/US_ASCII)
        length (alength b)
        result-len (quot (inc length) 2)
        in-bb (ByteBuffer/wrap b)
        out-bb (ByteBuffer/allocate result-len)]
    (dotimes [_ result-len]
      (let [u (.get in-bb)
            ;; Odd-length input: pad the missing second base with `=`.
            l (byte (if (.hasRemaining in-bb) (.get in-bb) (byte (int \=))))
            ;; 14-bit table index: first base in bits 13-7, second in bits 6-0.
            idx (bit-or (bit-shift-left (bit-and 0x7F u) 7)
                        (bit-and 0x7F l))]
        (.put out-bb (aget ^bytes two-bytes-to-compressed-bases-table idx))))
    (.array out-bb)))
(def ^:const ^:private compressed-bases-to-bases-table
  ;; Lookup string holding the two bases of every possible compressed byte.
  ;; Index: compressed base n containing two nibbles => 2n
  ;; Value 2n+0: base for upper nibble of n.
  ;; Value 2n+1: base for lower nibble of n.
  (apply str
         (for [upper nibble-to-base-table
               lower nibble-to-base-table]
           (str upper lower))))
(defn compressed-bases->str
  "Decodes a base sequence from a byte array of compressed bases to a String.

  Reads (quot (inc length) 2) bytes of `compressed-bases` starting at index
  `compressed-offset`. Each byte expands to two bases, upper nibble first.
  The result is cut to exactly `length` characters, so for an odd `length`
  the lower nibble of the last byte is discarded."
  [^long length ^bytes compressed-bases ^long compressed-offset]
  (let [dst (CharBuffer/allocate (inc length))
        src (ByteBuffer/wrap compressed-bases)
        n-bytes (quot (inc length) 2)]
    (.position ^Buffer src compressed-offset)
    (loop [k 0]
      (when (< k n-bytes)
        ;; Treat the byte as unsigned; entry 2n / 2n+1 hold its two bases.
        (let [hi-idx (* 2 (bit-and (.get src) 0xff))
              lo-idx (inc hi-idx)]
          (.put dst (.charAt compressed-bases-to-bases-table hi-idx))
          (.put dst (.charAt compressed-bases-to-bases-table lo-idx)))
        (recur (inc k))))
    ;; Clamp to `length` chars (drops the padding base of an odd-length
    ;; sequence), then flip so the buffer reads from the start.
    (.limit ^Buffer dst length)
    (.flip ^Buffer dst)
    (str dst)))
(defn normalize-bases
  "Converts bases in the given buffer to upper-case. Also converts '.' to 'N'.
  Bases are represented as a buffer of ASCII characters.

  The array `bases'` is modified in place and the same array is returned.
  Bytes that are neither '.' nor an ASCII lower-case letter are left as is."
  ^bytes [^bytes bases']
  (let [dot (byte (int \.))
        upper-n (byte (int \N))
        lower-a (byte (int \a))
        lower-z (byte (int \z))
        n (alength bases')]
    (loop [idx 0]
      (when (< idx n)
        (let [b (aget bases' idx)]
          (cond
            (= b dot) (aset bases' idx upper-n)
            ;; ASCII lower-case and upper-case letters are 32 apart.
            (<= lower-a b lower-z) (aset bases' idx (byte (- b 32)))))
        (recur (inc idx))))
    bases'))