Utility functions for base sequences. | (ns cljam.io.sam.util.sequence (:require [clojure.string :as cstr]) (:import [java.nio Buffer ByteBuffer CharBuffer])) |
;; Lookup string from 4-bit base code to base character.
;; The position in the string is the nibble value (0-15) of a compressed base;
;; the character at that position is the base it stands for.
(def ^{:private true, :const true} nibble-to-base-table
  "=ACMGRSVTWYHKDBN")
;; Precomputed lookup table that packs two ASCII bases into one byte.
;;
;; Index: the two 7-bit ASCII codes concatenated, first base in the high bits.
;;        e.g. (A,C) => ASCII (65,67) => 2r 1000001 1000011 => 8387
;; Value: the two 4-bit base codes, first base in the upper nibble.
;;        e.g. (A,C) => nibbles (1,2) => 2r 0001 0010 => 18
(def ^:private two-bytes-to-compressed-bases-table
  (let [;; ASCII code -> nibble. Every slot starts at 15 (the nibble of `N`),
        ;; so any character missing from nibble-to-base-table encodes as N.
        nibble-of (byte-array (bit-shift-left 1 7) (byte 15))
        packed    (byte-array (bit-shift-left 1 14))]
    ;; Register each known base under both its upper- and lower-case code.
    (doseq [[idx base] (map-indexed vector nibble-to-base-table)]
      (let [nibble (byte idx)]
        (aset nibble-of (int base) nibble)
        (aset nibble-of (int (Character/toLowerCase (char base))) nibble)))
    ;; Fill every 14-bit slot: high 7 bits select the upper nibble,
    ;; low 7 bits select the lower nibble.
    (dotimes [pair (alength packed)]
      (let [hi (aget nibble-of (bit-and 0x7F (unsigned-bit-shift-right pair 7)))
            lo (aget nibble-of (bit-and 0x7F pair))]
        (aset packed pair (unchecked-byte (bit-or (bit-shift-left hi 4) lo)))))
    packed))
(defn str->compressed-bases
  "Creates a byte array of compressed bases from an ASCII base sequence.

  Two bases are packed into each output byte: the first base of a pair goes
  into the upper nibble and the second into the lower nibble. When `s` has an
  odd number of bases, the last byte is padded with the nibble of `=`.
  The returned array has (quot (inc (count s)) 2) bytes; an empty string
  yields an empty array.

  `s` is encoded explicitly as US-ASCII so that the result does not depend on
  the JVM's default charset and every character occupies exactly one byte.
  Characters outside ASCII become `?`, which the lookup table packs as `N`."
  ^bytes [^String s]
  (let [b (.getBytes s java.nio.charset.StandardCharsets/US_ASCII)
        length (alength b)
        result-len (quot (inc length) 2)
        in-bb (ByteBuffer/wrap b)
        out-bb (ByteBuffer/allocate result-len)]
    (dotimes [_ result-len]
      (let [u (.get in-bb)
            ;; Pad with `=` when the input runs out on an odd-length sequence.
            l (byte (if (.hasRemaining in-bb) (.get in-bb) (byte (int \=))))]
        ;; Build the 14-bit table index: first base in the high 7 bits,
        ;; second base in the low 7 bits.
        (->> (bit-and 0x7F l)
             (bit-or (bit-shift-left (bit-and 0x7F u) 7))
             (aget ^bytes two-bytes-to-compressed-bases-table)
             (.put out-bb))))
    (.array out-bb)))
;; Lookup string from one compressed byte to its two base characters.
;;
;; Index: a compressed byte n (two nibbles) is looked up at position 2n.
;; Value at 2n+0: base for the upper nibble of n.
;; Value at 2n+1: base for the lower nibble of n.
(def ^{:const true, :private true} compressed-bases-to-bases-table
  (apply str
         (for [upper nibble-to-base-table
               lower nibble-to-base-table]
           (str upper lower))))
(defn compressed-bases->str
  "Decodes a compressed base sequence from a byte array into a String.

  `length` is the number of bases to decode, `compressed-bases` is the array
  holding two bases per byte, and `compressed-offset` is the index in that
  array at which reading starts. Reads (quot (inc length) 2) bytes; when
  `length` is odd, the base from the lower nibble of the last byte is dropped."
  [^long length ^bytes compressed-bases ^long compressed-offset]
  (let [dst (CharBuffer/allocate (inc length))
        src (ByteBuffer/wrap compressed-bases)]
    ;; Position/limit/flip are invoked through java.nio.Buffer via the hints.
    (.position ^Buffer src compressed-offset)
    (loop [remaining (quot (inc length) 2)]
      (when (pos? remaining)
        ;; Treat the byte as unsigned and double it to get the index of the
        ;; first character of its two-base pair in the lookup string.
        (let [pair-idx (bit-shift-left (bit-and (.get src) 0xff) 1)]
          (.put dst (.charAt compressed-bases-to-bases-table pair-idx))
          (.put dst (.charAt compressed-bases-to-bases-table (inc pair-idx))))
        (recur (dec remaining))))
    ;; Cut off the padding base (if any) before making the buffer readable.
    (.limit ^Buffer dst length)
    (.flip ^Buffer dst)
    (.toString dst)))
(defn normalize-bases
  "Upper-cases the bases in the given buffer and rewrites '.' as 'N'.
  The buffer holds bases as ASCII character codes. It is modified in place
  and the same array is returned; all other bytes are left untouched."
  ^bytes [^bytes bases']
  (let [n (alength bases')]
    (loop [idx 0]
      (when (< idx n)
        (let [ch (int (aget bases' idx))]
          (if (== ch (int \.))
            (aset bases' idx (byte (int \N)))
            (when (and (>= ch (int \a)) (<= ch (int \z)))
              ;; Lower- and upper-case ASCII letters are 32 apart.
              (aset bases' idx (byte (- ch 32))))))
        (recur (inc idx)))))
  bases')