;; Alpha - subject to change. Reader of a TABIX format file.
(ns cljam.io.tabix
(:require [cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb.data-io :as lsb]
[cljam.io.util.bin :as util-bin]
[clojure.string :as cstr])
(:import java.util.Arrays
[java.io DataInputStream IOException]
[cljam.io.util.chunk Chunk])
(:refer-clojure :exclude [meta seq])) |
;; Fixed parameters of the binning index exposed by the Tabix type below:
;; `linear-index-shift` is what `get-min-shift` returns (and the shift handed
;; to `util-bin/pos->lidx-offset`), `linear-index-depth` is what `get-depth`
;; returns.
(def ^:private ^:const linear-index-shift
  14)

(def ^:private ^:const linear-index-depth
  5)
;; In-memory representation of a parsed tabix index.
;;
;; Fields, in the order `read-index*` fills them:
;;   n-ref, preset, sc, bc, ec, meta, skip - header integers stored as read
;;     (presumably the reference count, format preset and column settings;
;;     confirm against the tabix specification)
;;   seq  - vector of names returned by `read-seq`
;;   bidx - map of reference index -> {bin number -> chunks}
;;   lidx - map of reference index -> vector of linear-index offsets
(deftype Tabix [n-ref preset sc bc ec meta skip seq bidx lidx]
  util-bin/IBinningIndex
  ;; Concatenates, in `bins` order, the chunks recorded for each requested bin
  ;; of reference `ref-idx`. Bins with no entry contribute nothing. A reference
  ;; that has no bin index at all yields an empty vector (the `{}` default)
  ;; instead of calling nil as a function, matching how `get-min-offset`
  ;; tolerates unknown references.
  (get-chunks [_ ref-idx bins]
    (vec (mapcat (get bidx ref-idx {}) bins)))
  ;; Looks up the linear-index entry covering `beg` for reference `ref-idx`;
  ;; falls back to 0 when the reference or the slot is absent.
  (get-min-offset [_ ref-idx beg]
    (get (get lidx ref-idx)
         (util-bin/pos->lidx-offset beg linear-index-shift)
         0))
  ;; Constant shift used by this index.
  (get-min-shift [_]
    linear-index-shift)
  ;; Constant depth used by this index.
  (get-depth [_]
    linear-index-depth)
  ;; Names stored in the index, as parsed by `read-seq`.
  (get-chr-names [_]
    seq))
;; Signature that `read-index*` compares against the first four bytes of the
;; stream: the characters "TBI" followed by a character of value 1 (written
;; here as the octal escape \1).
(def ^:private tabix-magic
  "TBI\1")
(defn- read-chunks!
  "Reads an int count from `rdr`, then that many chunks, and returns them as
  a vector. Each chunk is built from two longs read consecutively from `rdr`,
  passed to the `Chunk` constructor in the order they were read."
  [rdr]
  (let [n-chunks (lsb/read-int rdr)
        next-chunk (fn []
                     (let [fst (lsb/read-long rdr)
                           snd (lsb/read-long rdr)]
                       (Chunk. fst snd)))]
    (vec (repeatedly n-chunks next-chunk))))
(defn- read-seq
  "Splits the NUL-delimited name block `buf` into a vector of name strings.

  The bytes are decoded explicitly as UTF-8 rather than with the JVM's
  platform-default charset, so the same index yields the same names on every
  machine. Trailing empty strings (e.g. after the final NUL terminator) are
  dropped by the split."
  [^bytes buf]
  (cstr/split (String. buf java.nio.charset.StandardCharsets/UTF_8) #"\00"))
(defn- read-bin-index
  "Reads an int bin count from `rdr`, then for each bin an int bin number
  followed by its chunks (via `read-chunks!`). Returns a vector of maps with
  keys `:bin` and `:chunks`, in the order they were read."
  [rdr]
  (let [n-bins (lsb/read-int rdr)
        next-bin (fn []
                   ;; bind explicitly so the bin number is read before its chunks
                   (let [bin (lsb/read-int rdr)
                         chunks (read-chunks! rdr)]
                     (hash-map :bin bin :chunks chunks)))]
    (vec (repeatedly n-bins next-bin))))
(defn- read-linear-index
  "Reads an int entry count from `rdr`, then that many longs, and returns the
  longs as a vector in the order they were read."
  [rdr]
  (let [n-entries (lsb/read-int rdr)
        next-offset (fn [] (lsb/read-long rdr))]
    (vec (repeatedly n-entries next-offset))))
(defn- read-index*
  "Parses a tabix index from `rdr` and returns a `Tabix`.

  Layout consumed, in order: a 4-byte magic that must equal `tabix-magic`;
  seven ints (n-ref, preset, sc, bc, ec, meta, skip); an int length followed
  by that many bytes of NUL-delimited names; then, for each of the n-ref
  references, a bin index followed by a linear index.

  Throws `IOException` when the magic does not match."
  [^DataInputStream rdr]
  (when-not (Arrays/equals ^bytes (lsb/read-bytes rdr 4) (.getBytes ^String tabix-magic))
    (throw (IOException. "Invalid TABIX file")))
  (let [n-ref (lsb/read-int rdr)
        preset (lsb/read-int rdr)
        sc (lsb/read-int rdr)
        bc (lsb/read-int rdr)
        ec (lsb/read-int rdr)
        meta (lsb/read-int rdr)
        skip (lsb/read-int rdr)
        len (lsb/read-int rdr)
        buf (lsb/read-bytes rdr len)
        seq (read-seq buf)
        refs (range n-ref)
        ;; `mapv` is eager: every per-reference index is read from the stream
        ;; right here, in reference order, rather than whenever a downstream
        ;; consumer happens to realize a lazy sequence.
        all-idx (mapv (fn [_] [(read-bin-index rdr) (read-linear-index rdr)]) refs)
        ;; reference index -> {bin number -> chunks}
        bidx (zipmap refs
                     (map (fn [bins]
                            (into {} (map (juxt :bin :chunks)) bins))
                          (map first all-idx)))
        ;; reference index -> vector of linear-index offsets
        lidx (zipmap refs (map second all-idx))]
    (->Tabix n-ref preset sc bc ec meta skip seq bidx lidx)))
(defn read-index
  "Reads tabix and returns Tabix object.

  `f` is handed to `bgzf/bgzf-input-stream`; the resulting stream is wrapped
  in a `DataInputStream`, parsed by `read-index*`, and closed before this
  function returns. An `IOException` is thrown when the data does not start
  with the tabix magic."
  [f]
  (with-open [rdr (-> f bgzf/bgzf-input-stream DataInputStream.)]
    (read-index* rdr)))