Functions to read the bigWig format. See https://genome.ucsc.edu/goldenpath/help/bigWig.html and https://github.com/ucscGenomeBrowser/kent for the detail bigWig specifications.

(ns cljam.io.bigwig
  (:require [clojure.java.io :as cio]
            [cljam.io.protocols :as protocols]
            [cljam.io.util.lsb.data-io :as lsb]
            [cljam.util :as util])
  (:import [java.net URL]
           [java.io Closeable IOException RandomAccessFile]
           [java.nio ByteBuffer ByteOrder]
           [java.util.zip Inflater])
  (:refer-clojure :exclude [name]))
(def ^:private bigwig-magic 0x888ffc26)
(def ^:private bpt-magic 0x78CA8C91)
(def ^:private cir-tree-magic 0x2468ACE0)
(declare read-tracks read-all-headers read-bbi-chrom-info read-cir-tree)
(defrecord FixedWidthHeader [magic version zoom-levels chromosome-tree-offset
                             full-data-offset full-index-offset
                             total-summary-offset uncompress-buf-size
                             extension-offset])
(defrecord ZoomHeader [reduction-level data-offset index-offset])
(defrecord TotalSummary [bases-covered min-val max-val sum-data sum-squared])
(defrecord ExtendedHeader [extension-size extra-index-count
                           extra-index-list-offset])
(defrecord BptHeader [block-size key-size val-size item-count root-offset])
(defrecord BbiChromInfo [name id size])
(defrecord CirTree [block-size item-count start-chrom-ix start-base
                    end-chrom-ix end-base file-size items-per-slot
                    root-offset])
(defrecord BigWigHeaders [^FixedWidthHeader fixed-width-header
                          zoom-headers
                          ^TotalSummary total-summary
                          ^ExtendedHeader extended-header
                          ^BptHeader bpt-header
                          bbi-chrom-info
                          ^CirTree cir-tree])
(defrecord BIGWIGReader [^RandomAccessFile reader ^URL url ^BigWigHeaders headers]
  Closeable
  (close [this]
    (.close ^Closeable (.reader this)))
  protocols/IReader
  (reader-url [this] (.url this))
  (read [this] (read-tracks this))
  (read [this _] (read-tracks this))
  (indexed? [_] false))

Returns an open cljam.io.bigwig.BIGWIGReader of f. Should be used inside with-open to ensure the reader is properly closed.

(defn reader
  ^BIGWIGReader
  [f]
  (let [f (.getAbsolutePath (cio/file f))
        rdr (RandomAccessFile. f "r")
        headers (read-all-headers rdr)]
    (BIGWIGReader. rdr (util/as-url f) headers)))

Checks if the magic is right for bigWig format. Otherwise, throws IOException.

(defn- check-bigwig-magic
  [^long uint]
  (when-not (= uint bigwig-magic)
    (throw (IOException. "Invalid bigWig magic"))))

Ranged from [1,4]. Throws IOException if the version is out of range.

(defn- check-version
  [^long ushort]
  (when-not (<= 1 ushort 4)
    (throw (IOException. "Invalid bigWig version"))))

For bigWig 0. Throws IOException if the fieldCount is invalid.

(defn- check-field-count
  [^long ushort]
  (when-not (zero? ushort)
    (throw (IOException. "Invalid bigWig fieldCount"))))

For bigWig 0. Throws IOException if the definedFieldCount is invalid.

(defn- check-defined-field-count
  [^long ushort]
  (when-not (zero? ushort)
    (throw (IOException. "Invalid bigWig definedFieldCount"))))

For bigWig 0. Throws IOException if the autoSqlOffset is invalid.

(defn- check-auto-sql-offset
  [^long auto-sql-offset]
  (when-not (zero? auto-sql-offset)
    (throw (IOException. "Invalid bigWig autoSqlOffset"))))

Returns a FixedWidthHeader.

(defn- read-fixed-width-header
  [^RandomAccessFile r]
  (let [magic (lsb/read-uint r)
        version (lsb/read-ushort r)
        zoom-levels (lsb/read-ushort r)
        chromosome-tree-offset (lsb/read-long r)
        full-data-offset (lsb/read-long r)
        full-index-offset (lsb/read-long r)
        field-count (lsb/read-ushort r)
        defined-field-count (lsb/read-ushort r)
        auto-sql-offset (lsb/read-long r)
        total-summary-offset (lsb/read-long r)
        uncompress-buf-size (lsb/read-uint r)
        extension-offset (lsb/read-long r)]
    (check-bigwig-magic magic)
    (check-version version)
    (check-field-count field-count)
    (check-defined-field-count defined-field-count)
    (check-auto-sql-offset auto-sql-offset)
    (FixedWidthHeader.
     magic version zoom-levels chromosome-tree-offset full-data-offset
     full-index-offset total-summary-offset uncompress-buf-size
     extension-offset)))

Returns a vector of ZoomHeader from reader.

(defn- read-zoom-headers
  [^RandomAccessFile r {:keys [zoom-levels]}]
  (letfn [(read-zoom-header [^long n acc]
            (if (zero? n)
              acc
              (let [reduction-level (lsb/read-uint r)
                    _reserved (lsb/read-uint r)
                    data-offset (lsb/read-long r)
                    index-offset (lsb/read-long r)]
                (recur (dec n)
                       (conj acc
                             (ZoomHeader.
                              reduction-level data-offset index-offset))))))]
    (read-zoom-header zoom-levels [])))

Returns a totalSummay. If it isn't present, returns nil.

(defn- read-total-summary
  [^RandomAccessFile r {:keys [^long total-summary-offset]}]
  (when-not (zero? total-summary-offset)
    (.seek r total-summary-offset)
    (let [bases-covered (lsb/read-long r)
          min-val (lsb/read-double r)
          max-val (lsb/read-double r)
          sum-data (lsb/read-double r)
          sum-squared (lsb/read-double r)]
      (TotalSummary.
       bases-covered min-val max-val sum-data sum-squared))))

Returns an extendedHeader. It it isn't present, returns nil.

(defn- read-extended-header
  [^RandomAccessFile r {:keys [extension-offset]}]
  (when-not (zero? (long extension-offset))
    (.seek r extension-offset)
    (let [extension-size (lsb/read-ushort r)
          extra-index-count (lsb/read-ushort r)
          extra-index-list-offset (lsb/read-long r)]
      (ExtendedHeader.
       extension-size extra-index-count extra-index-list-offset))))

Checks if the magic is right for bpt format. Otherwise, throws IOException.

(defn- check-bpt-magic
  [uint]
  (when-not (= uint bpt-magic)
    (throw (IOException. "Invalid bpt magic"))))

Returns B+ tree data.

(defn- read-bpt-header
  [^RandomAccessFile r {:keys [chromosome-tree-offset]}]
  (.seek r chromosome-tree-offset)
  (let [magic (lsb/read-uint r)
        block-size (lsb/read-uint r)
        key-size (lsb/read-uint r)
        val-size (lsb/read-uint r)
        item-count (lsb/read-long r)]
    (check-bpt-magic magic)
    (lsb/skip r 8)
    (let [root-offset (.getFilePointer r)]
      (BptHeader.
       block-size key-size val-size item-count root-offset))))

Returns the all headers of bigWig format.

(defn- read-all-headers
  [^RandomAccessFile r]
  (.seek r 0)
  (let [fixed-width-header (read-fixed-width-header r)
        zoom-headers (read-zoom-headers r fixed-width-header)
        total-summary (read-total-summary r fixed-width-header)
        extended-header (read-extended-header r fixed-width-header)
        bpt-header (read-bpt-header r fixed-width-header)
        bbi-chrom-info (read-bbi-chrom-info r bpt-header)
        cir-tree (read-cir-tree r fixed-width-header)]
    (BigWigHeaders.
     fixed-width-header zoom-headers total-summary extended-header bpt-header
     bbi-chrom-info cir-tree)))

Reads length bytes and returns a string. This function is useful for handling c-style null-terminated string.

(defn- read-c-style-string
  [^RandomAccessFile r ^long length]
  (->> (lsb/read-bytes r length)
       (reduce
        (fn [cs c]
          (if (zero? (byte c))
            (reduced cs)
            (conj cs c)))
        [])
       (map char)
       (apply str)))

Returns the BbiChromInfo data of leaves.

(defn- read-bbi-chrom-info-leaves
  [^RandomAccessFile r key-size child-count]
  (repeatedly child-count
              (fn []
                (let [name (read-c-style-string r key-size)
                      id (lsb/read-uint r)
                      size (lsb/read-uint r)]
                  (BbiChromInfo.
                   name id size)))))

Skips offsets and returns the file offsets of children.

(defn- read-bbi-chrom-info-file-offsets
  [^RandomAccessFile r key-size child-count]
  (repeatedly child-count
              (fn []
                (lsb/skip r key-size)
                (lsb/read-long r))))

Returns a sequence of BbiChromInfo data.

(defn- read-bbi-chrom-info
  [^RandomAccessFile r {:keys [key-size root-offset]}]
  (letfn [(traverse [block-start]
            (.seek r block-start)
            (let [leaf? (-> r lsb/read-ubyte short zero? not)
                  _reversed (lsb/read-ubyte r)
                  child-count (lsb/read-ushort r)]
              (if leaf?
                (doall (read-bbi-chrom-info-leaves r key-size child-count))
                (let [file-offsets (read-bbi-chrom-info-file-offsets r
                                                                     key-size
                                                                     child-count)]
                  (map traverse file-offsets)))))]
    (vec (traverse root-offset))))

Checks if the magic is right for chromosome id r-tree index format. Otherwise, throws IOException.

(defn- check-cir-tree-magic
  [uint]
  (when-not (= uint cir-tree-magic)
    (throw (IOException. "Invalid cir-tree magic"))))

Returns a CirTree data.

(defn- read-cir-tree
  [^RandomAccessFile r {:keys [full-index-offset]}]
  (.seek r full-index-offset)
  (let [magic (lsb/read-uint r)
        block-size (lsb/read-uint r)
        item-count (lsb/read-long r)
        start-chrom-ix (lsb/read-uint r)
        start-base (lsb/read-uint r)
        end-chrom-ix (lsb/read-uint r)
        end-base (lsb/read-uint r)
        file-size (lsb/read-long r)
        items-per-slot (lsb/read-uint r)]
    (check-cir-tree-magic magic)
    (lsb/skip r 4)
    (let [root-offset (.getFilePointer r)]
      (CirTree.
       block-size item-count start-chrom-ix start-base end-chrom-ix
       end-base file-size items-per-slot root-offset))))

Returns true if the given blocks are overlapped.

(defn- cir-tree-overlaps?
  [id start end start-chrom-ix start-base end-chrom-ix end-base]
  (letfn [(cmp ^long [^long a-hi ^long a-lo ^long b-hi ^long b-lo]
            (cond
              (< a-hi b-hi) 1
              (> a-hi b-hi) -1
              (< a-lo b-lo) 1
              (> a-lo b-lo) -1
              :else 0))]
    (and (pos? (long (cmp id start end-chrom-ix end-base)))
         (neg? (long (cmp id end start-chrom-ix start-base))))))

Converts CirTree leaves into blocks that contain a flat map including offset and size.

(defn- cir-tree-leaves->blocks
  [^RandomAccessFile r id start end child-count]
  (->> (repeatedly child-count
                   (fn []
                     (let [start-chrom-ix (lsb/read-uint r)
                           start-base (lsb/read-uint r)
                           end-chrom-ix (lsb/read-uint r)
                           end-base (lsb/read-uint r)
                           offset (lsb/read-long r)
                           size (lsb/read-long r)]
                       (when (cir-tree-overlaps?
                              id start end start-chrom-ix
                              start-base end-chrom-ix end-base)
                         {:offset offset, :size size}))))
       (remove nil?)))

Returns a sequence that contains overlapping blocks describing CirTree leaves.

(defn- fetch-overlapping-blocks
  [^RandomAccessFile r id start end {:keys [root-offset]}]
  (letfn [(make-blocks [index-file-offset]
            (.seek r index-file-offset)
            (let [leaf? (-> r lsb/read-ubyte short zero? not)
                  _reserved (lsb/read-ubyte r)
                  child-count (lsb/read-ushort r)]
              (if leaf?
                (doall (cir-tree-leaves->blocks r id start end child-count))
                (->> (repeatedly child-count
                                 (fn []
                                   (let [start-chrom-ix (lsb/read-uint r)
                                         start-base (lsb/read-uint r)
                                         end-chrom-ix (lsb/read-uint r)
                                         end-base (lsb/read-uint r)
                                         offset (lsb/read-long r)]
                                     {:start-chrom-ix start-chrom-ix
                                      :start-base start-base
                                      :end-chrom-ix end-chrom-ix
                                      :end-base end-base
                                      :offset offset})))
                     doall
                     (sequence
                      (comp
                       (filter (fn [{:keys [start-chrom-ix start-base
                                            end-chrom-ix end-base]}]
                                 (cir-tree-overlaps? id start end
                                                     start-chrom-ix start-base
                                                     end-chrom-ix end-base)))
                       (mapcat (fn [{:keys [offset]}] (make-blocks offset)))))))))]
    (make-blocks root-offset)))

Returns a sequence of blocks that describe overlapping chrom range.

(defn- fetch-overlapping-blocks-group
  [^RandomAccessFile r ^BbiChromInfo chrom-info ^CirTree cir-tree]
  (fetch-overlapping-blocks r (:id chrom-info) 0 (:size chrom-info) cir-tree))

Returns a range intersection of two ranges that include start and end.

(defn- range-intersection
  ^long [a b]
  (- (min (long (:end a)) (long (:end b)))
     (max (long (:start a)) (long (:start b)))))

Converts bigWig tracks into BedGraph format (1-based, fully-closed).

(defn- ->bedgraph
  [^ByteBuffer bb track-start track-end item-count chrom rng]
  (->> (repeatedly item-count
                   (fn []
                     (let [start (.getInt bb)
                           end (.getInt bb)
                           value (.getFloat bb)]
                       (when (pos? (range-intersection rng {:start start :end end}))
                         {:track {:line nil :chr chrom :start track-start
                                  :end track-end}
                          :chr chrom :start (inc start) :end end :value value}))))
       (remove nil?)
       doall))

Converts bigWig tracks into variableStep tracks of wig format (1-start, fully-closed).

(defn- ->variable-step
  [^ByteBuffer bb item-span item-count chrom rng]
  (let [item-span (long item-span)]
    (->> (repeatedly
          item-count
          (fn []
            (let [start (.getInt bb)
                  value (.getFloat bb)]
              (when (pos? (range-intersection
                           rng {:start start, :end (+ start item-span)}))
                {:track {:line nil :format :variable-step :chr chrom
                         :step nil :span item-span}
                 :chr chrom :start (inc start)
                 :end (+ start item-span) :value value}))))
         (remove nil?)
         doall)))

Converts bigWig tracks into fixedStep tracks of wig format (1-start, fully-closed).

(defn- ->fixed-step
  [^ByteBuffer bb start item-step item-span item-count chrom rng]
  (reduce
   (fn [acc ^long i]
     (let [value (.getFloat bb)
           cur-start (+ (long start) (* i (long item-step)))
           cur-end (+ cur-start (long item-span))]
       (if (pos? (range-intersection rng {:start cur-start :end cur-end}))
         (conj acc {:track {:line nil :format :fixed-step :chr chrom
                            :step item-step :span item-span}
                    :chr chrom :value value :start (inc cur-start) :end cur-end})
         acc)))
   []
   (range item-count)))

Reads blocks according to BbiChromInfo data and returns tracks described Wig or BedGraph format.

(defn- read-blocks
  [^RandomAccessFile r ^BbiChromInfo chrom-info {:keys [fixed-width-header cir-tree]}]
  (let [blocks (fetch-overlapping-blocks-group r chrom-info cir-tree)
        rng {:start 0, :end (:size chrom-info)}]
    (->> blocks
         (map
          (fn [{:keys [offset size]}]
            (.seek r offset)
            (lsb/read-bytes r size)))
         (map
          (fn [^bytes block]
            (let [inf (doto (Inflater.)
                        (.setInput block))
                  ba (byte-array (:uncompress-buf-size fixed-width-header))
                  uncompress-size (.inflate inf ba)]
              (.end inf)
              (doto (ByteBuffer/wrap ba 0 uncompress-size)
                (.order
                 (case (:magic fixed-width-header)
                   0x888ffc26 ByteOrder/LITTLE_ENDIAN
                   0x26fc8f88 ByteOrder/BIG_ENDIAN))))))
         (mapcat
          (fn [^ByteBuffer bb]
            (let [_chrom-id (.getInt bb)
                  start (.getInt bb)
                  end (.getInt bb)
                  item-step (.getInt bb)
                  item-span (.getInt bb)
                  typ (int (.get bb))
                  _reserved (int (.get bb))
                  item-count (.getShort bb)]
              (case typ
                1 (->bedgraph bb start end item-count (:name chrom-info) rng)
                2 (->variable-step bb item-span item-count (:name chrom-info) rng)
                3 (->fixed-step bb start item-step item-span item-count
                                (:name chrom-info) rng)
                (throw (IOException.
                        "Invalid type of bigWig section header.")))))))))

Reads a bigWig tracks from reader and returns a sequence of tracks representing Wig (fixedStep or variableStep) or BedGraph format.

(defn read-tracks
  [^BIGWIGReader rdr]
  (let [r ^RandomAccessFile (.reader rdr)]
    (mapcat (fn [chrom-info]
              (read-blocks r chrom-info (.headers rdr)))
            (:bbi-chrom-info (.headers rdr)))))