(ns cljam.io.twobit.writer
  (:require [clojure.java.io :as cio]
            [cljam.io.protocols :as protocols]
            [cljam.io.util.lsb :as lsb]
            [cljam.util :as util])
  (:import [java.io Closeable OutputStream DataOutputStream BufferedOutputStream FileOutputStream]
           [java.nio ByteBuffer]))
(declare write-sequences)
(deftype TwoBitWriter [url writer file-output-stream index]
  Closeable
  (close [this]
    (.close ^Closeable (.writer this)))
  protocols/IWriter
  (writer-url [this]
    (.url this))
  protocols/ISequenceWriter
  (write-sequences [this seqs]
    (write-sequences this seqs)))

Returns a 2bit writer of f.

(defn writer
  [f {:keys [index]}]
  (let [abs-f (.getAbsolutePath (cio/file f))
        fos (FileOutputStream. abs-f)
        bos (BufferedOutputStream. fos)
        dos (DataOutputStream. bos)]
    (TwoBitWriter. (util/as-url abs-f) dos fos index)))

Writes a 2bit file header. Supports little-endian only.

(defn- write-file-header!
  [w nseq]
  (lsb/write-int w 0x1A412743)
  (lsb/write-int w 0)
  (lsb/write-int w nseq)
  (lsb/write-int w 0))

Returns a sequence of [start length] of masked regions.

(defn- mask-regions
  [^String s]
  (let [len (.length s)]
    (loop [r (transient [])
           p nil
           l nil
           i 0]
      (if (= i len)
        (if p
          (persistent! (conj! r [p l]))
          (persistent! r))
        (if (<= (int \a) (int (.charAt s (int i))))
          (if p
            (recur r p (inc l) (inc i))
            (recur r i 1 (inc i)))
          (if p
            (recur (conj! r [p l]) nil nil (inc i))
            (recur r nil nil (inc i))))))))

Returns a sequence of [start length] of N regions.

(defn- amb-regions
  [^String s]
  (let [len (.length s)]
    (loop [r (transient [])
           p nil
           l nil
           i 0]
      (if (= i len)
        (if p
          (persistent! (conj! r [p l]))
          (persistent! r))
        (if (= \N (.charAt s (int i)))
          (if p
            (recur r p (inc l) (inc i))
            (recur r i 1 (inc i)))
          (if p
            (recur (conj! r [p l]) nil nil (inc i))
            (recur r nil nil (inc i))))))))
(defn- write-index!
  [w idx]
  (loop [offset (+ (* 4 4) (reduce + (map #(+ 1 (count (:name %)) 4) idx)))
         idx idx]
    (when-let [{:keys [name len]} (first idx)]
      (lsb/write-ubyte w (count name))
      (lsb/write-string w name)
      (lsb/write-int w offset)
      (recur (+ offset
                (if-let [{:keys [ambs masks]} (first idx)]
                  (+ 4 4 (* 2 4 (count ambs)) 4 (* 2 4 (count masks)) 4)
                  0) ; dummy
                (quot (dec (+ len 4)) 4))
             (next idx)))))
(def ^:private
  char->twobit
  (doto (byte-array 128)
    (aset-byte (int \C) 1)
    (aset-byte (int \c) 1)
    (aset-byte (int \A) 2)
    (aset-byte (int \a) 2)
    (aset-byte (int \G) 3)
    (aset-byte (int \g) 3)))

Encodes a sequence into twobit format.

(defn write-twobit!
  [^OutputStream o ^String s]
  (let [len (.length s)
        bb (ByteBuffer/wrap (.getBytes s))
        table ^bytes char->twobit]
    (dotimes [_ (quot len 4)]
      (->> (bit-or
            (bit-shift-left (aget table (.get bb)) 6)
            (bit-shift-left (aget table (.get bb)) 4)
            (bit-shift-left (aget table (.get bb)) 2)
            (aget table (.get bb)))
           unchecked-int
           (.write o)))
    (when (pos? (mod len 4))
      (loop [b 0 i (mod len 4) j 1]
        (if (pos? i)
          (recur (bit-or b (bit-shift-left (aget table (.get bb)) (* 2 (- 4 j)))) (dec i) (inc j))
          (.write o (unchecked-int b)))))))

Writes a single sequence entry to writer.

(defn- write-sequence!
  [w sequence idx]
  (let [name (or (:name sequence) (:rname sequence))
        seq-data (or (:seq sequence) (:sequence sequence))
        {:keys [len ambs masks]} (first (filter #(= (:name %) name) idx))]
    (lsb/write-int w len)
    (lsb/write-int w (count ambs))
    (doseq [[s _] ambs]
      (lsb/write-int w s))
    (doseq [[_ l] ambs]
      (lsb/write-int w l))
    (lsb/write-int w (count masks))
    (doseq [[s _] masks]
      (lsb/write-int w s))
    (doseq [[_ l] masks]
      (lsb/write-int w l))
    (lsb/write-int w 0)
    (write-twobit! w seq-data)))
(defn- write-sequences-without-index
  [^TwoBitWriter wtr xs]
  (let [idx (map (fn [{:keys [name rname seq sequence]}]
                   (let [seq-data (or seq sequence)]
                     {:name (or name rname)
                      :len (count seq-data)
                      :masks (mask-regions seq-data)
                      :ambs (amb-regions seq-data)}))
                 xs)]
    (write-file-header! (.writer wtr) (count xs))
    (write-index! (.writer wtr) idx)
    (doseq [sequence xs]
      (write-sequence! (.writer wtr) sequence idx))))
(defn- write-sequences-with-index
  [^TwoBitWriter wtr idx xs]
  (let [idx-atom (atom idx)]
    (write-file-header! (.writer wtr) (count @idx-atom))
    (write-index! (.writer wtr) @idx-atom)
    (doseq [sequence xs]
      (let [name (or (:name sequence) (:rname sequence))
            seq-data (or (:seq sequence) (:sequence sequence))
            masks (mask-regions seq-data)
            ambs (amb-regions seq-data)
            i (first (keep-indexed #(when (= (:name %2) name) %1) @idx-atom))]
        (swap! idx-atom update i assoc :masks masks :ambs ambs))
      (write-sequence! (.writer wtr) sequence @idx-atom))
    ;; finalize
    (.flush ^DataOutputStream (.writer wtr))
    (let [ch (.getChannel ^FileOutputStream (.file-output-stream wtr))]
      (.position ch 16)
      (write-index! ch @idx-atom))))

Writes all sequences to wtr. Input sequences must be a sequence of maps.

(defn write-sequences
  [^TwoBitWriter wtr xs]
  (if (nil? (.index wtr))
    (write-sequences-without-index wtr xs)
    (write-sequences-with-index wtr (.index wtr) xs)))