(ns cljam.io.pileup
  (:require [clojure.java.io :as cio]
            [clojure.string :as cstr]
            [cljam.io.sam.util.quality :as qual]
            [cljam.io.sequence :as cseq]
            [proton.core :as p])
  (:import [java.io Closeable BufferedWriter])
  (:refer-clojure :exclude [ref]))

Records

(defrecord PileupBase [^boolean start?
                       mapq ;; byte?
                       ^char base
                       ^short qual
                       ^boolean reverse?
                       ^boolean end?
                       insertion ;; String?
                       deletion ;; int?
                       qname ;; String?
                       alignment]) ;; SAMAlignment?

SAMAlignment?

(defrecord LocusPile [rname
                      ^int pos
                      pile])

Reader

(deftype PileupReader [reader]
  Closeable
  (close [this]
    (.close ^Closeable (.reader this))))

Returns an open instance of cljam.io.pileup.PileupReader of f. Should be used inside with-open to ensure the reader is properly closed.

(defn reader
  ^PileupReader
  [f]
  (PileupReader. (cio/reader f)))
(def ^:private
  upper-table
  (let [ba (byte-array 128)]
    (doseq [c "ATGCN"]
      (aset ba (byte (int c)) (byte (int c)))
      (aset ba
            (+ (byte (int c))
               (- (byte (int \a))
                  (byte (int \A))))
            (byte (int c))))
    (doseq [[from to] [[\, -1] [\. -1] [\< \>] [\> \>] [\* \*]]]
      (aset ba (int from) (byte (int to))))
    ba))
(defn- parse-bases-col
  [ref-base ^String column]
  ;; TODO: Too slow and unreadable
  (let [len (.length column)]
    (loop [i 0
           results (transient [])]
      (if (< i len)
        (let [c (.charAt column i)
              start? (= \^ c)
              mapq (when start? (qual/fastq-char->phred-byte (.charAt column (inc i))))
              base (if mapq (.charAt column (+ i 2)) c)
              base-int (int base)
              reverse? (or (= base-int 60)
                           (= base-int 44)
                           (<= 97 base-int))
              upper-base-int (aget ^bytes upper-table base-int)
              upper-base (if (neg? upper-base-int) ref-base (char upper-base-int))]
          (when (zero? upper-base-int)
            (throw (ex-info (format "Invalid character %s in %s" base column) {:column column :base base})))
          (if (= len (+ i (if mapq 3 1)))
            (persistent! (conj! results (PileupBase. start? mapq upper-base -1 reverse? false nil nil nil nil)))
            (let [x (.charAt column (+ i (if mapq 3 1)))
                  ins? (= x \+)
                  del? (= x \-)
                  [indel-num-chars indel-num] (when (or ins? del?)
                                                (loop [j (+ i (if mapq 4 2))
                                                       k 0
                                                       results 0]
                                                  (let [c (.charAt column j)]
                                                    (if (Character/isDigit c)
                                                      (recur (inc j) (inc k) (+ (* 10 results) (- (int c) 48)))
                                                      [k results]))))
                  indel-seq (when indel-num
                              (let [s (+ i (if mapq 4 2) (long indel-num-chars))]
                                (cstr/upper-case (.substring column s (+ s (long indel-num))))))
                  end? (boolean (or (= \$ x)
                                    (and indel-num
                                         (not= len (+ i (if mapq 4 2) (long indel-num-chars) (long indel-num)))
                                         (= \$ (.charAt column (+ i (if mapq 4 2) (long indel-num-chars) (long indel-num)))))))
                  next-pos (long (+ i 1 (if mapq 2 0)
                                    (if indel-num (+ 1 (long indel-num-chars) (long indel-num)) 0)
                                    (if end? 1 0)))]
              (recur next-pos
                     (conj! results (PileupBase. start? mapq upper-base -1 reverse? end? (when ins? indel-seq) (when del? indel-num) nil nil))))))
        (persistent! results)))))
(defn- parse-pileup-line
  [line]
  (let [[rname pos [ref-base] cnt bases' quals & _] (cstr/split line #"\t")
        ;; TODO: handle extra columns like mapq, qname...
        upper-ref-base (Character/toUpperCase ^char ref-base)
        pile (mapv (fn [b q] (assoc b :qual q))
                   (some->> bases' (parse-bases-col upper-ref-base))
                   (some-> quals qual/fastq->phred))]
    (-> (LocusPile. rname (p/as-long pos) pile)
        (assoc :count (p/as-int cnt) :ref ref-base))))

Reads piled-up bases of the pileup file, returning them as a lazy sequence of cljam.io.pileup.LocusPile.

(defn read-piles
  [^PileupReader pileup-reader]
  (->> pileup-reader
       .reader
       line-seq
       (map parse-pileup-line)))

Writer

(defn- write-mpileup-alignment!
  [^BufferedWriter w ref-reader rname ref-pos ref'
   {:keys [base reverse? mapq start? end? insertion deletion]}]
  (let [case-base-fn (if-not reverse?
                       identity
                       (fn [c]
                         (if (= c \>)
                           \<
                           (Character/toLowerCase ^char c))))
        case-fn (if-not reverse? identity cstr/lower-case)]
    (when start?
      (.append w \^)
      (.append w (qual/phred-byte->fastq-char mapq)))
    (if (= base ref') ;; match / deleted / skipped
      (.append w (if reverse? \, \.))
      (.append w (unchecked-char (case-base-fn base))))
    (when deletion
      (.append w \-)
      (.append w (String/valueOf (int deletion)))
      (.append w ^String (case-fn
                          (if ref-reader
                            (->> {:chr rname
                                  :start (inc (long ref-pos))
                                  :end (+ (long ref-pos) (int deletion))}
                                 (cseq/read-sequence ref-reader))
                            (apply str (repeat deletion \N))))))
    (when insertion
      (.append w \+)
      (.append w (String/valueOf (count insertion)))
      (.append w ^String (case-fn insertion)))
    (when end?
      (.append w \$))))
(defn- write-mpileup-line!
  [^BufferedWriter w ref-reader {:keys [rname pos pile]}]
  (let [ref-base (some-> ref-reader
                         (cseq/read-sequence
                          {:chr rname :start pos :end pos}
                          {:mask? true}))
        ref-char (some-> ref-base cstr/upper-case first)]
    (.write w ^String rname)
    (.append w \tab)
    (.write w (String/valueOf (long pos)))
    (.append w \tab)
    (.write w (or ^String ref-base "N"))
    (.append w \tab)
    (.write w (String/valueOf (count pile)))
    (.append w \tab)
    (doseq [p pile]
      (write-mpileup-alignment! w ref-reader rname pos ref-char p))
    (.append w \tab)
    (doseq [p pile]
      (.append w (qual/phred-byte->fastq-char (:qual p))))))
(deftype PileupWriter [writer ref-reader]
  Closeable
  (close [this]
    (.close ^Closeable (.writer this))
    (some-> ^Closeable (.ref-reader this) .close)))

Returns an open instance of cljam.io.pileup.PileupWriter of f. Should be used inside with-open to ensure the writer is properly closed. A reference sequence is used when the second arg reference-path is supplied.

(defn writer
  (^PileupWriter [f]
   (writer f nil))
  (^PileupWriter [f reference-path]
   (PileupWriter. (cio/writer f) (some-> reference-path cseq/reader))))

Writes piled-up bases to a file. pileup-writer must be an instance of cljam.io.pileup.PileupWriter.

(defn write-piles
  [^PileupWriter pileup-writer piles]
  (let [^BufferedWriter wtr (.writer pileup-writer)
        ref-reader (.ref-reader pileup-writer)]
    (doseq [pile piles]
      (write-mpileup-line! wtr ref-reader pile)
      (.newLine wtr))
    (.flush wtr)))