(ns cljam.io.pileup
(:require [clojure.java.io :as cio]
[clojure.string :as cstr]
[cljam.io.sam.util.quality :as qual]
[cljam.io.sequence :as cseq]
[proton.core :as p])
(:import [java.io Closeable BufferedWriter])
(:refer-clojure :exclude [ref])) | |
Records | |
(defrecord PileupBase [^boolean start?
mapq ;; byte?
^char base
^short qual
^boolean reverse?
^boolean end?
insertion ;; String?
deletion ;; int?
qname ;; String?
alignment]) ;; SAMAlignment? | |
SAMAlignment? | |
(defrecord LocusPile [rname
^int pos
pile]) | |
Reader | |
(deftype PileupReader [reader]
Closeable
(close [this]
(.close ^Closeable (.reader this)))) | |
Returns an open instance of cljam.io.pileup.PileupReader of f. Should be used inside with-open to ensure the reader is properly closed. | (defn reader ^PileupReader [f] (PileupReader. (cio/reader f))) |
(def ^:private
upper-table
(let [ba (byte-array 128)]
(doseq [c "ATGCN"]
(aset ba (byte (int c)) (byte (int c)))
(aset ba
(+ (byte (int c))
(- (byte (int \a))
(byte (int \A))))
(byte (int c))))
(doseq [[from to] [[\, -1] [\. -1] [\< \>] [\> \>] [\* \*]]]
(aset ba (int from) (byte (int to))))
ba)) | |
(defn- parse-bases-col
[ref-base ^String column]
;; TODO: Too slow and unreadable
(let [len (.length column)]
(loop [i 0
results (transient [])]
(if (< i len)
(let [c (.charAt column i)
start? (= \^ c)
mapq (when start? (qual/fastq-char->phred-byte (.charAt column (inc i))))
base (if mapq (.charAt column (+ i 2)) c)
base-int (int base)
reverse? (or (= base-int 60)
(= base-int 44)
(<= 97 base-int))
upper-base-int (aget ^bytes upper-table base-int)
upper-base (if (neg? upper-base-int) ref-base (char upper-base-int))]
(when (zero? upper-base-int)
(throw (ex-info (format "Invalid character %s in %s" base column) {:column column :base base})))
(if (= len (+ i (if mapq 3 1)))
(persistent! (conj! results (PileupBase. start? mapq upper-base -1 reverse? false nil nil nil nil)))
(let [x (.charAt column (+ i (if mapq 3 1)))
ins? (= x \+)
del? (= x \-)
[indel-num-chars indel-num] (when (or ins? del?)
(loop [j (+ i (if mapq 4 2))
k 0
results 0]
(let [c (.charAt column j)]
(if (Character/isDigit c)
(recur (inc j) (inc k) (+ (* 10 results) (- (int c) 48)))
[k results]))))
indel-seq (when indel-num
(let [s (+ i (if mapq 4 2) (long indel-num-chars))]
(cstr/upper-case (.substring column s (+ s (long indel-num))))))
end? (boolean (or (= \$ x)
(and indel-num
(not= len (+ i (if mapq 4 2) (long indel-num-chars) (long indel-num)))
(= \$ (.charAt column (+ i (if mapq 4 2) (long indel-num-chars) (long indel-num)))))))
next-pos (long (+ i 1 (if mapq 2 0)
(if indel-num (+ 1 (long indel-num-chars) (long indel-num)) 0)
(if end? 1 0)))]
(recur next-pos
(conj! results (PileupBase. start? mapq upper-base -1 reverse? end? (when ins? indel-seq) (when del? indel-num) nil nil))))))
(persistent! results))))) | |
(defn- parse-pileup-line
[line]
(let [[rname pos [ref-base] cnt bases' quals & _] (cstr/split line #"\t")
;; TODO: handle extra columns like mapq, qname...
upper-ref-base (Character/toUpperCase ^char ref-base)
pile (mapv (fn [b q] (assoc b :qual q))
(some->> bases' (parse-bases-col upper-ref-base))
(some-> quals qual/fastq->phred))]
(-> (LocusPile. rname (p/as-long pos) pile)
(assoc :count (p/as-int cnt) :ref ref-base)))) | |
Reads piled-up bases of the pileup file, returning them as a lazy sequence of cljam.io.pileup.LocusPile. | (defn read-piles
[^PileupReader pileup-reader]
(->> pileup-reader
.reader
line-seq
(map parse-pileup-line))) |
Writer | |
(defn- write-mpileup-alignment!
[^BufferedWriter w ref-reader rname ref-pos ref'
{:keys [base reverse? mapq start? end? insertion deletion]}]
(let [case-base-fn (if-not reverse?
identity
(fn [c]
(if (= c \>)
\<
(Character/toLowerCase ^char c))))
case-fn (if-not reverse? identity cstr/lower-case)]
(when start?
(.append w \^)
(.append w (qual/phred-byte->fastq-char mapq)))
(if (= base ref') ;; match / deleted / skipped
(.append w (if reverse? \, \.))
(.append w (unchecked-char (case-base-fn base))))
(when deletion
(.append w \-)
(.append w (String/valueOf (int deletion)))
(.append w ^String (case-fn
(if ref-reader
(->> {:chr rname
:start (inc (long ref-pos))
:end (+ (long ref-pos) (int deletion))}
(cseq/read-sequence ref-reader))
(apply str (repeat deletion \N))))))
(when insertion
(.append w \+)
(.append w (String/valueOf (count insertion)))
(.append w ^String (case-fn insertion)))
(when end?
(.append w \$)))) | |
(defn- write-mpileup-line!
[^BufferedWriter w ref-reader {:keys [rname pos pile]}]
(let [ref-base (some-> ref-reader
(cseq/read-sequence
{:chr rname :start pos :end pos}
{:mask? true}))
ref-char (some-> ref-base cstr/upper-case first)]
(.write w ^String rname)
(.append w \tab)
(.write w (String/valueOf (long pos)))
(.append w \tab)
(.write w (or ^String ref-base "N"))
(.append w \tab)
(.write w (String/valueOf (count pile)))
(.append w \tab)
(doseq [p pile]
(write-mpileup-alignment! w ref-reader rname pos ref-char p))
(.append w \tab)
(doseq [p pile]
(.append w (qual/phred-byte->fastq-char (:qual p)))))) | |
(deftype PileupWriter [writer ref-reader]
Closeable
(close [this]
(.close ^Closeable (.writer this))
(some-> ^Closeable (.ref-reader this) .close))) | |
Returns an open instance of cljam.io.pileup.PileupWriter of f. Should be used
inside with-open to ensure the writer is properly closed. A reference sequence
is used when the second arg | (defn writer (^PileupWriter [f] (writer f nil)) (^PileupWriter [f reference-path] (PileupWriter. (cio/writer f) (some-> reference-path cseq/reader)))) |
Writes piled-up bases to a file. | (defn write-piles
[^PileupWriter pileup-writer piles]
(let [^BufferedWriter wtr (.writer pileup-writer)
ref-reader (.ref-reader pileup-writer)]
(doseq [pile piles]
(write-mpileup-line! wtr ref-reader pile)
(.newLine wtr))
(.flush wtr))) |