(ns cljam.io.bcf.writer
(:require [clojure.string :as cstr]
[cljam.io.protocols :as protocols]
[cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb.io-stream :as lsb]
[cljam.io.vcf.writer :as vw]
[cljam.io.vcf.util :as vcf-util]
[cljam.util :as util])
(:import [java.io Closeable DataOutputStream]
[java.net URL]
[java.nio ByteBuffer ByteOrder])) | |
(declare write-variants) | |
(deftype BCFWriter [^URL url meta-info header ^DataOutputStream writer]
Closeable
(close [this]
(.close ^Closeable (.writer this)))
protocols/IWriter
(writer-url [this] (.url this))
protocols/IVariantWriter
(write-variants [this variants]
(write-variants this variants))) | |
(def ^:private ^:const default-fileformat "VCFv4.3")
(def ^:private ^:const bcf-meta-keys
[:fileformat :file-date :source :reference :contig :phasing :info :filter
:format :alt :sample :pedigree])
(def ^:private ^:const meta-info-prefix "##")
(def ^:private ^:const type-kws
{"String" :str, "Character" :char,
"Integer" :int, "Float" :float, "Flag" :flag})
(def ^:private ^:const default-pass-filter
{:id "PASS", :description "All filters passed"}) | |
Converts meta-info rows to a sequence of strings. | (defn- stringify-meta
[meta-info]
(apply
concat
(for [k bcf-meta-keys :let [v (meta-info k)] :when v]
(if (sequential? v)
(for [x v]
(str meta-info-prefix (vw/stringify-key k) "=<" (vw/stringify-structured-line k x) ">"))
[(str meta-info-prefix (vw/stringify-key k) "=" v)])))) |
Writes BCF file header, meta-info and header row to writer. | (defn- write-file-header
[^BCFWriter w]
(let [wtr ^DataOutputStream (.writer w)
hdr-ba (-> \newline
(cstr/join
(concat
(stringify-meta (.meta-info w))
[(vw/stringify-header (.header w))]))
(str \newline) ;; newline at the end of the header
(str (char 0)) ;; NULL-terminated
.getBytes)
hlen (alength hdr-ba)]
(lsb/write-bytes wtr (byte-array (map (comp byte int) "BCF\2\2")))
(lsb/write-int wtr hlen)
(lsb/write-bytes wtr hdr-ba))) |
(defn- index-meta
[meta-info]
(let [m (update meta-info :filter
(fn [xs]
(let [{[p] true, f false} (group-by #(= "PASS" (:id %)) xs)]
(cons (or p default-pass-filter) f))))
fif (->> [:filter :info :format]
(mapcat #(map vector (repeat %) (% m)))
(map-indexed (fn [i [k v]] [k (assoc v :idx (str i))]))
(reduce (fn [r [k v]] (update r k (fnil conj []) v)) {}))]
(-> meta-info
(update :contig #(map-indexed (fn [i c] (assoc c :idx (str i))) %))
(merge fif)))) | |
Returns an open cljam.bcf.BCFWriter of f. Meta-information lines and a header line will be written in this function. Should be used inside with-open to ensure the Writer is properly closed. e.g. (with-open [wtr (writer "out.bcf" {:file-date "20090805", :source "myImpu..." ...} ["CHROM" "POS" "ID" "REF" "ALT" ...])] (WRITING-BCF)) | (defn writer
^BCFWriter
[f meta-info header]
(let [bos (bgzf/bgzf-output-stream f)
dos (DataOutputStream. bos)
indexed-meta (->> meta-info
(merge {:fileformat default-fileformat})
index-meta)]
(doto (BCFWriter. (util/as-url f) indexed-meta header dos)
(write-file-header)))) |
Returns an integer indicating type of input value. | (defn- value-type
^long [v]
(cond
(nil? v) 1
(keyword? v) 1
(float? v) 5
(char? v) 7
;; 0x80-0x87, 0x8000-0x8007 and 0x80000000-0x80000007
;; are reserved for future use in the BCF2 spec.
(<= (+ Byte/MIN_VALUE 8) v Byte/MAX_VALUE) 1
(<= (+ Short/MIN_VALUE 8) v Short/MAX_VALUE) 2
(<= (+ Integer/MIN_VALUE 8) v Integer/MAX_VALUE) 3)) |
(def ^:private ^:const int8-special-map
{nil 0x80 :eov 0x81 :exists 1})
(def ^:private ^:const int16-special-map
{nil 0x8000 :eov 0x8001 :exists 1})
(def ^:private ^:const int32-special-map
{nil 0x80000000 :eov 0x80000001 :exists 1})
(def ^:private ^:const float32-special-map
{nil 0x7F800001 :eov 0x7F800002 :exists 1}) | |
(defn- encode-typed-value
(^bytes [element-type v]
(if (= v :exists)
(byte-array [0x00])
(encode-typed-value element-type [v] 1)))
(^bytes [element-type vs ^long n-sample]
(let [str? (or (= element-type :str) (= element-type :char))
vs (map (fn [v]
(let [v (if (sequential? v) v [v])]
(if str? (cstr/join \, (map #(or % ".") v)) v))) vs)
max-len (long (apply max 0 (map count vs)))
vs (map
(fn [v]
(let [l (max 1 (count v))]
(if (< l max-len)
(if str?
(apply str v (repeat (- max-len l) (char 0)))
(concat
(or (seq v) [nil])
(repeat (- max-len l) :eov)))
v))) vs)
type-id (case element-type
(:str :char) 7
:float 5
(long (apply max 1 (mapcat (partial map value-type) vs))))
type-byte (bit-or (bit-shift-left (min 15 max-len) 4) type-id)
len-bytes (when (<= 15 max-len)
(encode-typed-value :int max-len))
n-bytes (+ (* n-sample max-len (case type-id 1 1 2 2 3 4 5 4 7 1))
1 (if len-bytes (long (alength ^bytes len-bytes)) 0))
bb (ByteBuffer/allocate n-bytes)]
(.order bb ByteOrder/LITTLE_ENDIAN)
(.put bb (unchecked-byte type-byte))
(when len-bytes
(.put bb ^bytes len-bytes))
(doseq [v vs
b v]
(case type-id
1 (.put bb (unchecked-byte (get int8-special-map b b)))
2 (.putShort bb (unchecked-short (get int16-special-map b b)))
3 (.putInt bb (unchecked-int (get int32-special-map b b)))
5 (.putInt bb (unchecked-int (or (get float32-special-map b)
(Float/floatToRawIntBits b))))
7 (.put bb (byte (get {nil 0 :eov 0} b (int b))))))
(.array bb)))) | |
(defn- concat-bytes
^bytes [xs]
(let [l (long (transduce (map alength) + xs))
bb (ByteBuffer/allocate l)]
(doseq [x xs]
(.put bb ^bytes x))
(.array bb))) | |
Encodes shared part of a variant and returns as a byte buffer. | (defn- encode-variant-shared
^ByteBuffer [{:keys [chr pos id ref-length alt qual info n-sample]
ref-bases :ref filters :filter formats :format}]
(let [chrom-id (unchecked-int chr)
pos (unchecked-int (dec ^long pos))
rlen (unchecked-int ref-length)
qual (unchecked-int (if qual
(Float/floatToRawIntBits qual)
(float32-special-map nil)))
n-allele (inc (count alt))
n-info (count info)
n-allele-info (bit-or (bit-shift-left n-allele 16) n-info)
n-fmt (count formats)
n-fmt-sample (bit-or (bit-shift-left n-fmt 24) ^long n-sample)
id (if id (encode-typed-value :str id) (byte-array [0x07]))
refseq ^bytes (encode-typed-value :str ref-bases)
altseq (concat-bytes (map (partial encode-typed-value :str) alt))
filters (if-let [f (seq filters)]
(encode-typed-value :int f)
(byte-array [0x00]))
info (if (pos? n-info)
(->> info
(mapcat
(fn [[k t v]]
[(encode-typed-value :int k)
(encode-typed-value t v)]))
concat-bytes)
(byte-array 0))
l-shared (+ 24
(alength id) (alength refseq) (alength altseq)
(alength filters) (alength info))]
(doto (ByteBuffer/allocate l-shared)
(.order ByteOrder/LITTLE_ENDIAN)
(.putInt chrom-id)
(.putInt pos)
(.putInt rlen)
(.putInt qual)
(.putInt n-allele-info)
(.putInt n-fmt-sample)
(.put id)
(.put refseq)
(.put altseq)
(.put filters)
(.put info)))) |
(defn- encode-variant-indv
[{:keys [^long n-sample genotype]}]
(->> genotype
(mapcat
(fn [[k t vs]]
[(encode-typed-value :int k)
(encode-typed-value t vs n-sample)]))
concat-bytes
(ByteBuffer/wrap))) | |
Encodes a BCF-style variant map and write it to writer. | (defn- write-variant
[w v]
(let [shared-ba (.array ^ByteBuffer (encode-variant-shared v))
indv-ba (.array ^ByteBuffer (encode-variant-indv v))]
(lsb/write-uint w (alength shared-ba))
(lsb/write-uint w (alength indv-ba))
(lsb/write-bytes w shared-ba)
(lsb/write-bytes w indv-ba))) |
Converts a parsed variant map to BCF-style map. | (defn- parsed-variant->bcf-map
[[fmt-kw & indiv-kws :as kws] contigs filters formats info variant]
(let [fmts (keep (fn [f] (when-let [m (formats f)] [f m])) (variant fmt-kw))
genotype (map
(fn [[k {:keys [idx type-kw]}]]
(->> indiv-kws
(map #(cond-> (get-in variant [% k] nil)
(= k :GT) vcf-util/genotype->ints))
(vector idx type-kw)))
fmts)]
(-> (apply dissoc variant kws)
(assoc :n-sample (count indiv-kws)
:ref-length (if-let [e (get-in variant [:info :END])]
(inc (- (long e) (long (:pos variant))))
(count (:ref variant)))
:format (map (comp :idx second) fmts)
:genotype genotype)
(update :chr (comp :idx contigs))
(update :filter (fn [f] (map (comp :idx filters) f)))
(update :info (fn [i]
(keep (fn [[k v]]
(when-let [{:keys [idx type-kw]} (info k)]
[idx type-kw v])) i)))))) |
Creates a map for searching meta-info with (f id). | (defn- meta->map
[meta-info f]
(into {} (map
(fn [{:keys [id] t :type :as m}]
[(f id) (cond-> (update m :idx #(Integer/parseInt %))
t (assoc :type-kw (type-kws t)))])) meta-info)) |
Writes data lines on writer. Returns nil. (write-variants [{:chr "19", :pos 111, :id nil, :ref "A", :alt ["C"], :qual 9.6, :filter [:PASS], :info {:DP 4}, :FORMAT [:GT :HQ] ...} ...]) | (defn write-variants
[^BCFWriter w variants]
(let [kws (mapv keyword (drop 8 (.header w)))
contigs (meta->map (:contig (.meta-info w)) identity)
filters (assoc (meta->map (:filter (.meta-info w)) keyword)
:PASS {:idx 0})
formats (-> (.meta-info w)
:format
(meta->map keyword)
(assoc-in [:GT :type-kw] :int)
(assoc-in [:GT :number] nil))
info (meta->map (:info (.meta-info w)) keyword)
parse-variant (vcf-util/variant-parser (.meta-info w) (.header w))]
(doseq [v variants]
(->> (if (some string? ((apply juxt :filter :info kws) v))
(parse-variant v)
v)
(parsed-variant->bcf-map kws contigs filters formats info)
(write-variant (.writer w)))))) |