A type of VCF reader and internal functions to read VCF contents. See https://samtools.github.io/hts-specs/ for the detail VCF specifications. | (ns cljam.io.vcf.reader
(:require [clojure.string :as cstr]
[clojure.tools.logging :as logging]
[cljam.io.protocols :as protocols]
[camel-snake-kebab.core :refer [->kebab-case-keyword]]
[proton.core :refer [as-long]]
[cljam.io.util.bin :as util-bin]
[cljam.io.vcf.util :as vcf-util])
(:import [java.io Closeable BufferedReader FileNotFoundException]
[clojure.lang LazilyPersistentVector]
bgzf4j.BGZFInputStream)) |
VCFReader | |
(declare read-variants read-variants-randomly read-file-offsets) | |
(deftype VCFReader [url meta-info header reader index-delay]
Closeable
(close [this]
(.close ^Closeable (.reader this)))
protocols/IReader
(reader-url [this] (.url this))
(read [this] (read-variants this))
(read [this option] (read-variants this option))
(indexed? [_]
(try
@index-delay
true
(catch FileNotFoundException _
false)))
protocols/IRegionReader
(read-in-region [this region]
(protocols/read-in-region this region {}))
(read-in-region [this {:keys [chr start end]} option]
(logging/warn "May cause degradation of performance.")
(filter
(fn [v]
(and (if chr (= (:chr v) chr) true)
(if start (<= (long start) (long (:pos v))) true)
(if end (<= (+ (long (:pos v)) (count (:ref v))) (long end)) true)))
(read-variants this option)))) | |
need dynamic extension for namespace issue. | (extend-type VCFReader
protocols/IVariantReader
(meta-info [this] (.meta-info this))
(header [this] (.header this))
(read-variants
([this] (protocols/read-variants this {}))
([this option] (read-variants this option)))
(read-variants-randomly [this region-option deep-option]
(read-variants-randomly this region-option deep-option))
(read-file-offsets [this]
(read-file-offsets this))) |
Utilities | |
(defn- dot->nil [^String s] ;; Avoid calling equiv on strings. (when-not (and (= 1 (.length s)) (= \. (.charAt s 0))) s)) | |
Loading meta-information | |
(defn- meta-line? [line] (= (subs line 0 2) "##")) | |
e.g. (parse-structured-line "ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\"") => {:id "NS" :number "1" :type "Integer" :description "Number of Samples With Data"} | (defn- parse-structured-line
"e.g. (parse-structured-line \"ID=NS,Number=1,Type=Integer,Description=\\\"Number of Samples With Data\\\"\")
=> {:id \"NS\" :number \"1\" :type \"Integer\"
:description \"Number of Samples With Data\"}"
[s]
(letfn [(parse-string-field [s]
(loop [s s, acc []]
(if-let [[_ pre c post] (re-matches #"([^\\\"]*?)([\\\"])(.*)" s)]
(if (= c "\\")
;; expecting one of the special characters occurs immediately after \\
(if (#{\\ \"} (first post))
(recur (subs post 1) (conj acc pre (first post)))
(let [msg (str "Either '\\' or '\"' was expected immediately after '\\', "
"but got '" (first post) "'")]
(throw (RuntimeException. msg))))
;; c == \", that is, we are now at the end of the string field
[nil (cstr/join (conj acc pre)) (cstr/replace post #"^," "")])
(throw (RuntimeException. "Unexpected end of string field")))))]
(loop [s s, m {}]
(if-let [[_ k more] (some->> s (re-matches #"([^=]+?)=(.*)"))]
(let [[_ v more] (if (= (first more) \")
(parse-string-field (subs more 1))
(re-matches #"([^,]*?)(?:,(.*))?" more))]
(recur more (assoc m (->kebab-case-keyword k) (dot->nil v))))
m)))) |
(defn- parse-meta-info-contig [m] (update m :length as-long)) | |
(defn- parse-meta-info-info
[m]
(update m :number (fn [s]
(if (#{"A" "R" "G"} s)
s
(as-long s))))) | |
Parses a single line string as meta-info. | (defn parse-meta-info-line
[line]
(let [[_ k* v] (re-find #"^##([\w:/\.\?\-]*)=(.*)$" line)
k (->kebab-case-keyword k*)]
[k (if-let [[_ s] (re-find #"^<(.+)>$" v)]
(cond-> (parse-structured-line s)
(#{:info :format} k) parse-meta-info-info
(= k :contig) parse-meta-info-contig)
v)])) |
Reads from | (defn load-meta-info
[^BufferedReader rdr]
(loop [line (.readLine rdr), meta-info {}]
(if (meta-line? line)
(let [[k v] (parse-meta-info-line line)]
(recur (.readLine rdr)
(if (#{:contig :info :filter :format :alt :sample :pedigree} k)
(if (get meta-info k)
(update meta-info k conj v)
(assoc meta-info k [v]))
(assoc meta-info k v))))
meta-info))) |
Loading header | |
(defn- header-line? [line] (not (nil? (re-find #"^#[^#]*$" line)))) | |
Parses a single line string as header | (defn parse-header-line [line] (cstr/split (subs line 1) #"\t")) |
Reads from | (defn load-header
[^BufferedReader rdr]
(loop [line (.readLine rdr)]
(if (header-line? line)
(parse-header-line line)
(recur (.readLine rdr))))) |
Reading data lines | |
(defn- parse-data-line
[line kws]
;; When splitting a string with single-character delimiter,
;; `java.lang.String#split` is slightly faster than `clojure.string/split`.
;; For more details, please refer to https://github.com/chrovis/cljam/pull/29.
(let [[fields gt-fields] (->> (.split ^String line "\t")
LazilyPersistentVector/createOwning
(map dot->nil)
(split-at 8))]
(->> gt-fields
(interleave kws)
(concat [:chr (first fields)
:pos (as-long (nth fields 1))
:id (nth fields 2)
:ref (nth fields 3)
:alt (some-> (nth fields 4) (cstr/split #","))
:qual (some-> (nth fields 5) Double/parseDouble)
:filter (nth fields 6)
:info (nth fields 7)])
(apply hash-map)))) | |
(defn- read-data-lines
[rdr header kws]
(when-let [line (if (instance? BufferedReader rdr)
(.readLine ^BufferedReader rdr)
(.readLine ^BGZFInputStream rdr))]
(if-not (or (meta-line? line) (header-line? line))
(cons (parse-data-line line kws)
(lazy-seq (read-data-lines rdr header kws)))
(recur rdr header kws)))) | |
Reads variants of the VCF/BCF file, returning them as a lazy sequence. | (defn read-variants
([rdr]
(read-variants rdr {}))
([^VCFReader rdr {:keys [depth] :or {depth :deep}}]
(let [kws (mapv keyword (drop 8 (.header rdr)))
parse-fn (case depth
:deep (vcf-util/variant-parser (.meta-info rdr) (.header rdr))
:vcf identity)]
(map parse-fn (read-data-lines (.reader rdr) (.header rdr) kws))))) |
(defn- make-lazy-variants [f s]
(when-first [fs s]
(lazy-cat
(f fs)
(make-lazy-variants f (rest s))))) | |
Reads variants of the bgzip compressed VCF file randomly using tabix/csi file Returning them as a lazy sequence. | (defn read-variants-randomly
[^VCFReader rdr
{:keys [chr ^long start ^long end] :or {start 1 end 4294967296}}
{:keys [depth] :or {depth :deep}}]
(let [kws (mapv keyword (drop 8 (.header rdr)))
index-data @(.index-delay rdr)
chr-names (or (util-bin/get-chr-names index-data)
(->> (.meta-info rdr) :contig (mapv :id)))
ref-idx (.indexOf ^clojure.lang.PersistentVector chr-names chr)
spans (when-not (neg? ref-idx)
(util-bin/get-spans index-data ref-idx start end))
input-stream ^BGZFInputStream (.reader rdr)
parse-fn (case depth
:deep (vcf-util/variant-parser (.meta-info rdr)
(.header rdr))
:vcf identity)]
(make-lazy-variants
(fn [[chunk-beg ^long chunk-end]]
(.seek input-stream chunk-beg)
(->> #(when (< (.getFilePointer input-stream) chunk-end)
(-> input-stream
.readLine
(parse-data-line kws)
parse-fn))
repeatedly
(take-while identity)
(filter
(fn [{chr' :chr
ref' :ref
:keys [^long pos info]}]
(and (= chr' chr)
(<= pos end)
(<= start
(long (get info :END (dec (+ pos (count ref')))))))))))
spans))) |
Reads file offsets and a genomic position of variants from bgzip compressed VCF and returns them as a lazy sequence. Each element is a map containing :chr, :chr-index, :beg, :end, :file-beg, :file-end. | (defn read-file-offsets
[^VCFReader rdr]
(let [^BGZFInputStream input-stream (.reader rdr)
meta-info-contigs (->> (:contig (.meta-info rdr))
(map-indexed (fn [index contig]
[(:id contig) index]))
(into {}))
parse (comp (vcf-util/variant-parser (.meta-info rdr)
(take 8 (.header rdr)))
#(parse-data-line % nil))]
(letfn [(step [contigs beg-pointer]
(when-let [line (.readLine input-stream)]
(let [end-pointer (.getFilePointer input-stream)]
(if (or (meta-line? line) (header-line? line))
(lazy-seq (step contigs end-pointer))
(let [{:keys [chr pos info] ref' :ref} (parse line)
contigs' (if (contains? contigs chr)
contigs
(assoc contigs chr (count contigs)))]
(cons {:file-beg beg-pointer, :file-end end-pointer
:chr-index (contigs' chr), :beg pos, :chr chr,
:end (or (:END info)
(dec (+ (long pos) (count ref'))))}
(lazy-seq (step contigs' end-pointer))))))))]
(step meta-info-contigs 0)))) |