(ns cljam.io.vcf.util
(:require [clojure.string :as cstr]
[proton.core :as p])) | |
Checks if given string is equal to "." or nil. | (definline dot-or-nil?
[^String s]
`(or (nil? ~s)
(empty? ~s)
(and (= 1 (.length ~s))
(= \. (.charAt ~s 0))))) |
(defn- parse-float [^String s]
(let [m (re-matcher #"(?i)[+-]?(nan|inf(?:inity)?)" s)]
(if (.matches m)
(case (.charAt (.group m 1) 0)
(\n \N) Float/NaN
(\i \I) (if (= (.charAt s 0) \-)
Float/NEGATIVE_INFINITY
Float/POSITIVE_INFINITY))
(Float/parseFloat s)))) | |
Returns a parser function for given type identifier. | (defn- value-parser
[type']
(case type'
"Flag" (constantly :exists)
"Integer" #(when-not (dot-or-nil? ^String %) (Integer/parseInt %))
"Float" #(when-not (dot-or-nil? ^String %) (parse-float %))
"Character" #(when-not (dot-or-nil? ^String %) (first %))
"String" #(when-not (dot-or-nil? ^String %) %))) |
Creates a key-value paired vector of 'id' and its parser. | (defn- meta->parser
[{:keys [id number] type' :type}]
(let [p (value-parser type')]
[id (comp (if (or (= number 1) (= number 0)) first identity)
(fn [x] (map p (cstr/split (or x "") #","))))])) |
Returns a parser function defined by meta-info. info-meta must be a sequence of map containing keys :id, :number and :type. The parser takes a string and returns a map. | (defn info-parser
[info-meta]
(let [parser-map (into {} (map meta->parser) info-meta)]
(fn [^String s]
(when-not (dot-or-nil? s)
(->> (cstr/split s #";")
(into
{}
(map (fn [ss]
(let [[k vs] (cstr/split ss #"\=" 2)]
(if-let [parser (parser-map k)]
[(keyword k) (parser vs)]
(throw (ex-info (str "Undeclared INFO, " k ".")
{:info ss})))))))))))) |
Returns a stringifier function of INFO field. Takes a vector of maps from info rows of meta-info. The stringifier takes a map and returns a string. | (defn info-stringifier
[info-meta]
(let [id-ordered (mapv :id info-meta)
info-type (into {} (map (juxt :id :type)) info-meta)]
(fn [info]
(when info
(->> id-ordered
(keep
(fn [k]
(when-let [v (info (keyword k))]
(if (or (= v :exists) (= (info-type k) "Flag"))
k
(str k "=" (if (sequential? v) (cstr/join "," v) v))))))
(cstr/join ";")
not-empty))))) |
Parses FILTER field and returns a sequence of keywords. | (defn parse-filter
[^String s]
(when-not (dot-or-nil? s)
(map keyword (cstr/split s #";")))) |
Stringifies FILTER field. Takes a sequence of keywords or strings. | (defn stringify-filter
[fltr]
(when (and (some? fltr) (some some? fltr))
(cstr/join ";" (map name fltr)))) |
Parses FORMAT field and returns as a sequence of keywords. | (defn parse-format
[^String s]
(when-not (dot-or-nil? s)
(map keyword (cstr/split s #":")))) |
Stringifies FORMAT field. Takes a sequence of keywords or strings. | (defn stringify-format
[formats]
(when-not (or (nil? formats) (empty? formats))
(cstr/join ":" (map name formats)))) |
Parses genotype (GT) format and returns a vector of pairs: [allele, phased]. Allele 0 indicates REF allele. 1,2,3... for 1st, 2nd, 3rd allele of ALT. | (defn parse-genotype
[^String gt]
(when-not (dot-or-nil? gt)
(->> gt
(re-seq #"([\||/])?(.|\d+)")
(map (fn [[_ phase ^String allele]]
[(when-not (dot-or-nil? allele) (Integer/parseInt allele))
(if phase
(= phase "|")
(neg? (.indexOf ^String gt "/")))]))))) |
(defn- stringify-allele [indicator-needed? [allele phase]]
[(if indicator-needed?
(if phase "|" "/"))
(or allele \.)]) | |
Stringifies genotype map into VCF-style GT string. The first phasing indicator will be omitted unless necessary. | (defn stringify-genotype
[gt-seq]
(when-let [[[_ phase :as a1] & as] (seq gt-seq)]
(->> as
(into (stringify-allele (not= phase (every? second as)) a1)
(mapcat (partial stringify-allele true)))
(apply str)))) |
Returns a sequence of genotypes represented as sequences of integers. | (defn genotype-seq
([^long ploidy ^long n-alt-alleles]
(genotype-seq ploidy n-alt-alleles []))
([^long ploidy ^long n-alt-alleles s]
(mapcat
(fn [i]
(if (= 1 ploidy)
[(cons i s)]
(genotype-seq (dec ploidy) i (cons i s))))
(range (inc n-alt-alleles))))) |
Returns an index for given genotype. | (defn genotype-index
^long [genotype]
{:pre [(seq genotype)
(every? (complement neg?) genotype)]}
(letfn [(combination ^long [^long n ^long k]
(cond
(< n k) 0
(= k n) 1
(or (= k 1) (= k (dec n))) n
(< (quot n 2) k) (recur n (- n k))
:else (loop [i (inc (- n k)), j 1, r 1]
(if (<= j k)
(recur (inc i) (inc j) (quot (* r i) j))
r))))]
(->> genotype
sort
(map-indexed (fn [^long m ^long k] (combination (+ k m) (inc m))))
(apply +)))) |
Takes a parsed genotype (i.e. a vector of pairs [allele, phased]) and returns the parsed genotype with each allele unphased. | (defn unphase-genotype
[gt-seq]
(some->> (seq gt-seq)
(map first)
sort
(#(map vector % (repeat false))))) |
Converts a multiallelic | (defn biallelic-genotype
[genotype ^long target-allele]
(->> genotype
parse-genotype
(map (fn [[allele phased?]]
[(when allele (if (= target-allele allele) 1 0)) phased?]))
stringify-genotype)) |
Picks up elements in multiallelic | (defn biallelic-coll
[^long ploidy ^long n-alt-alleles ^long target-allele coll]
(keep-indexed
(fn [i gt]
(when (every? (fn [^long a] (or (= a 0) (= a target-allele))) gt)
(nth coll i)))
(genotype-seq ploidy n-alt-alleles))) |
Converts genotype to a sequence of integers. | (defn genotype->ints
[gt]
(let [parsed-gt (or (cond-> gt (string? gt) parse-genotype) [[nil]])]
(->> (assoc-in (vec parsed-gt) [0 1] false)
(map (fn [[^long allele phased]]
(bit-or
(bit-shift-left (if allele (inc allele) 0) 1)
(if phased 1 0))))))) |
Converts a sequence of integers to genotype string. | (defn ints->genotype
[vs]
(->> vs
(mapcat (fn [^long i]
[(if (odd? i) \| \/)
(if (zero? (quot i 2)) "." (dec (quot i 2)))]))
rest
(apply str)
(#(when-not (dot-or-nil? ^String %) %)))) |
Returns a parser function defined by meta-formats. info must be a sequence of map containing keys :id, :number and :type. The parser takes two string (format-line and sample-line) and returns a map. | (defn sample-parser
[formats-meta]
(let [parser-map (into {} (map meta->parser) formats-meta)]
(fn [^String format-line sample-line]
(when-not (dot-or-nil? format-line)
(let [ks (cstr/split format-line #":")
vs (concat
(when (not-empty sample-line)
(cstr/split sample-line #":"))
(repeat nil))]
(into
{}
(map (fn [[k ^String v]]
[(keyword k) (when-not (dot-or-nil? v) ((parser-map k) v))]))
(map vector ks vs))))))) |
Converts sample map into a string. formats must be a sequence of keys in sample-map. | (defn stringify-sample
[formats sample-map]
(->> formats
(map (fn [k] [k (get sample-map k)]))
reverse
(drop-while (fn [[_ v]] (or (nil? v) (= [nil] v))))
(map (fn [[_ v]]
(cond
(sequential? v) (cstr/join "," (map (fn [i] (if (nil? i) "." i)) v))
(nil? v) "."
:else v)))
reverse
(cstr/join ":")
not-empty)) |
Returns a parser function to parse :filter, :info, :FORMAT and sample columns of VCF. Takes meta-info and header of VCF. The parser takes a variant map and returns a parsed map. | (defn variant-parser
[meta-info header]
(let [[fmt-kw & sample-kws] (mapv keyword (drop 8 header))
parse-info (info-parser (:info meta-info))
parse-sample (sample-parser (:format meta-info))]
(fn [v]
(cond-> v
true (update :filter parse-filter)
true (update :info parse-info)
fmt-kw (update fmt-kw parse-format)
sample-kws (into (for [k sample-kws] [k (parse-sample (v fmt-kw) (v k))])))))) |
Returns a stringifier function to stringify :filter, :info, :FORMAT and sample columns of VCF. Takes meta-info and header of VCF. The stringifier takes a parsed variant map and returns a map. | (defn variant-vals-stringifier
[meta-info header]
(let [[fmt-kw & sample-kws] (mapv keyword (drop 8 header))
stringify-info (info-stringifier (:info meta-info))]
(fn [v]
(-> v
(update :filter stringify-filter)
(update :info stringify-info)
(update fmt-kw stringify-format)
(merge (into {} (for [k sample-kws] [k (stringify-sample (v fmt-kw) (v k))]))))))) |
(def ^:private ^:const long-breakend-regexp ;; pre-seq [ or ] chr pos [ or ] post-seq #"([ACGTN]*|\.)([\[\]])(.+?)(?::(\d+))([\[\]])([ACGTN]*|\.)") | |
(def ^:private ^:const short-breakend-regexp #"(\.?)([ATGCN]+)(\.?)") | |
Parses an ALT allele string of SVTYPE=BND.
Returns a map with mandatory keys:
- | (defn parse-breakend
[alt]
(if-let [[_ pre-seq [left-bracket] chr pos [right-bracket] post-seq]
(re-matches long-breakend-regexp alt)]
(let [pre (not-empty pre-seq)
post (not-empty post-seq)]
(when (and (= left-bracket right-bracket)
(or pre post)
(not (and pre post)))
(when-let [p (p/as-long pos)]
{:chr chr,
:pos p,
:join (if post :before :after),
:bases (or pre post),
:strand (if (= left-bracket \])
(if post :forward :reverse)
(if post :reverse :forward))})))
(when-let [[_ [pre-dot] bases' [post-dot]]
(re-matches short-breakend-regexp alt)]
(when (and (or pre-dot post-dot)
(not (and pre-dot post-dot)))
{:bases bases', :join (if pre-dot :before :after)})))) |
Returns a string representation of a breakend. If the input is malformed,
returns | (defn stringify-breakend
[{:keys [chr pos strand join] s :bases}]
(when (and (not-empty s) (#{:before :after} join))
(if (and chr pos (#{:forward :reverse} strand))
(let [before? (= join :before)
bracket (if (= strand :forward)
(if before? \] \[)
(if before? \[ \]))]
(str (when-not before? s) bracket chr \: pos bracket (when before? s)))
(str (when (= :before join) \.) s (when (= :after join) \.))))) |
(defn- inspect-nucleotides-allele
[ref' alt]
(let [ref-length (count ref')
alt-length (count alt)
upper-ref (cstr/upper-case ref')
upper-alt (cstr/upper-case alt)
left-match (->> (map = upper-ref upper-alt)
(take-while true?)
count)
right-match (->> (cstr/reverse upper-alt)
(map = (cstr/reverse upper-ref))
(take-while true?)
count
long
(Math/min (- (Math/min ref-length alt-length)
left-match)))
matched-length (+ left-match right-match)]
(cond
(= left-match ref-length alt-length)
{:type :ref}
(and (= left-match ref-length) (< left-match alt-length))
{:type :insertion, :offset (dec left-match),
:n-bases (- alt-length left-match), :inserted (subs alt left-match)}
(and (< left-match ref-length) (= left-match alt-length))
{:type :deletion, :offset (dec left-match),
:n-bases (- ref-length left-match), :deleted (subs ref' left-match)}
(= (inc matched-length) ref-length alt-length)
{:type :snv, :ref (nth ref' left-match),
:alt (nth alt left-match), :offset left-match}
(= matched-length alt-length)
{:type :deletion, :offset (dec left-match),
:n-bases (- ref-length right-match left-match),
:deleted (subs ref' left-match (- ref-length right-match))}
(= matched-length ref-length)
{:type :insertion, :offset (dec left-match),
:n-bases (- alt-length right-match left-match),
:inserted (subs alt left-match (- alt-length right-match))}
(= ref-length alt-length)
{:type :mnv, :offset left-match,
:ref (subs ref' left-match (- ref-length right-match)),
:alt (subs alt left-match (- alt-length right-match))}
:else {:type :complex}))) | |
Inspects an | (defn inspect-allele
[ref' alt]
(or
(when (re-matches #"(?i)[ACGTN]+" (or ref' ""))
(condp re-matches (or (not-empty alt) ".")
#"\." {:type :no-call}
#"\*" {:type :spanning-deletion}
#"(X|<\*>|<X>)" {:type :unspecified}
#"<(.+)>" :>> (fn [[_ id]] {:type :id, :id id})
#"(?i)([ACGTN])<(.+)>" :>> (fn [[_ [base] id]]
{:type :complete-insertion,
:join :after, :base base, :id id})
#"(?i)<(.+)>([ACGTN])" :>> (fn [[_ id [base]]]
{:type :complete-insertion,
:join :before, :base base, :id id})
#"(?i)[ACGTN]+" (inspect-nucleotides-allele ref' alt)
(some-> (parse-breakend alt) (assoc :type :breakend))))
{:type :other})) |