(ns cljam.io.vcf.util (:require [clojure.string :as cstr] [proton.core :as p])) | |
Checks if given string is equal to "." or nil. | (definline dot-or-nil? [^String s] `(or (nil? ~s) (empty? ~s) (and (= 1 (.length ~s)) (= \. (.charAt ~s 0))))) |
(defn- parse-float [^String s] (let [m (re-matcher #"(?i)[+-]?(nan|inf(?:inity)?)" s)] (if (.matches m) (case (.charAt (.group m 1) 0) (\n \N) Float/NaN (\i \I) (if (= (.charAt s 0) \-) Float/NEGATIVE_INFINITY Float/POSITIVE_INFINITY)) (Float/parseFloat s)))) | |
Returns a parser function for given type identifier. | (defn- value-parser [type'] (case type' "Flag" (constantly :exists) "Integer" #(when-not (dot-or-nil? ^String %) (Integer/parseInt %)) "Float" #(when-not (dot-or-nil? ^String %) (parse-float %)) "Character" #(when-not (dot-or-nil? ^String %) (first %)) "String" #(when-not (dot-or-nil? ^String %) %))) |
Creates a key-value paired vector of 'id' and its parser. | (defn- meta->parser [{:keys [id number] type' :type}] (let [p (value-parser type')] [id (comp (if (or (= number 1) (= number 0)) first identity) (fn [x] (map p (cstr/split (or x "") #","))))])) |
Returns a parser function defined by meta-info. info-meta must be a sequence of map containing keys :id, :number and :type. The parser takes a string and returns a map. | (defn info-parser [info-meta] (let [parser-map (into {} (map meta->parser) info-meta)] (fn [^String s] (when-not (dot-or-nil? s) (->> (cstr/split s #";") (into {} (map (fn [ss] (let [[k vs] (cstr/split ss #"\=" 2)] (if-let [parser (parser-map k)] [(keyword k) (parser vs)] (throw (ex-info (str "Undeclared INFO, " k ".") {:info ss})))))))))))) |
Returns a stringifier function of INFO field. Takes a vector of maps from info rows of meta-info. The stringifier takes a map and returns a string. | (defn info-stringifier [info-meta] (let [id-ordered (mapv :id info-meta) info-type (into {} (map (juxt :id :type)) info-meta)] (fn [info] (when info (->> id-ordered (keep (fn [k] (when-let [v (info (keyword k))] (if (or (= v :exists) (= (info-type k) "Flag")) k (str k "=" (if (sequential? v) (cstr/join "," v) v)))))) (cstr/join ";") not-empty))))) |
Parses FILTER field and returns a sequence of keywords. | (defn parse-filter [^String s] (when-not (dot-or-nil? s) (map keyword (cstr/split s #";")))) |
Stringifies FILTER field. Takes a sequence of keywords or strings. | (defn stringify-filter [fltr] (when (and (some? fltr) (some some? fltr)) (cstr/join ";" (map name fltr)))) |
Parses FORMAT field and returns as a sequence of keywords. | (defn parse-format [^String s] (when-not (dot-or-nil? s) (map keyword (cstr/split s #":")))) |
Stringifies FORMAT field. Takes a sequence of keywords or strings. | (defn stringify-format [formats] (when-not (or (nil? formats) (empty? formats)) (cstr/join ":" (map name formats)))) |
Parses genotype (GT) format and returns a vector of pairs: [allele, phased]. Allele 0 indicates REF allele. 1,2,3... for 1st, 2nd, 3rd allele of ALT. | (defn parse-genotype [^String gt] (when-not (dot-or-nil? gt) (->> gt (re-seq #"([\||/])?(.|\d+)") (map (fn [[_ phase ^String allele]] [(when-not (dot-or-nil? allele) (Integer/parseInt allele)) (if phase (= phase "|") (neg? (.indexOf ^String gt "/")))]))))) |
(defn- stringify-allele [indicator-needed? [allele phase]] [(if indicator-needed? (if phase "|" "/")) (or allele \.)]) | |
Stringifies genotype map into VCF-style GT string. The first phasing indicator will be omitted unless necessary. | (defn stringify-genotype [gt-seq] (when-let [[[_ phase :as a1] & as] (seq gt-seq)] (->> as (into (stringify-allele (not= phase (every? second as)) a1) (mapcat (partial stringify-allele true))) (apply str)))) |
Returns a sequence of genotypes represented as sequences of integers. | (defn genotype-seq ([^long ploidy ^long n-alt-alleles] (genotype-seq ploidy n-alt-alleles [])) ([^long ploidy ^long n-alt-alleles s] (mapcat (fn [i] (if (= 1 ploidy) [(cons i s)] (genotype-seq (dec ploidy) i (cons i s)))) (range (inc n-alt-alleles))))) |
Returns an index for given genotype. | (defn genotype-index ^long [genotype] {:pre [(seq genotype) (every? (complement neg?) genotype)]} (letfn [(combination ^long [^long n ^long k] (cond (< n k) 0 (= k n) 1 (or (= k 1) (= k (dec n))) n (< (quot n 2) k) (recur n (- n k)) :else (loop [i (inc (- n k)), j 1, r 1] (if (<= j k) (recur (inc i) (inc j) (quot (* r i) j)) r))))] (->> genotype sort (map-indexed (fn [^long m ^long k] (combination (+ k m) (inc m)))) (apply +)))) |
Takes a parsed genotype (i.e. a vector of pairs [allele, phased]) and returns the parsed genotype with each allele unphased. | (defn unphase-genotype [gt-seq] (some->> (seq gt-seq) (map first) sort (#(map vector % (repeat false))))) |
Converts a multiallelic | (defn biallelic-genotype [genotype ^long target-allele] (->> genotype parse-genotype (map (fn [[allele phased?]] [(when allele (if (= target-allele allele) 1 0)) phased?])) stringify-genotype)) |
Picks up elements in multiallelic | (defn biallelic-coll [^long ploidy ^long n-alt-alleles ^long target-allele coll] (keep-indexed (fn [i gt] (when (every? (fn [^long a] (or (= a 0) (= a target-allele))) gt) (nth coll i))) (genotype-seq ploidy n-alt-alleles))) |
Converts genotype to a sequence of integers. | (defn genotype->ints [gt] (let [parsed-gt (or (cond-> gt (string? gt) parse-genotype) [[nil]])] (->> (assoc-in (vec parsed-gt) [0 1] false) (map (fn [[^long allele phased]] (bit-or (bit-shift-left (if allele (inc allele) 0) 1) (if phased 1 0))))))) |
Converts a sequence of integers to genotype string. | (defn ints->genotype [vs] (->> vs (mapcat (fn [^long i] [(if (odd? i) \| \/) (if (zero? (quot i 2)) "." (dec (quot i 2)))])) rest (apply str) (#(when-not (dot-or-nil? ^String %) %)))) |
Returns a parser function defined by meta-formats. info must be a sequence of map containing keys :id, :number and :type. The parser takes two string (format-line and sample-line) and returns a map. | (defn sample-parser [formats-meta] (let [parser-map (into {} (map meta->parser) formats-meta)] (fn [^String format-line sample-line] (when-not (dot-or-nil? format-line) (let [ks (cstr/split format-line #":") vs (concat (when (not-empty sample-line) (cstr/split sample-line #":")) (repeat nil))] (into {} (map (fn [[k ^String v]] [(keyword k) (when-not (dot-or-nil? v) ((parser-map k) v))])) (map vector ks vs))))))) |
Converts sample map into a string. formats must be a sequence of keys in sample-map. | (defn stringify-sample [formats sample-map] (->> formats (map (fn [k] [k (get sample-map k)])) reverse (drop-while (fn [[_ v]] (or (nil? v) (= [nil] v)))) (map (fn [[_ v]] (cond (sequential? v) (cstr/join "," (map (fn [i] (if (nil? i) "." i)) v)) (nil? v) "." :else v))) reverse (cstr/join ":") not-empty)) |
Returns a parser function to parse :filter, :info, :FORMAT and sample columns of VCF. Takes meta-info and header of VCF. The parser takes a variant map and returns a parsed map. | (defn variant-parser [meta-info header] (let [[fmt-kw & sample-kws] (mapv keyword (drop 8 header)) parse-info (info-parser (:info meta-info)) parse-sample (sample-parser (:format meta-info))] (fn [v] (cond-> v true (update :filter parse-filter) true (update :info parse-info) fmt-kw (update fmt-kw parse-format) sample-kws (into (for [k sample-kws] [k (parse-sample (v fmt-kw) (v k))])))))) |
Returns a stringifier function to stringify :filter, :info, :FORMAT and sample columns of VCF. Takes meta-info and header of VCF. The stringifier takes a parsed variant map and returns a map. | (defn variant-vals-stringifier [meta-info header] (let [[fmt-kw & sample-kws] (mapv keyword (drop 8 header)) stringify-info (info-stringifier (:info meta-info))] (fn [v] (-> v (update :filter stringify-filter) (update :info stringify-info) (update fmt-kw stringify-format) (merge (into {} (for [k sample-kws] [k (stringify-sample (v fmt-kw) (v k))]))))))) |
(def ^:private ^:const long-breakend-regexp ;; pre-seq [ or ] chr pos [ or ] post-seq #"([ACGTN]*|\.)([\[\]])(.+?)(?::(\d+))([\[\]])([ACGTN]*|\.)") | |
(def ^:private ^:const short-breakend-regexp #"(\.?)([ATGCN]+)(\.?)") | |
Parses an ALT allele string of SVTYPE=BND.
Returns a map with mandatory keys:
- | (defn parse-breakend [alt] (if-let [[_ pre-seq [left-bracket] chr pos [right-bracket] post-seq] (re-matches long-breakend-regexp alt)] (let [pre (not-empty pre-seq) post (not-empty post-seq)] (when (and (= left-bracket right-bracket) (or pre post) (not (and pre post))) (when-let [p (p/as-long pos)] {:chr chr, :pos p, :join (if post :before :after), :bases (or pre post), :strand (if (= left-bracket \]) (if post :forward :reverse) (if post :reverse :forward))}))) (when-let [[_ [pre-dot] bases' [post-dot]] (re-matches short-breakend-regexp alt)] (when (and (or pre-dot post-dot) (not (and pre-dot post-dot))) {:bases bases', :join (if pre-dot :before :after)})))) |
Returns a string representation of a breakend. If the input is malformed,
returns | (defn stringify-breakend [{:keys [chr pos strand join] s :bases}] (when (and (not-empty s) (#{:before :after} join)) (if (and chr pos (#{:forward :reverse} strand)) (let [before? (= join :before) bracket (if (= strand :forward) (if before? \] \[) (if before? \[ \]))] (str (when-not before? s) bracket chr \: pos bracket (when before? s))) (str (when (= :before join) \.) s (when (= :after join) \.))))) |
(defn- inspect-nucleotides-allele [ref' alt] (let [ref-length (count ref') alt-length (count alt) upper-ref (cstr/upper-case ref') upper-alt (cstr/upper-case alt) left-match (->> (map = upper-ref upper-alt) (take-while true?) count) right-match (->> (cstr/reverse upper-alt) (map = (cstr/reverse upper-ref)) (take-while true?) count long (Math/min (- (Math/min ref-length alt-length) left-match))) matched-length (+ left-match right-match)] (cond (= left-match ref-length alt-length) {:type :ref} (and (= left-match ref-length) (< left-match alt-length)) {:type :insertion, :offset (dec left-match), :n-bases (- alt-length left-match), :inserted (subs alt left-match)} (and (< left-match ref-length) (= left-match alt-length)) {:type :deletion, :offset (dec left-match), :n-bases (- ref-length left-match), :deleted (subs ref' left-match)} (= (inc matched-length) ref-length alt-length) {:type :snv, :ref (nth ref' left-match), :alt (nth alt left-match), :offset left-match} (= matched-length alt-length) {:type :deletion, :offset (dec left-match), :n-bases (- ref-length right-match left-match), :deleted (subs ref' left-match (- ref-length right-match))} (= matched-length ref-length) {:type :insertion, :offset (dec left-match), :n-bases (- alt-length right-match left-match), :inserted (subs alt left-match (- alt-length right-match))} (= ref-length alt-length) {:type :mnv, :offset left-match, :ref (subs ref' left-match (- ref-length right-match)), :alt (subs alt left-match (- alt-length right-match))} :else {:type :complex}))) | |
Inspects an | (defn inspect-allele [ref' alt] (or (when (re-matches #"(?i)[ACGTN]+" (or ref' "")) (condp re-matches (or (not-empty alt) ".") #"\." {:type :no-call} #"\*" {:type :spanning-deletion} #"(X|<\*>|<X>)" {:type :unspecified} #"<(.+)>" :>> (fn [[_ id]] {:type :id, :id id}) #"(?i)([ACGTN])<(.+)>" :>> (fn [[_ [base] id]] {:type :complete-insertion, :join :after, :base base, :id id}) #"(?i)<(.+)>([ACGTN])" :>> (fn [[_ id [base]]] {:type :complete-insertion, :join :before, :base base, :id id}) #"(?i)[ACGTN]+" (inspect-nucleotides-allele ref' alt) (some-> (parse-breakend alt) (assoc :type :breakend)))) {:type :other})) |