(ns cljam.io.vcf.util.normalize (:require [clojure.string :as cstr] [cljam.io.vcf.util :as vcf-util] [cljam.io.sequence :as io-seq])) | |
(defn- biallelic-map [m id->number ^long i ^long n-allele] (let [ploidy (or (some-> m :GT vcf-util/parse-genotype count) 2)] (into {} (map (fn [[k v]] (let [n (id->number k)] [k (cond (= k :GT) (vcf-util/biallelic-genotype v (inc i)) (= n "A") [(nth v i)] (= n "R") [(first v) (nth v (inc i))] (= n "G") (vcf-util/biallelic-coll ploidy n-allele (inc i) v) :else v)]))) m))) | |
(defn- get-id->number [coll] (into {} (map (juxt (comp keyword :id) :number)) coll)) | |
Returns a function that splits a multiallelic variant into a sequence of biallelic variants. | (defn make-splitter [{:keys [info] format' :format} header] (let [info-id->number (get-id->number info) format-id->number (get-id->number format') samples (map keyword (drop 9 header))] (fn split [{:keys [alt] :as v}] (let [n (count alt)] (if (<= n 1) [v] (->> alt (map-indexed (fn sample-ith-alt [i allele] (-> v (assoc :alt [allele]) (update :info biallelic-map info-id->number i n) (into (map (fn [s] [s (biallelic-map (v s) format-id->number i n)])) samples)))))))))) |
(defn- regions-before [chr ^long pos ^long window] (->> (range pos 1 (- window)) (map (fn [^long e] {:chr chr, :start (Math/max 1 (- e window)), :end (dec e)})))) | |
(defn- char-equals-ignore-case? ([_] true) ([x y] (= (Character/toUpperCase ^char x) (Character/toUpperCase ^char y))) ([x y & z] (and (char-equals-ignore-case? x y) (apply char-equals-ignore-case? y z)))) | |
Trims all duplicated allele sequences from right. If reached the start,
reference sequences of length | (defn- trim-right [seq-reader {:keys [chr ^long pos alt] ref' :ref {:keys [END]} :info :as v} ^long window] (let [alleles (cons ref' alt) min-end (dec (+ pos (long (apply min (map count alleles))))) regs (regions-before chr pos window) ref-seqs (sequence ;; unchunk (mapcat (fn [r] (-> seq-reader (io-seq/read-sequence r {:mask? false}) cstr/reverse))) regs) allele-seqs (map #(concat (cstr/reverse %) ref-seqs) alleles) matched-length (->> allele-seqs (apply map char-equals-ignore-case?) (take-while true?) count long (Math/min (dec min-end)))] (if (pos? matched-length) (let [pos' (-> (some (fn [{:keys [start end]}] (when (<= start (- min-end matched-length) end) start)) regs) (or pos) long) [ref'' & alt'] (map #(->> %2 (take (inc (- (dec (+ pos (count %1))) pos'))) (drop matched-length) reverse cstr/join) alleles allele-seqs)] (cond-> (assoc v :pos pos' :ref ref'' :alt alt') (integer? END) (assoc-in [:info :END] (dec (+ pos' (count ref'')))))) v))) |
Trims all duplicated allele sequences from left. | (defn- trim-left [{:keys [alt] ref' :ref :as v}] (let [alleles (cons ref' alt) n-trim-left (->> alleles (apply map char-equals-ignore-case?) (take-while true?) count) min-len (long (apply min (map count alleles))) n-trim-left (min (dec min-len) n-trim-left)] (if (pos? n-trim-left) (-> v (update :pos + n-trim-left) (update :ref subs n-trim-left) (assoc :alt (map #(subs % n-trim-left) alt))) v))) |
Left-align and trim REF/ALT alleles. | (defn realign ([seq-reader variant] (realign seq-reader variant {})) ([seq-reader {:keys [alt] ref' :ref :as variant} {:keys [window] :or {window 100}}] (cond (some->> (seq alt) (every? (comp #{:snv :mnv :insertion :deletion :complex} :type (partial vcf-util/inspect-allele ref')))) (trim-left (trim-right seq-reader variant window)) (and ref' (nil? (seq alt))) (cond-> (update variant :ref subs 0 1) (integer? (:END (:info variant))) (assoc-in [:info :END] (:pos variant))) :else variant))) |
Splits multiallelic variants and realigns all alleles. Returns a lazy
sequence of normalized variants. Note that the result may be unsorted due to
the realignment. Returns a transducer when no variant is provided. The
transducer does not guarantee thread safety of | (defn normalize ([ref-reader meta-info header] (comp (mapcat (make-splitter meta-info header)) (map (partial realign ref-reader)))) ([ref-reader meta-info header variants] (sequence (normalize ref-reader meta-info header) variants))) |