(ns cljam.io.vcf.util.normalize
(:require [clojure.string :as cstr]
[cljam.io.vcf.util :as vcf-util]
[cljam.io.sequence :as io-seq])) | |
(defn- biallelic-map
[m id->number ^long i ^long n-allele]
(let [ploidy (or (some-> m :GT vcf-util/parse-genotype count) 2)]
(into
{}
(map
(fn [[k v]]
(let [n (id->number k)]
[k (cond
(= k :GT) (vcf-util/biallelic-genotype v (inc i))
(= n "A") [(nth v i)]
(= n "R") [(first v) (nth v (inc i))]
(= n "G") (vcf-util/biallelic-coll ploidy n-allele (inc i) v)
:else v)])))
m))) | |
(defn- get-id->number
[coll]
(into {} (map (juxt (comp keyword :id) :number)) coll)) | |
Returns a function that splits a multiallelic variant into a sequence of biallelic variants. | (defn make-splitter
[{:keys [info]
format' :format} header]
(let [info-id->number (get-id->number info)
format-id->number (get-id->number format')
samples (map keyword (drop 9 header))]
(fn split [{:keys [alt] :as v}]
(let [n (count alt)]
(if (<= n 1)
[v]
(->> alt
(map-indexed
(fn sample-ith-alt [i allele]
(-> v
(assoc :alt [allele])
(update :info biallelic-map info-id->number i n)
(into
(map
(fn [s]
[s (biallelic-map (v s) format-id->number i n)]))
samples)))))))))) |
(defn- regions-before [chr ^long pos ^long window]
(->> (range pos 1 (- window))
(map
(fn [^long e]
{:chr chr,
:start (Math/max 1 (- e window)),
:end (dec e)})))) | |
(defn- char-equals-ignore-case?
([_]
true)
([x y]
(= (Character/toUpperCase ^char x)
(Character/toUpperCase ^char y)))
([x y & z]
(and (char-equals-ignore-case? x y)
(apply char-equals-ignore-case? y z)))) | |
Trims all duplicated allele sequences from right. If reached the start,
reference sequences of length | (defn- trim-right
[seq-reader {:keys [chr ^long pos alt]
ref' :ref
{:keys [END]} :info :as v} ^long window]
(let [alleles (cons ref' alt)
min-end (dec (+ pos (long (apply min (map count alleles)))))
regs (regions-before chr pos window)
ref-seqs (sequence ;; unchunk
(mapcat
(fn [r]
(-> seq-reader
(io-seq/read-sequence r {:mask? false})
cstr/reverse)))
regs)
allele-seqs (map #(concat (cstr/reverse %) ref-seqs) alleles)
matched-length (->> allele-seqs
(apply map char-equals-ignore-case?)
(take-while true?)
count
long
(Math/min (dec min-end)))]
(if (pos? matched-length)
(let [pos' (-> (some (fn [{:keys [start end]}]
(when (<= start (- min-end matched-length) end)
start)) regs)
(or pos)
long)
[ref'' & alt'] (map
#(->> %2
(take (inc (- (dec (+ pos (count %1))) pos')))
(drop matched-length)
reverse
cstr/join)
alleles
allele-seqs)]
(cond-> (assoc v :pos pos' :ref ref'' :alt alt')
(integer? END) (assoc-in [:info :END] (dec (+ pos' (count ref''))))))
v))) |
Trims all duplicated allele sequences from left. | (defn- trim-left
[{:keys [alt] ref' :ref :as v}]
(let [alleles (cons ref' alt)
n-trim-left (->> alleles
(apply map char-equals-ignore-case?)
(take-while true?)
count)
min-len (long (apply min (map count alleles)))
n-trim-left (min (dec min-len) n-trim-left)]
(if (pos? n-trim-left)
(-> v
(update :pos + n-trim-left)
(update :ref subs n-trim-left)
(assoc :alt (map #(subs % n-trim-left) alt)))
v))) |
Left-align and trim REF/ALT alleles. | (defn realign
([seq-reader variant]
(realign seq-reader variant {}))
([seq-reader
{:keys [alt]
ref' :ref
:as variant}
{:keys [window] :or {window 100}}]
(cond
(some->> (seq alt)
(every? (comp #{:snv :mnv :insertion :deletion :complex}
:type
(partial vcf-util/inspect-allele ref'))))
(trim-left (trim-right seq-reader variant window))
(and ref' (nil? (seq alt)))
(cond-> (update variant :ref subs 0 1)
(integer? (:END (:info variant)))
(assoc-in [:info :END] (:pos variant)))
:else variant))) |
Splits multiallelic variants and realigns all alleles. Returns a lazy
sequence of normalized variants. Note that the result may be unsorted due to
the realignment. Returns a transducer when no variant is provided. The
transducer does not guarantee thread safety of | (defn normalize
([ref-reader meta-info header]
(comp
(mapcat (make-splitter meta-info header))
(map (partial realign ref-reader))))
([ref-reader meta-info header variants]
(sequence (normalize ref-reader meta-info header) variants))) |