(ns cljam.io.vcf.util
  (:require [clojure.string :as cstr]
            [proton.core :as p]))

Checks if given string is equal to "." or nil.

(definline dot-or-nil?
  [^String s]
  `(or (nil? ~s)
       (empty? ~s)
       (and (= 1 (.length ~s))
            (= \. (.charAt ~s 0)))))
(defn- parse-float [^String s]
  (let [m (re-matcher #"(?i)[+-]?(nan|inf(?:inity)?)" s)]
    (if (.matches m)
      (case (.charAt (.group m 1) 0)
        (\n \N) Float/NaN
        (\i \I) (if (= (.charAt s 0) \-)
                  Float/NEGATIVE_INFINITY
                  Float/POSITIVE_INFINITY))
      (Float/parseFloat s))))

Returns a parser function for given type identifier.

(defn- value-parser
  [type']
  (case type'
    "Flag" (constantly :exists)
    "Integer" #(when-not (dot-or-nil? ^String %) (Integer/parseInt %))
    "Float" #(when-not (dot-or-nil? ^String %) (parse-float %))
    "Character" #(when-not (dot-or-nil? ^String %) (first %))
    "String" #(when-not (dot-or-nil? ^String %) %)))

Creates a key-value paired vector of 'id' and its parser.

(defn- meta->parser
  [{:keys [id number] type' :type}]
  (let [p (value-parser type')]
    [id (comp (if (or (= number 1) (= number 0)) first identity)
              (fn [x] (map p (cstr/split (or x "") #","))))]))

Returns a parser function defined by meta-info. info-meta must be a sequence of map containing keys :id, :number and :type. The parser takes a string and returns a map.

(defn info-parser
  [info-meta]
  (let [parser-map (into {} (map meta->parser) info-meta)]
    (fn [^String s]
      (when-not (dot-or-nil? s)
        (->> (cstr/split s #";")
             (into
              {}
              (map (fn [ss]
                     (let [[k vs] (cstr/split ss #"\=" 2)]
                       (if-let [parser (parser-map k)]
                         [(keyword k) (parser vs)]
                         (throw (ex-info (str "Undeclared INFO, " k ".")
                                         {:info ss}))))))))))))

Returns a stringifier function of INFO field. Takes a vector of maps from info rows of meta-info. The stringifier takes a map and returns a string.

(defn info-stringifier
  [info-meta]
  (let [id-ordered (mapv :id info-meta)
        info-type (into {} (map (juxt :id :type)) info-meta)]
    (fn [info]
      (when info
        (->> id-ordered
             (keep
              (fn [k]
                (when-let [v (info (keyword k))]
                  (if (or (= v :exists) (= (info-type k) "Flag"))
                    k
                    (str k "=" (if (sequential? v) (cstr/join "," v) v))))))
             (cstr/join ";")
             not-empty)))))

Parses FILTER field and returns a sequence of keywords.

(defn parse-filter
  [^String s]
  (when-not (dot-or-nil? s)
    (map keyword (cstr/split s #";"))))

Stringifies FILTER field. Takes a sequence of keywords or strings.

(defn stringify-filter
  [fltr]
  (when (and (some? fltr) (some some? fltr))
    (cstr/join ";" (map name fltr))))

Parses FORMAT field and returns as a sequence of keywords.

(defn parse-format
  [^String s]
  (when-not (dot-or-nil? s)
    (map keyword (cstr/split s #":"))))

Stringifies FORMAT field. Takes a sequence of keywords or strings.

(defn stringify-format
  [formats]
  (when-not (or (nil? formats) (empty? formats))
    (cstr/join ":" (map name formats))))

Parses genotype (GT) format and returns a vector of pairs: [allele, phased]. Allele 0 indicates REF allele. 1,2,3... for 1st, 2nd, 3rd allele of ALT.

(defn parse-genotype
  [^String gt]
  (when-not (dot-or-nil? gt)
    (->> gt
         (re-seq #"([\||/])?(.|\d+)")
         (map (fn [[_ phase ^String allele]]
                [(when-not (dot-or-nil? allele) (Integer/parseInt allele))
                 (if phase
                   (= phase "|")
                   (neg? (.indexOf ^String gt "/")))])))))
(defn- stringify-allele [indicator-needed? [allele phase]]
  [(if indicator-needed?
     (if phase "|" "/"))
   (or allele \.)])

Stringifies genotype map into VCF-style GT string. The first phasing indicator will be omitted unless necessary.

(defn stringify-genotype
  [gt-seq]
  (when-let [[[_ phase :as a1] & as] (seq gt-seq)]
    (->> as
         (into (stringify-allele (not= phase (every? second as)) a1)
               (mapcat (partial stringify-allele true)))
         (apply str))))

Returns a sequence of genotypes represented as sequences of integers.

(defn genotype-seq
  ([^long ploidy ^long n-alt-alleles]
   (genotype-seq ploidy n-alt-alleles []))
  ([^long ploidy ^long n-alt-alleles s]
   (mapcat
    (fn [i]
      (if (= 1 ploidy)
        [(cons i s)]
        (genotype-seq (dec ploidy) i (cons i s))))
    (range (inc n-alt-alleles)))))

Returns an index for given genotype.

(defn genotype-index
  ^long [genotype]
  {:pre [(seq genotype)
         (every? (complement neg?) genotype)]}
  (letfn [(combination ^long [^long n ^long k]
            (cond
              (< n k) 0
              (= k n) 1
              (or (= k 1) (= k (dec n))) n
              (< (quot n 2) k) (recur n (- n k))
              :else (loop [i (inc (- n k)), j 1, r 1]
                      (if (<= j k)
                        (recur (inc i) (inc j) (quot (* r i) j))
                        r))))]
    (->> genotype
         sort
         (map-indexed (fn [^long m ^long k] (combination (+ k m) (inc m))))
         (apply +))))

Takes a parsed genotype (i.e. a vector of pairs [allele, phased]) and returns the parsed genotype with each allele unphased.

(defn unphase-genotype
  [gt-seq]
  (some->> (seq gt-seq)
           (map first)
           sort
           (#(map vector % (repeat false)))))

Converts a multiallelic genotype string into a biallelic one. Ignores all alleles other than the reference allele and the target-allele.

(defn biallelic-genotype
  [genotype ^long target-allele]
  (->> genotype
       parse-genotype
       (map (fn [[allele phased?]]
              [(when allele (if (= target-allele allele) 1 0)) phased?]))
       stringify-genotype))

Picks up elements in multiallelic coll and make a biallelic one. Ignores all alleles other than the reference allele and the target-allele.

(defn biallelic-coll
  [^long ploidy ^long n-alt-alleles ^long target-allele coll]
  (keep-indexed
   (fn [i gt]
     (when (every? (fn [^long a] (or (= a 0) (= a target-allele))) gt)
       (nth coll i)))
   (genotype-seq ploidy n-alt-alleles)))

Converts genotype to a sequence of integers.

(defn genotype->ints
  [gt]
  (let [parsed-gt (or (cond-> gt (string? gt) parse-genotype) [[nil]])]
    (->> (assoc-in (vec parsed-gt) [0 1] false)
         (map (fn [[^long allele phased]]
                (bit-or
                 (bit-shift-left (if allele (inc allele) 0) 1)
                 (if phased 1 0)))))))

Converts a sequence of integers to genotype string.

(defn ints->genotype
  [vs]
  (->> vs
       (mapcat (fn [^long i]
                 [(if (odd? i) \| \/)
                  (if (zero? (quot i 2)) "." (dec (quot i 2)))]))
       rest
       (apply str)
       (#(when-not (dot-or-nil? ^String %) %))))

Returns a parser function defined by meta-formats. info must be a sequence of map containing keys :id, :number and :type. The parser takes two string (format-line and sample-line) and returns a map.

(defn sample-parser
  [formats-meta]
  (let [parser-map (into {} (map meta->parser) formats-meta)]
    (fn [^String format-line sample-line]
      (when-not (dot-or-nil? format-line)
        (let [ks (cstr/split format-line #":")
              vs (concat
                  (when (not-empty sample-line)
                    (cstr/split sample-line #":"))
                  (repeat nil))]
          (into
           {}
           (map (fn [[k ^String v]]
                  [(keyword k) (when-not (dot-or-nil? v) ((parser-map k) v))]))
           (map vector ks vs)))))))

Converts sample map into a string. formats must be a sequence of keys in sample-map.

(defn stringify-sample
  [formats sample-map]
  (->> formats
       (map (fn [k] [k (get sample-map k)]))
       reverse
       (drop-while (fn [[_ v]] (or (nil? v) (= [nil] v))))
       (map (fn [[_ v]]
              (cond
                (sequential? v) (cstr/join "," (map (fn [i] (if (nil? i) "." i)) v))
                (nil? v) "."
                :else v)))
       reverse
       (cstr/join ":")
       not-empty))

Returns a parser function to parse :filter, :info, :FORMAT and sample columns of VCF. Takes meta-info and header of VCF. The parser takes a variant map and returns a parsed map.

(defn variant-parser
  [meta-info header]
  (let [[fmt-kw & sample-kws] (mapv keyword (drop 8 header))
        parse-info (info-parser (:info meta-info))
        parse-sample (sample-parser (:format meta-info))]
    (fn [v]
      (cond-> v
        true (update :filter parse-filter)
        true (update :info parse-info)
        fmt-kw (update fmt-kw parse-format)
        sample-kws (into (for [k sample-kws] [k (parse-sample (v fmt-kw) (v k))]))))))

Returns a stringifier function to stringify :filter, :info, :FORMAT and sample columns of VCF. Takes meta-info and header of VCF. The stringifier takes a parsed variant map and returns a map.

(defn variant-vals-stringifier
  [meta-info header]
  (let [[fmt-kw & sample-kws] (mapv keyword (drop 8 header))
        stringify-info (info-stringifier (:info meta-info))]
    (fn [v]
      (-> v
          (update :filter stringify-filter)
          (update :info stringify-info)
          (update fmt-kw stringify-format)
          (merge (into {} (for [k sample-kws] [k (stringify-sample (v fmt-kw) (v k))])))))))
(def ^:private ^:const long-breakend-regexp
  ;;   pre-seq    [ or ]  chr    pos     [ or ]    post-seq
  #"([ACGTN]*|\.)([\[\]])(.+?)(?::(\d+))([\[\]])([ACGTN]*|\.)")
(def ^:private ^:const short-breakend-regexp
  #"(\.?)([ATGCN]+)(\.?)")

Parses an ALT allele string of SVTYPE=BND. Returns a map with mandatory keys: - :bases bases that replaces the reference place - :join :before or :after and optional keys for a mate: - :chr chromosome name of the mate sequence - :pos genomic position of the mate sequence - :strand strand of the mate sequence Returns nil if input alt is not a breakend.

(defn parse-breakend
  [alt]
  (if-let [[_ pre-seq [left-bracket] chr pos [right-bracket] post-seq]
           (re-matches long-breakend-regexp alt)]
    (let [pre (not-empty pre-seq)
          post (not-empty post-seq)]
      (when (and (= left-bracket right-bracket)
                 (or pre post)
                 (not (and pre post)))
        (when-let [p (p/as-long pos)]
          {:chr chr,
           :pos p,
           :join (if post :before :after),
           :bases (or pre post),
           :strand (if (= left-bracket \])
                     (if post :forward :reverse)
                     (if post :reverse :forward))})))
    (when-let [[_ [pre-dot] bases' [post-dot]]
               (re-matches short-breakend-regexp alt)]
      (when (and (or pre-dot post-dot)
                 (not (and pre-dot post-dot)))
        {:bases bases', :join (if pre-dot :before :after)}))))

Returns a string representation of a breakend. If the input is malformed, returns nil. See the docstring of parse-breakend for the format.

(defn stringify-breakend
  [{:keys [chr pos strand join] s :bases}]
  (when (and (not-empty s) (#{:before :after} join))
    (if (and chr pos (#{:forward :reverse} strand))
      (let [before? (= join :before)
            bracket (if (= strand :forward)
                      (if before? \] \[)
                      (if before? \[ \]))]
        (str (when-not before? s) bracket chr \: pos bracket (when before? s)))
      (str (when (= :before join) \.) s (when (= :after join) \.)))))
(defn- inspect-nucleotides-allele
  [ref' alt]
  (let [ref-length (count ref')
        alt-length (count alt)
        upper-ref (cstr/upper-case ref')
        upper-alt (cstr/upper-case alt)
        left-match (->> (map = upper-ref upper-alt)
                        (take-while true?)
                        count)
        right-match (->> (cstr/reverse upper-alt)
                         (map = (cstr/reverse upper-ref))
                         (take-while true?)
                         count
                         long
                         (Math/min (- (Math/min ref-length alt-length)
                                      left-match)))
        matched-length (+ left-match right-match)]
    (cond
      (= left-match ref-length alt-length)
      {:type :ref}
      (and (= left-match ref-length) (< left-match alt-length))
      {:type :insertion, :offset (dec left-match),
       :n-bases (- alt-length left-match), :inserted (subs alt left-match)}
      (and (< left-match ref-length) (= left-match alt-length))
      {:type :deletion, :offset (dec left-match),
       :n-bases (- ref-length left-match), :deleted (subs ref' left-match)}
      (= (inc matched-length) ref-length alt-length)
      {:type :snv, :ref (nth ref' left-match),
       :alt (nth alt left-match), :offset left-match}
      (= matched-length alt-length)
      {:type :deletion, :offset (dec left-match),
       :n-bases (- ref-length right-match left-match),
       :deleted (subs ref' left-match (- ref-length right-match))}
      (= matched-length ref-length)
      {:type :insertion, :offset (dec left-match),
       :n-bases (- alt-length right-match left-match),
       :inserted (subs alt left-match (- alt-length right-match))}
      (= ref-length alt-length)
      {:type :mnv, :offset left-match,
       :ref (subs ref' left-match (- ref-length right-match)),
       :alt (subs alt left-match (- alt-length right-match))}
      :else {:type :complex})))

Inspects an alt allele by comparing to a ref allele string. Returns a map containing :type and other detailed information. A value of the key :type can be one of the followings: - :no-call No variant called - :spanning-deletion Placeholder to signify an absent sequence - :unspecified Unspecified non-ref allele - :ref Duplicated allele of REF - :id Symbolic reference - :snv Single nucleotide variant - :mnv Multiple nucleotide variants - :insertion Insertion of a short base sequence - :deletion Deletion of a short base sequence - :complete-insertion Complete insertion of a long sequence - :breakend Breakend of a complex rearrangement - :complex Complex nucleotide variants other than snv/mnv/indel - :other Can't categorize the allele, might be malformed

(defn inspect-allele
  [ref' alt]
  (or
   (when (re-matches #"(?i)[ACGTN]+" (or ref' ""))
     (condp re-matches (or (not-empty alt) ".")
       #"\." {:type :no-call}
       #"\*" {:type :spanning-deletion}
       #"(X|<\*>|<X>)" {:type :unspecified}
       #"<(.+)>" :>> (fn [[_ id]] {:type :id, :id id})
       #"(?i)([ACGTN])<(.+)>" :>> (fn [[_ [base] id]]
                                    {:type :complete-insertion,
                                     :join :after, :base base, :id id})
       #"(?i)<(.+)>([ACGTN])" :>> (fn [[_ id [base]]]
                                    {:type :complete-insertion,
                                     :join :before, :base base, :id id})
       #"(?i)[ACGTN]+" (inspect-nucleotides-allele ref' alt)
       (some-> (parse-breakend alt) (assoc :type :breakend))))
   {:type :other}))