Utility functions for manipulating chromosomal regions.

(ns cljam.util.region
  (:require [clojure.string :as cstr]
            [proton.core :as proton]))

region predicates

Returns true if two regions are overlapped with each other.

(defn overlapped-regions?
  [{^long x-start :start
    ^long x-end :end
    x-chr :chr}
   {^long y-start :start
    ^long y-end :end
    y-chr :chr}]
  (and (= x-chr y-chr)
       (<= y-start x-end)
       (<= x-start y-end)))

region conversion

Returns a lazy sequence of merged regions. Input regions must be sorted. Neighboring regions apart less than or equal to 'max-gap' will be merged with 'merge-fn'. Returns a stateful transducer when no input 'regions' is provided.

(defn merge-regions-with
  ([merge-fn]
   (merge-regions-with merge-fn 0))
  ([merge-fn ^long max-gap]
   (fn merge-regions-transducer [rf]
     (let [last-reg (volatile! nil)]
       (fn merge-regions-rf
         ([] (rf))
         ([r] (rf (if-let [l @last-reg] (rf r l) r)))
         ([r {^long x-start :start :as x}]
          (if-let [{^long l-end :end :as l} @last-reg]
            (if (and (= (:chr l) (:chr x))
                     (<= (- (dec x-start) l-end) max-gap))
              (do (vswap! last-reg merge-fn x) r)
              (do (vreset! last-reg x) (rf r l)))
            (do (vreset! last-reg x) r)))))))
  ([merge-fn ^long max-gap regs]
   (if-let [{^long f-end :end :as f} (first regs)]
     (if-let [{^long s-start :start :as s}
              (second regs)]
       (if (and (= (:chr f) (:chr s))
                (<= (- (dec s-start) f-end) max-gap))
         (let [next-regs (cons (merge-fn f s) (nnext regs))]
           (lazy-seq (merge-regions-with merge-fn max-gap next-regs)))
         (cons f (lazy-seq (merge-regions-with merge-fn max-gap (next regs)))))
       [f])
     [])))

Default function to merge two regions.

(defn- merge-two-regions
  [x y]
  (update x :end max (:end y)))

Same as 'merge-regions-with' except for 'merge-two-regions' is partially applied as merge-fn.

(def
  ^{:doc 
    :arglists '([] [max-gap] [max-gap regions])}
  merge-regions
  (partial merge-regions-with merge-two-regions))

Subtract a region from another one. Returns a vector of regions.

(defn subtract-region
  [lhs-reg
   {^long rhs-reg-start :start
    ^long rhs-reg-end :end
    rhs-reg-chr :chr}]
  (if (= (:chr lhs-reg) rhs-reg-chr)
    (filterv
     (fn [{:keys [^long start ^long end]}] (<= start end))
     [(update lhs-reg :end min (dec rhs-reg-start))
      (update lhs-reg :start max (inc rhs-reg-end))])
    [lhs-reg]))

Returns a sequence of regions complement to in-regions. in-regions must be sorted. Returns a stateful transducer when no regions provided.

(defn complement-regions
  ([base-region]
   (fn [rf]
     (let [last-reg (volatile! base-region)]
       (fn
         ([] (rf))
         ([r] (rf (if-let [l @last-reg] (rf r l) r)))
         ([r x]
          (if-let [l @last-reg]
            (let [[a b] (subtract-region l x)]
              (vreset! last-reg (or b a))
              (if b (rf r a) r))
            r))))))
  ([base-region in-regions]
   (if-let [reg (first in-regions)]
     (if-let [[a b] (seq (subtract-region base-region reg))]
       (if b
         (cons a (lazy-seq (complement-regions b (next in-regions))))
         (lazy-seq (complement-regions a (next in-regions))))
       [])
     [base-region])))

Divides a region [start end] into several chunks with maximum length 'step'. Returns a lazy sequence of vector.

(defn divide-region
  [^long start ^long end ^long step]
  (->> [(inc end)]
       (concat (range start (inc end) step))
       (partition 2 1)
       (map (fn [[^long s ^long e]] [s (dec e)]))))

Divides refs into several chunks with maximum length 'step'. Returns a lazy sequence of map containing {:chr :start :end}.

(defn divide-refs
  [refs step]
  (mapcat
   (fn [{:keys [len] name' :name}]
     (map (fn [[s e]] {:chr name' :start s :end e})
          (divide-region 1 len step)))
   refs))

validation

Checks if the given rname conforms to the spec of sam.

(defn valid-rname?
  [rname]
  (and rname (string? rname) (re-matches #"[!-)+-<>-~][!-~]*" rname)))

Checks if the given region map is a valid 1-based closed range.

(defn valid-region?
  [{:keys [chr ^long start ^long end]}]
  (and start end
       (valid-rname? chr)
       (number? start) (pos? start)
       (number? end) (pos? end)
       (<= start end)))

region <=> string

Parse a region string into a map.

(defn parse-region
  [region-str]
  (when region-str
    (let [pattern #"([!-)+-<>-~][!-~]*?)(:([\d,]+)?(-([\d,]+))?)?"
          [_ chr _ start _ end] (re-matches pattern region-str)
          start' (proton/as-long start)
          end' (proton/as-long end)]
      (when chr
        (cond-> {:chr chr}
          start' (assoc :start start')
          end' (assoc :end end'))))))

Parse a region string into a map strictly.

(defn parse-region-strict
  [region-str]
  (let [region-map (parse-region region-str)]
    (when (valid-region? region-map) region-map)))

Format a region map into a string.

(defn format-region
  [{:keys [chr start end]}]
  (let [result (->> [chr start end]
                    (take-while some?)
                    (interleave [nil \: \-])
                    (apply str))]
    (when-not (cstr/blank? result) result)))

Format a region map into a string strictly.

(defn format-region-strict
  [region-map]
  (when (valid-region? region-map) (format-region region-map)))