Functions to read and write the BED (Browser Extensible Data) format. See http://genome.ucsc.edu/FAQ/FAQformat#format1 for the detail BED specifications. | (ns cljam.io.bed
(:require [clojure.java.io :as cio]
[clojure.string :as cstr]
[proton.core :refer [as-long]]
[cljam.io.protocols :as protocols]
[cljam.util :as util]
[cljam.util.chromosome :as chr]
[cljam.util.region :as region]
[clojure.tools.logging :as logging])
(:import [java.net URL]
[java.io BufferedReader BufferedWriter Closeable])) |
(declare read-fields write-fields) | |
(defrecord BEDReader [^BufferedReader reader ^URL url]
Closeable
(close [this]
(.close ^Closeable (.reader this)))
protocols/IReader
(reader-url [this] (.url this))
(read [this] (protocols/read this {}))
(read [this _] (read-fields this))
(indexed? [_] false)
protocols/IRegionReader
(read-in-region [this region]
(protocols/read-in-region this region {}))
(read-in-region [this {:keys [chr ^long start ^long end]} _]
(logging/warn "May cause degradation of performance.")
(filter (fn [{^long m-start :start
^long m-end :end
m-chr :chr}]
(and (or (not chr) (= m-chr chr))
(or (not start) (<= start m-start))
(or (not end) (<= m-end end))))
(read-fields this)))) | |
(defrecord BEDWriter [^BufferedWriter writer ^URL url]
Closeable
(close [this]
(.close ^Closeable (.writer this)))
protocols/IWriter
(writer-url [this] (.url this))) | |
Returns an open cljam.io.bed.BEDReader of f. Should be used inside with-open to ensure the reader is properly closed. | (defn reader ^BEDReader [f] (BEDReader. (cio/reader (util/compressor-input-stream f)) (util/as-url f))) |
Returns an open cljam.io.bed.BEDWriter of f. Should be used inside with-open to ensure the writer is properly closed. | (defn writer ^BEDWriter [f] (BEDWriter. (cio/writer (util/compressor-output-stream f)) (util/as-url f))) |
The keywords of bed columns. | (def ^:const bed-columns [:chr :start :end :name :score :strand :thick-start :thick-end :item-rgb :block-count :block-sizes :block-starts]) |
Converts string of comma-separated long values into list of longs. Comma at the end of input string will be ignored. Returns nil if input is nil. | (defn- str->long-list
[^String s]
(when-not (nil? s)
(map as-long (cstr/split s #",")))) |
Inverse function of str->long-list. | (defn- long-list->str
[xs]
(when (seq xs)
(cstr/join "," xs))) |
Same as update if map 'm' contains key 'k'. Otherwise, returns the original map 'm'. | (defn- update-some
[m k f & args]
(if (get m k)
(apply update m k f args)
m)) |
Parse BED fields string and returns a map. Based on information at https://genome.ucsc.edu/FAQ/FAQformat#format1. | (defn- deserialize-bed
[^String s]
{:post [;; First 3 fields are required.
(:chr %) (:start %) (:end %)
;; The chromEnd base is not included in the display of the feature.
(< (long (:start %)) (long (:end %)))
;; Lower-numbered fields must be populated if higher-numbered fields are used.
(every? true? (drop-while false? (map nil? ((apply juxt bed-columns) %))))
;; A score between 0 and 1000.
(if-let [s (:score %)] (<= 0 s 1000) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-sizes %)] (= (count xs) (:block-count %)) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-starts %)] (= (count xs) (:block-count %)) true)
;; The first blockStart value must be 0.
(if-let [[f] (:block-starts %)] (= 0 f) true)
;; The final blockStart position plus the final blockSize value must equal chromEnd.
(if-let [xs (:block-starts %)] (= (+ (long (last xs))
(long (last (:block-sizes %))))
(- (long (:end %))
(long (:start %)))) true)
;; Blocks may not overlap.
(if-let [xs (:block-starts %)]
(apply <= (mapcat (fn [^long a ^long b] [a (+ a b)])
xs (:block-sizes %)))
true)]}
(reduce
(fn deserialize-bed-reduce-fn [m [k f]] (update-some m k f))
(zipmap bed-columns (cstr/split s #"\s+"))
{:start as-long
:end as-long
:score as-long
:strand #(case % "." nil "+" :forward "-" :reverse)
:thick-start as-long
:thick-end as-long
:block-count as-long
:block-sizes str->long-list
:block-starts str->long-list})) |
Serializes bed fields into a string. | (defn- serialize-bed
[m]
{:pre [;; First 3 fields are required.
(:chr m) (:start m) (:end m)
;; The chromEnd base is not included in the display of the feature.
(< (long (:start m)) (long (:end m)))
;; Lower-numbered fields must be populated if higher-numbered fields are used.
(every? true? (drop-while false? (map nil? ((apply juxt bed-columns) m))))
;; A score between 0 and 1000.
(if-let [s (:score m)] (<= 0 s 1000) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-sizes m)] (= (count xs) (:block-count m)) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-starts m)] (= (count xs) (:block-count m)) true)
;; The first blockStart value must be 0.
(if-let [[f] (:block-starts m)] (= 0 f) true)
;; The final blockStart position plus the final blockSize value must equal chromEnd.
(if-let [xs (:block-starts m)]
(= (+ (long (last xs)) (long (last (:block-sizes m))))
(- (long (:end m)) (long (:start m)))) true)
;; Blocks may not overlap.
(if-let [xs (:block-starts m)]
(apply <= (mapcat (fn [^long a ^long b] [a (+ a b)])
xs (:block-sizes m)))
true)]}
(->> (-> m
(update-some :strand #(case % :forward "+" :reverse "-" nil "."))
(update-some :block-sizes long-list->str)
(update-some :block-starts long-list->str))
((apply juxt bed-columns))
(take-while identity)
(cstr/join \tab))) |
Checks if given string is neither a header nor a comment line. | (defn- header-or-comment?
[^String s]
(or (empty? s)
(.startsWith s "browser")
(.startsWith s "track")
(.startsWith s "#"))) |
Normalizes BED fields. BED fields are stored in format: 0-origin and inclusive-start / exclusive-end. This function converts the coordinate into cljam style: 1-origin and inclusive-start / inclusive-end. | (defn- normalize
[m]
(-> m
(update :start inc)
(update-some :thick-start inc))) |
De-normalizes BED fields. This is an inverse function of normalize. | (defn- denormalize
[m]
(-> m
(update :start dec)
(update-some :thick-start dec))) |
Returns a lazy sequence of normalized BED fields. | (defn read-fields
[^BEDReader rdr]
(sequence
(comp (remove header-or-comment?)
(map deserialize-bed)
(map normalize))
(line-seq (.reader rdr)))) |
Sort BED fields based on :chr, :start and :end. :chr with common names come first, in order of (chr1, chr2, ..., chrX, chrY, chrM). Other chromosomes follow after in lexicographic order. | (defn sort-fields
[xs]
(sort-by
(fn [m]
[(chr/chromosome-order-key (:chr m))
(:start m)
(:end m)])
xs)) |
Sort and merge overlapped regions. Currently, this function affects only :end and :name fields. | (defn merge-fields
[xs]
(region/merge-regions-with
(fn [x {:keys [end] name' :name}]
(-> x
(update :end max end)
(update-some :name str "+" name')))
0
(sort-fields xs))) |
Compare two BED fields on the basis of :chr and :end fields. | (defn- fields-<=
[x y]
(<= (compare [(chr/chromosome-order-key (:chr x)) (:end x)]
[(chr/chromosome-order-key (:chr y)) (:end y)])
0)) |
Returns a lazy sequence that is the intersection of the two BED sequences. The input sequences will first be sorted with sort-fields, which may cause an extensive memory use for ones with a large number of elements. Note also that this function assumes the input sequences contain only valid regions, and thus :start <= :end holds for each region. Make sure yourself the input sequences meet the condition, or the function may return a wrong result. | (defn intersect-fields
[xs ys]
(letfn [(intersect [xs ys]
(lazy-seq
(if-let [x (first xs)]
(if-let [y (first ys)]
(cond->> (if (fields-<= x y)
(intersect (next xs) ys)
(intersect xs (next ys)))
(region/overlapped-regions? x y)
(cons (-> x
(update :start max (:start y))
(update :end min (:end y)))))
[])
[])))]
(intersect (sort-fields xs) (merge-fields ys)))) |
Returns a lazy sequence that is the result of subtracting the BED fields in the sequence ys from the sequence xs. The input sequences will first be sorted with sort-fields, which may cause an extensive memory use for ones with a large number of elements. Note also that this function assumes the input sequences contain only valid regions, and thus :start <= :end holds for each region. Make sure yourself the input sequences meet the condition, or the function may return a wrong result. | (defn subtract-fields
[xs ys]
(letfn [(subtract [xs ys]
(lazy-seq
(if-let [x (first xs)]
(if-let [y (first ys)]
(let [[r1 r2] (region/subtract-region x y)]
(if r2
(cons r1 (subtract (cons r2 (next xs)) (next ys)))
(if r1
(if (fields-<= r1 y)
(cons r1 (subtract (next xs) ys))
(subtract (cons r1 (next xs)) (next ys)))
(subtract (next xs) ys))))
xs)
[])))]
(subtract (sort-fields xs) (merge-fields ys)))) |
Takes a sequence of maps containing :name and :len keys (representing chromosome's name and length, resp.) and a sequence of BED fields, and returns a lazy sequence that is the complement of the BED sequence. The input sequence will first be sorted with sort-fields, which may cause an extensive memory use for ones with a large number of elements. Note also that this function assumes the BED sequence contains only valid regions, and thus :start <= :end holds for each region. Make sure yourself the BED sequence meets the condition, or the function may return a wrong result. | (defn complement-fields
[refs xs]
(let [chr->len (into {} (map (juxt :name :len)) refs)
chrs (sort-by chr/chromosome-order-key (map :name refs))]
(when-first [{:keys [chr]} (filter #(not (chr->len (:chr %))) xs)]
(let [msg (str "Length of chromosome " chr " not specified")]
(throw (IllegalArgumentException. msg))))
(letfn [(complement [xs chrs ^long pos]
(lazy-seq
(when-let [chr (first chrs)]
(let [len (long (get chr->len chr))
{^long x-start :start
^long x-end :end
x-chr :chr
:as x} (first xs)]
(if (and x (= x-chr chr))
(cond->> (complement (next xs) chrs (inc x-end))
(< pos (long (:start x)))
(cons {:chr chr :start pos :end (dec x-start)}))
(cond->> (complement xs (next chrs) 1)
(< pos len)
(cons {:chr chr :start pos :end len})))))))]
(complement (merge-fields xs) chrs 1)))) |
Writes sequence of BED fields to writer. | (defn write-fields
[^BEDWriter wtr xs]
(let [w ^BufferedWriter (.writer wtr)]
(->> xs
(map (comp serialize-bed denormalize))
(cstr/join \newline)
(.write w)))) |