(ns cljam.io.gff
(:require [clojure.string :as cstr]
[clojure.java.io :as cio]
[proton.core :as p]
[cljam.util :as util])
(:import [java.io Closeable BufferedReader BufferedWriter]
java.nio.CharBuffer)) | |
;;; Encoder / Decoder
(defn- escape-in-column?
  "Returns true when the character code `i` is one that gets percent-encoded
  in a plain GFF3 column: any code <= 0x1F, `%` (0x25) or DEL (0x7F)."
  [^long i]
  (case i
    (0x25 0x7F) true
    (<= i 0x1F)))
(defn- escape-in-attr?
  "Returns true when the character code `i` is one that gets percent-encoded
  inside an attribute tag or value: any code <= 0x1F, `%` (0x25), `&` (0x26),
  `,` (0x2C), `;` (0x3B), `=` (0x3D) or DEL (0x7F)."
  [^long i]
  (case i
    (0x25 0x26 0x2C 0x3B 0x3D 0x7F) true
    (<= i 0x1F)))
(defn- escape-in-target?
  "Returns true when the character code `i` is one that gets percent-encoded
  inside the id part of a `Target` attribute: the same set as attribute values
  (codes <= 0x1F, `%`, `&`, `,`, `;`, `=`, DEL) plus the space (0x20)."
  [^long i]
  (case i
    (0x20 0x25 0x26 0x2C 0x3B 0x3D 0x7F) true
    (<= i 0x1F)))
(defn- encode
  "Returns a copy of `s` in which every character whose code satisfies `pred`
  is replaced by `%` followed by two uppercase hex digits built from the low
  8 bits of the code. All other characters are copied unchanged."
  ^String [pred ^String s]
  (let [n (.length s)
        out (StringBuilder. n)]
    (dotimes [idx n]
      (let [ch (.charAt s idx)
            code (int ch)]
        (if (pred code)
          (let [hi (bit-and 0xf (unsigned-bit-shift-right code 4))
                lo (bit-and 0xf code)]
            (.append out \%)
            ;; 48 is \0 and 55 + 10 is \A, giving uppercase hex digits.
            (.append out (unchecked-char
                          (if (<= hi 9) (+ 48 hi) (+ 55 hi))))
            (.append out (unchecked-char
                          (if (<= lo 9) (+ 48 lo) (+ 55 lo)))))
          (.append out ch))))
    (.toString out)))
(defn- encode-in-attr
  "Percent-encodes `s` with the attribute escaping rules (`escape-in-attr?`)."
  ^String [s]
  (encode escape-in-attr? s))
(defn- decode
  "Decodes percent-encoded sequences (`%XX`) in `s` and returns the resulting
  string, or nil when `s` is nil.

  `pred` is called with the decoded character code; only codes it accepts may
  appear in encoded form.

  Throws ex-info carrying `:input` and `:invalid-string` when an escape
  decodes to a code rejected by `pred`, when it is truncated at the end of
  the input, or when it contains a non-hexadecimal digit."
  ^String [pred ^String s]
  (when s
    (let [cb (CharBuffer/wrap s)
          sb (StringBuilder. (.length s))
          invalid (fn [bad]
                    (ex-info
                     "Found an invalid character encoding while decoding GFF3 file"
                     {:input s, :invalid-string bad}))]
      (while (.hasRemaining cb)
        (let [c (.get cb)]
          (cond
            (not= \% c)
            (.append sb c)

            ;; A `%` needs two more characters; report the truncated escape
            ;; instead of letting `.get` fail with BufferUnderflowException.
            (< (.remaining cb) 2)
            (throw (invalid (str c cb)))

            :else
            (let [upper (.get cb)
                  lower (.get cb)
                  hi (Character/digit upper 16)
                  lo (Character/digit lower 16)
                  i (bit-or (bit-shift-left hi 4) lo)]
              ;; `Character/digit` returns -1 for a non-hex character; that
              ;; must be rejected explicitly, otherwise the negative code
              ;; satisfies the `<= 0x1F` predicates and decodes to \uFFFF.
              (if (and (<= 0 hi) (<= 0 lo) (pred i))
                (.append sb (unchecked-char i))
                (throw (invalid (str c upper lower))))))))
      (str sb))))
(defn- decode-in-attr
  "Percent-decodes `s` with the attribute escaping rules (`escape-in-attr?`)."
  ^String [s]
  (decode escape-in-attr? s))
(defn- encode-multiple
  "Encodes each element of `xs` with `encode-in-attr` and joins the results
  with commas."
  ^String [xs]
  (->> xs
       (map encode-in-attr)
       (cstr/join \,)))
(defn- decode-multiple
  "Splits `s` on commas and returns a lazy sequence of the attribute-decoded
  parts."
  [s]
  (->> (cstr/split s #",")
       (map #(decode escape-in-attr? %))))
;; Matches a `Target` attribute value: id, start, end and an optional strand
;; sign, separated by single spaces.
(def ^:private ^:const target-regexp
  #"(\S+) ([1-9]\d*) ([1-9]\d*)(?: ([+-]))?")
(defn- encode-target
  "Renders a target map as `<id> <start> <end>[ <+|->]`. The id (`:chr`) is
  percent-encoded with `escape-in-target?`; the strand sign is appended only
  when `:strand` is present (`:forward` or `:reverse`)."
  ^String [{:keys [chr start end strand]}]
  (let [base [(encode escape-in-target? chr) start end]
        fields (if strand
                 (conj base (case strand :forward \+ :reverse \-))
                 base)]
    (cstr/join \space fields)))
(defn- decode-target
  "Parses a `Target` attribute value into a map with `:chr`, `:start`, `:end`
  and, when a sign is present, `:strand` (`:forward` or `:reverse`).
  Returns nil when `s` does not match `target-regexp`."
  [s]
  (when-let [groups (re-matches target-regexp s)]
    (let [[_ target-id start end strand] groups
          target {:chr (decode escape-in-target? target-id)
                  :start (p/as-long start)
                  :end (p/as-long end)}]
      (if strand
        (assoc target :strand (case (first strand) \+ :forward \- :reverse))
        target))))
(defn- encode-gap
  "Renders a sequence of `[op len]` pairs as space-separated `<op><len>`
  tokens."
  ^String [xs]
  (cstr/join \space
             (for [[op len] xs]
               (str op len))))
(defn- decode-gap
  "Parses a `Gap` attribute value into a lazy sequence of `[op len]` pairs,
  where `op` is one of the characters M, I, D, F, R and `len` is parsed with
  `p/as-long`."
  [s]
  (for [[_ op-str len] (re-seq #"(?:^|\s)([MIDFR])([1-9]\d*)" s)]
    [(first op-str) (p/as-long len)]))
(defn- encode-db
  "Renders a sequence of `{:db-tag .. :id ..}` maps as comma-separated
  `<db-tag>:<id>` entries, percent-encoding both parts with the attribute
  rules."
  ^String [xs]
  (cstr/join \,
             (for [{:keys [db-tag id]} xs]
               (str (encode escape-in-attr? db-tag)
                    \:
                    (encode escape-in-attr? id)))))
(defn- decode-db
  "Parses comma-separated `<db-tag>:<id>` entries into a lazy sequence of
  `{:db-tag .. :id ..}` maps. Each entry is split on the first `:` only, and
  both parts are attribute-decoded."
  [s]
  (for [entry (cstr/split s #",")]
    (let [[db-tag id] (cstr/split entry #":" 2)]
      {:db-tag (decode escape-in-attr? db-tag),
       :id (decode escape-in-attr? id)})))
(defn- dot->nil
  "Returns nil when `s` is exactly the one-character string `.`, otherwise
  returns `s` unchanged (including nil and the empty string)."
  [^String s]
  (when-not (= "." s)
    s))
;; Attribute tags with a dedicated encoding. Each entry maps the tag name used
;; in the file to the keyword used in parsed attribute maps (`:key`) and to
;; the functions converting the value to and from its textual form.
;; `:index` is not defined in the spec, can be ignored; it only fixes the
;; order in which these attributes are written. Entries are listed in that
;; order.
(def ^:private ^:const predefined-tags
  {"ID"            {:index 0,  :key :id,
                    :encoder encode-in-attr,  :decoder decode-in-attr},
   "Parent"        {:index 1,  :key :parent,
                    :encoder encode-multiple, :decoder decode-multiple},
   "Name"          {:index 2,  :key :name,
                    :encoder encode-in-attr,  :decoder decode-in-attr},
   "Alias"         {:index 3,  :key :alias,
                    :encoder encode-multiple, :decoder decode-multiple},
   "Target"        {:index 4,  :key :target,
                    :encoder encode-target,   :decoder decode-target},
   "Gap"           {:index 5,  :key :gap,
                    :encoder encode-gap,      :decoder decode-gap},
   "Derives_from"  {:index 6,  :key :derives-from,
                    :encoder encode-in-attr,  :decoder decode-in-attr},
   "Note"          {:index 7,  :key :note,
                    :encoder encode-multiple, :decoder decode-multiple},
   "Dbxref"        {:index 8,  :key :db-xref,
                    :encoder encode-db,       :decoder decode-db},
   "Ontology_term" {:index 9,  :key :ontology-term,
                    :encoder encode-db,       :decoder decode-db},
   "Is_circular"   {:index 10, :key :circular?,
                    :encoder str,
                    :decoder (fn [s] (Boolean/parseBoolean s))}})
;;; Reader
;; Pairs an underlying reader with the version information parsed from the
;; file header. Closing a GFFReader closes the underlying reader.
(deftype GFFReader [reader version]
  Closeable
  (close [_]
    (.close ^Closeable reader)))
;; Matches the `##gff-version` directive, capturing the version number and the
;; optional major and minor revision numbers.
(def ^:private ^:const version-regexp
  #"##gff-version ([1-9]\d*)(?:\.([1-9]\d*)(?:\.([1-9]\d*))?)?")
(defn version
  "Returns a file format version of the given GFF reader, i.e. the value held
  in its `version` field."
  [^GFFReader reader]
  (.version reader))
(defn reader
  "Returns an open GFFReader of `f`. The first line of the input must be a
  `##gff-version` directive whose version is 3; the parsed `:version`,
  `:major-revision` and `:minor-revision` are stored in the returned reader.

  Throws ex-info when the directive is missing (including when the input is
  empty) or when the version is not 3. The underlying reader is closed before
  any exception propagates. The caller should close the returned reader."
  ^GFFReader
  [f]
  (let [r ^BufferedReader (cio/reader (util/compressor-input-stream f))
        ;; Best-effort URL for error data; only evaluated on failure.
        url-of-f (fn [] (try (util/as-url f) (catch Exception _ nil)))]
    (try
      (let [version-line (.readLine r)
            ;; `.readLine` returns nil at end of stream, so guard against an
            ;; empty input: `re-matches` would otherwise throw an NPE instead
            ;; of the descriptive error below.
            [version-directive & xs] (some->> version-line
                                              (re-matches version-regexp))
            {:keys [version] :as v} (-> [:version
                                         :major-revision
                                         :minor-revision]
                                        (zipmap (map p/as-long xs)))]
        (when-not version-directive
          (throw
           (ex-info
            "GFF3 must start with the `##gff-version 3.#.#` directive"
            {:url (url-of-f),
             :version-directive version-line})))
        (when-not (= version 3)
          (throw
           (ex-info
            "Only GFF version 3 is supported"
            (assoc v :url (url-of-f)))))
        (GFFReader. r v))
      (catch Exception e
        (.close r)
        (throw e)))))
(defn- parse-attr
  "Parses one `tag=value` attribute into a `[key value]` pair. The tag is
  attribute-decoded and looked up in `predefined-tags`: a known tag yields its
  keyword and dedicated decoder, an unknown tag keeps the decoded tag string
  as key and is decoded with `decode-multiple`."
  [s]
  (let [[raw-tag raw-value] (cstr/split s #"=" 2)
        tag (decode-in-attr raw-tag)
        spec (predefined-tags tag)
        attr-key (get spec :key tag)
        decode-value (get spec :decoder decode-multiple)]
    [attr-key (decode-value raw-value)]))
(defn- parse-attrs
  "Parses a `;`-separated attribute column into a map using `parse-attr`.
  Returns an empty map when `s` is nil."
  [s]
  (if (nil? s)
    {}
    (into {} (map parse-attr) (cstr/split s #";"))))
(defn- parse-gff-line
  "Parses one tab-separated feature line into a map with the keys `:chr`,
  `:source`, `:type`, `:start`, `:end`, `:score`, `:strand`, `:phase` and
  `:attributes`. A `.` in the text, strand, phase or attribute columns is
  treated as an absent value."
  [s]
  (let [[seq-id src typ start end score strand phase attrs]
        (cstr/split s #"\t" 9)
        text-column (fn [v] (decode escape-in-column? (dot->nil v)))
        strand-ch (some-> strand dot->nil first)
        phase-ch (some-> phase dot->nil first)]
    {:chr (text-column seq-id)
     :source (text-column src)
     :type (text-column typ)
     :start (p/as-long start)
     :end (p/as-long end)
     :score (p/as-double score)
     ;; +: forward, -: reverse, ?: unknown, nil: not-stranded
     :strand (when (some? strand-ch)
               (case strand-ch \+ :forward \- :reverse \? :unknown))
     :phase (when (some? phase-ch)
              (case phase-ch \0 0 \1 1 \2 2))
     :attributes (parse-attrs (dot->nil attrs))}))
(defn read-features
  "Reads features of the GFF file, returning them as a lazy sequence.
  Reading stops at the first line starting with `##FASTA` or `>`; other lines
  starting with `#` are skipped."
  [^GFFReader gff-reader]
  (let [fasta-start? (fn [line]
                       (or (cstr/starts-with? line "##FASTA")
                           (cstr/starts-with? line ">")))
        xform (comp
               ;; TODO: handle FASTA sequences
               (take-while (complement fasta-start?))
               ;; TODO: handle other directives
               (remove #(cstr/starts-with? % "#"))
               ;; TODO: construct tree structures
               (map parse-gff-line))]
    (sequence xform (line-seq (.reader gff-reader)))))
;;; Writer
;; Pairs an underlying writer with the version options used for the file
;; header. Closing a GFFWriter closes the underlying writer.
(deftype GFFWriter [writer version]
  Closeable
  (close [_]
    (.close ^Closeable writer)))
(defn writer
  "Returns an open GFFWriter of `f`. `options` is merged over `{:version 3}`
  and stored in the writer; only `:version` 3 is accepted, otherwise ex-info
  is thrown. When `:encoding` is given it is passed to
  `util/compressor-output-stream` together with `f`. The caller should close
  the returned writer."
  (^GFFWriter [f]
   (writer f {}))
  (^GFFWriter [f options]
   (let [opts (merge {:version 3} options)
         {:keys [encoding version]} opts
         url (try (util/as-url f) (catch Exception _ nil))]
     (if (= 3 version)
       (let [sink (cond
                    encoding (util/compressor-output-stream f encoding)
                    url (util/compressor-output-stream f)
                    :else f)]
         (GFFWriter. (cio/writer sink) opts))
       (throw (ex-info "Only GFF3 is supported" (assoc opts :url url)))))))
;; `predefined-tags` re-keyed by attribute keyword; each entry additionally
;; carries the tag name used in the file under `:key-str`.
(def ^:private ^:const inv-predefined-tags
  (into {}
        (map (fn [[tag-name spec]]
               [(:key spec) (assoc spec :key-str tag-name)]))
        predefined-tags))
;; Keywords of the predefined attributes, ordered by their `:index`.
(def ^:private ^:const predefined-keys
  (->> predefined-tags
       vals
       (sort-by :index)
       (map :key)))
(defn- write-attrs!
  "Writes the attribute map `attrs` to `w` as `tag=value` pairs separated by
  `;`. Predefined attributes come first, in `predefined-keys` order, and use
  their dedicated tag name and encoder; remaining keys follow and are encoded
  with `encode-multiple`. Entries with a nil or false value are skipped."
  [^BufferedWriter w attrs]
  (let [first? (volatile! true)]
    (doseq [key' (concat predefined-keys
                         (apply disj (set (keys attrs)) predefined-keys))
            :let [value (get attrs key')]
            :when value
            :let [{:keys [key-str encoder]
                   :or {encoder encode-multiple}} (inv-predefined-tags key')
                  ;; Custom tags are percent-decoded by `parse-attr`, so they
                  ;; must be encoded again here; otherwise a tag containing
                  ;; `=`, `;`, `,`, `&` or `%` corrupts the column. This is
                  ;; kept out of `:or` because destructuring evaluates
                  ;; defaults eagerly and predefined keys are keywords, not
                  ;; strings.
                  ^String tag (or key-str (encode-in-attr key'))]]
      (if @first?
        (vreset! first? false)
        (.append w \;))
      (.write w tag)
      (.append w \=)
      (.write w ^String (encoder value)))))
(defn- write-feature!
  "Writes one feature map to `w` as the nine tab-separated GFF3 columns
  (no line terminator). `:start` and `:end` are required integers; absent
  `:chr`, `:source`, `:type`, `:score`, `:strand`, `:phase` and empty
  `:attributes` are written as `.`."
  [^BufferedWriter w {:keys [chr source ^long start ^long end score strand
                             phase attributes]
                      type' :type}]
  ;; `parse-gff-line` yields nil for a `.` seqid, so mirror the handling of
  ;; source/type instead of failing inside `encode` on nil.
  (.write w (or (some->> chr (encode escape-in-column?)) "."))
  (.append w \tab)
  (.write w (or (some->> source (encode escape-in-column?)) "."))
  (.append w \tab)
  (.write w (or (some->> type' (encode escape-in-column?)) "."))
  (.append w \tab)
  (.write w (String/valueOf start))
  (.append w \tab)
  (.write w (String/valueOf end))
  (.append w \tab)
  (.write w (or (some->> score String/valueOf cstr/lower-case) "."))
  (.append w \tab)
  (.append w (case strand :forward \+ :reverse \- :unknown \? nil \.))
  (.append w \tab)
  (.append w (if (nil? phase) \. (case (byte phase) 0 \0 1 \1 2 \2)))
  (.append w \tab)
  (if (seq attributes)
    (write-attrs! w attributes)
    (.append w \.)))
(defn write-features
  "Writes the `##gff-version` directive, built from the writer's version
  options, followed by each of `features` on its own line. Each feature is
  preceded by a line separator, so no separator follows the last one."
  [^GFFWriter gff-writer features]
  (let [out ^BufferedWriter (.writer gff-writer)
        {:keys [^long version major-revision minor-revision]} (.version gff-writer)]
    (.write out "##gff-version ")
    (.write out (String/valueOf version))
    ;; The minor revision is only written when a major revision is present.
    (when major-revision
      (.append out \.)
      (.write out (String/valueOf major-revision))
      (when minor-revision
        (.append out \.)
        (.write out (String/valueOf minor-revision))))
    (run! (fn [feature]
            (.newLine out)
            (write-feature! out feature))
          features)))