(ns cljam.io.fasta.core
(:refer-clojure :exclude [read])
(:require [clojure.java.io :as cio]
[clojure.string :as cstr]
[cljam.io.protocols :as protocols]
[cljam.util :as util]
[cljam.io.fasta-index.core :as fai]
[cljam.io.fasta.reader :as reader]
[cljam.io.util.bgzf :as bgzf]
[cljam.io.util.bgzf.gzi :as gzi])
(:import [java.io FileNotFoundException RandomAccessFile]
[cljam.io.fasta.reader FASTAReader IndexedBGZFInputStream]))
;; Reading
;; -------
(defn- fasta-index
  "Opens the FASTA index (.fai) that belongs to `fasta-url`.

  Candidate locations are tried in this order, skipping duplicates:
    1. `<fasta-url>.fai`
    2. the URL with `.fai` appended after a recognised FASTA extension
    3. the URL with a recognised FASTA extension replaced by `.fai`
    4. and 5. the same two forms with an upper-case `.FAI` suffix

  Returns the result of `fai/reader` for the first candidate that can be
  opened. Throws `FileNotFoundException` when no candidate is found."
  [fasta-url]
  (let [fasta-exts #"(?i)(\.(fa|fasta|fas|fsa|seq|fna|faa|ffn|frn|mpfa))$"
        fasta-path (str fasta-url)
        candidates (->> ["$1.fai" ".fai" "$1.FAI" ".FAI"]
                        (map #(cstr/replace fasta-path fasta-exts %))
                        (cons (str fasta-path ".fai"))
                        ;; When the extension regex does not match (e.g. a
                        ;; `.fa.gz` file) `cstr/replace` returns its input
                        ;; unchanged, which would make us try to open the
                        ;; FASTA file itself as an index. Never do that.
                        (remove #(= % fasta-path))
                        ;; "$1.fai" on a matching URL equals `<url>.fai`;
                        ;; avoid probing the same location twice.
                        distinct)]
    (or (some #(try
                 (fai/reader (util/as-url %))
                 ;; A missing candidate is expected; move on to the next one.
                 (catch FileNotFoundException _))
              candidates)
        (throw (FileNotFoundException.
                (str "Could not find FASTA Index file for " fasta-url))))))
(defn- bgzip-index
  "Reads the `.gzi` index located next to `fasta-url`, i.e. at the URL
  obtained by appending \".gzi\" to it, via `gzi/read-gzi`."
  [fasta-url]
  (-> fasta-url
      (str ".gzi")
      util/as-url
      gzi/read-gzi))
(defn- random-accessor
  "Opens a handle on `f` for random access.

  When `bgzf/bgzip?` reports `f` as bgzip-compressed, returns an
  `IndexedBGZFInputStream` wrapping a BGZF input stream together with a
  delay that loads the `.gzi` index on first use. Otherwise returns a
  read-only `RandomAccessFile`."
  [f]
  (if-not (bgzf/bgzip? f)
    (RandomAccessFile. (cio/as-file f) "r")
    (let [bgzf-in   (bgzf/bgzf-input-stream f)
          ;; The .gzi index is only read when somebody dereferences this.
          gzi-delay (delay (bgzip-index f))]
      (reader/->IndexedBGZFInputStream bgzf-in gzi-delay))))
(defn reader
  "Makes a FASTAReader from the file `f`.

  `f` is coerced to a URL with `util/as-url`. The reader holds a
  random-access handle, a stream from `util/compressor-input-stream`, the
  URL itself, and a delay that looks up the FASTA index on first use, so a
  missing index does not fail here."
  ^FASTAReader
  [f]
  (let [url         (util/as-url f)
        accessor    (random-accessor url)
        stream      (util/compressor-input-stream url)
        index-delay (delay (fasta-index url))]
    (FASTAReader. accessor stream url index-delay)))
(defn clone-reader
  "Clones a fasta reader, sharing its persistent objects.

  The clone gets a freshly opened random-access handle and a fresh stream
  on the same URL. It reuses the original's index delay and, for a
  BGZF-backed reader, the original's `idx` value."
  ^FASTAReader
  [^FASTAReader rdr]
  (let [url      (.url rdr)
        original (.reader rdr)
        accessor (if-not (instance? RandomAccessFile original)
                   ;; Not a plain file handle: rebuild the BGZF wrapper and
                   ;; hand over the existing idx instead of reloading it.
                   (reader/->IndexedBGZFInputStream
                    (bgzf/bgzf-input-stream url)
                    (.idx ^IndexedBGZFInputStream original))
                   (RandomAccessFile. (cio/as-file url) "r"))]
    (FASTAReader. accessor
                  (util/compressor-input-stream url)
                  url
                  (.index-delay rdr))))
(defn read-headers
  "Returns fasta headers (offset, name and desc).

  Headers come from the FASTA index when it can be loaded. If forcing the
  index raises `FileNotFoundException`, they are loaded from the reader's
  random-access handle with `reader/load-headers` instead."
  [^FASTAReader rdr]
  (try
    (-> (.index-delay rdr)
        deref
        fai/get-headers)
    (catch FileNotFoundException _
      (-> (.reader rdr)
          reader/load-headers))))
(defn read-seq-summaries
  "Reads summaries of sequences in this FASTA file.

  Returns a vector of maps, each holding only the `:name` and `:len`
  entries of the corresponding index record. Forces the reader's index
  delay."
  [^FASTAReader rdr]
  (let [indices (fai/get-indices (deref (.index-delay rdr)))]
    (into []
          (map #(select-keys % [:name :len]))
          indices)))
(defn read-indices
  "Reads fasta indices by forcing the reader's index delay and passing
  the result to `fai/get-indices`."
  [^FASTAReader rdr]
  (-> rdr
      .index-delay
      deref
      fai/get-indices))
(defn read-sequences
  "Reads sequences by line, returning the line-separated sequences as a
  lazy sequence. Delegates to `reader/read-sequences`."
  [rdr]
  (reader/read-sequences rdr))
(defn read-sequence
  "Reads the specified range of a sequence.

  The region map supplies `:chr`, `:start` and `:end`; start and end can
  be nil. `opts` is passed through unchanged to `reader/read-sequence`."
  [rdr {:keys [chr start end]} opts]
  (reader/read-sequence rdr chr start end opts))
(defn read
  "Returns a lazy sequence of maps representing each sequence in FASTA.
  Delegates to `reader/read`."
  [rdr]
  (reader/read rdr))
(defn reset
  "Resets the file pointer of `rdr`. Delegates to `reader/reset`."
  [rdr]
  (reader/reset rdr))
(defn sequential-read
  "Returns a list of maps containing sequence as upper-case string.

  Reads from the reader's stream via `reader/sequential-read-string`.
  The one-argument arity uses an empty options map."
  ([rdr]
   (sequential-read rdr {}))
  ([^FASTAReader rdr opts]
   ;; The two numeric arguments are fixed here: 10 MiB and 512 MiB.
   ;; NOTE(review): presumably buffer sizes -- confirm against
   ;; `reader/sequential-read-string`.
   (let [ten-mib  (* 10 1024 1024)
         half-gib 536870912]
     (reader/sequential-read-string (.stream rdr) ten-mib half-gib opts))))
;; Protocol implementations for FASTAReader. Unqualified calls such as
;; `(read-seq-summaries this)` refer to the functions defined in this
;; namespace; the protocol methods are always called through `protocols/`.
(extend-type FASTAReader
  protocols/IReader
  (reader-url [this]
    (.url this))
  (read
    ([this]
     (protocols/read this {}))
    ([this opts]
     (protocols/read-all-sequences this opts)))
  (indexed? [this]
    ;; Forcing the delay attempts the index lookup; a missing index file
    ;; means the reader is not indexed.
    (try
      (deref (.index-delay this))
      true
      (catch FileNotFoundException _
        false)))

  protocols/IRegionReader
  (read-in-region
    ([this region]
     (protocols/read-in-region this region {}))
    ([this region opts]
     (protocols/read-sequence this region opts)))

  protocols/ISequenceReader
  (read-seq-summaries [this]
    (read-seq-summaries this))
  (read-indices [this]
    (read-indices this))
  (read-all-sequences
    ([this]
     (protocols/read-all-sequences this {}))
    ([this opts]
     (sequential-read this opts)))
  (read-sequence
    ([this region]
     (protocols/read-sequence this region {}))
    ([this region opts]
     (read-sequence this region opts))))