(ns cljam.io.fasta.core (:refer-clojure :exclude [read]) (:require [clojure.java.io :as cio] [clojure.string :as cstr] [cljam.io.protocols :as protocols] [cljam.util :as util] [cljam.io.fasta-index.core :as fai] [cljam.io.fasta.reader :as reader] [cljam.io.util.bgzf :as bgzf] [cljam.io.util.bgzf.gzi :as gzi]) (:import [java.io FileNotFoundException RandomAccessFile] [cljam.io.fasta.reader FASTAReader IndexedBGZFInputStream])) | |
;; Reading
(defn- fasta-index
  "Opens the FASTA index belonging to `fasta-url` and returns whatever
  `fai/reader` yields for the first candidate that can be opened.

  Candidates, in order: `<url>.fai`, then -- only when the URL ends with one
  of the known FASTA extensions (matched case-insensitively) -- `<base>.fai`,
  `<url>.FAI` and `<base>.FAI`, where `<base>` is the URL without that
  extension. A candidate whose `fai/reader` call throws
  FileNotFoundException is skipped.

  Throws FileNotFoundException when no candidate can be opened."
  [fasta-url]
  (let [fasta-exts #"(?i)(\.(fa|fasta|fas|fsa|seq|fna|faa|ffn|frn|mpfa))$"
        source (str fasta-url)
        rewritten (map #(cstr/replace source fasta-exts %)
                       ["$1.fai" ".fai" "$1.FAI" ".FAI"])
        candidates (->> (cons (str source ".fai") rewritten)
                        ;; `cstr/replace` hands back its input untouched when
                        ;; the extension pattern does not match; drop those so
                        ;; the FASTA file itself is never opened as its index.
                        (remove #{source})
                        ;; "$1.fai" repeats `<url>.fai`; try each name once.
                        distinct
                        (map util/as-url))]
    (or (some #(try
                 (fai/reader %)
                 (catch FileNotFoundException _ nil))
              candidates)
        (throw (FileNotFoundException.
                (str "Could not find FASTA Index file for " fasta-url))))))
(defn- bgzip-index
  "Reads, via `gzi/read-gzi`, the index found at `fasta-url` with `.gzi`
  appended."
  [fasta-url]
  (-> (str fasta-url ".gzi")
      util/as-url
      gzi/read-gzi))
(defn- random-accessor
  "Builds the random-access handle for `f`.

  When `bgzf/bgzip?` reports true the result is an IndexedBGZFInputStream
  wrapping a BGZF input stream together with a delay that reads the `.gzi`
  index on first deref; otherwise it is a read-only RandomAccessFile."
  [f]
  (if-not (bgzf/bgzip? f)
    (RandomAccessFile. (cio/as-file f) "r")
    (let [gzi-index (delay (bgzip-index f))]
      (reader/->IndexedBGZFInputStream (bgzf/bgzf-input-stream f) gzi-index))))
(defn ^FASTAReader reader
  "Opens `f` (anything `util/as-url` accepts) as a FASTAReader.

  The reader holds a random-access handle, a `util/compressor-input-stream`
  over the same URL, the URL itself, and a delay that looks up the FASTA
  index the first time it is dereferenced."
  [f]
  (let [url (util/as-url f)
        accessor (random-accessor url)
        stream (util/compressor-input-stream url)
        index-delay (delay (fasta-index url))]
    (FASTAReader. accessor stream url index-delay)))
(defn ^FASTAReader clone-reader
  "Clones fasta reader sharing persistent objects.

  The clone gets its own random-access handle and its own compressor input
  stream for the same URL, while the gzi index (`.idx`, BGZF case only) and
  the FASTA index delay are reused from `rdr`."
  [^FASTAReader rdr]
  (let [url (.url rdr)
        source (.reader rdr)
        accessor (if (instance? RandomAccessFile source)
                   (RandomAccessFile. (cio/as-file url) "r")
                   (reader/->IndexedBGZFInputStream
                    (bgzf/bgzf-input-stream url)
                    (.idx ^IndexedBGZFInputStream source)))
        stream (util/compressor-input-stream url)]
    (FASTAReader. accessor stream url (.index-delay rdr))))
(defn read-headers
  "Returns the headers of the FASTA file behind `rdr`.

  They are taken from the FASTA index via `fai/get-headers`; if that attempt
  throws FileNotFoundException, they are loaded from the underlying reader
  with `reader/load-headers` instead."
  [^FASTAReader rdr]
  (try
    (let [index @(.index-delay rdr)]
      (fai/get-headers index))
    (catch FileNotFoundException _
      (reader/load-headers (.reader rdr)))))
(defn read-seq-summaries
  "Read summaries of sequences in this FASTA file.

  Returns a vector of `{:name .. :len ..}` maps, one per entry returned by
  `fai/get-indices` for the reader's FASTA index."
  [^FASTAReader rdr]
  (let [indices (fai/get-indices @(.index-delay rdr))]
    (vec (for [{:keys [name len]} indices]
           {:name name, :len len}))))
(defn read-indices
  "Returns the result of `fai/get-indices` for the FASTA index of `rdr`,
  forcing the reader's index delay."
  [^FASTAReader rdr]
  (let [index @(.index-delay rdr)]
    (fai/get-indices index)))
(defn read-sequences
  "Reads sequences by line, returning the line-separated sequences as a lazy
  sequence. Delegates to `reader/read-sequences`."
  [rdr]
  (reader/read-sequences rdr))
(defn read-sequence
  "Reads the sequence for a region given as a map with `:chr`, `:start` and
  `:end` keys, delegating to `reader/read-sequence` with `opts`."
  [rdr {:keys [chr start end]} opts]
  (reader/read-sequence rdr chr start end opts))
(defn read
  "Delegates to `reader/read` for `rdr`."
  [rdr]
  (reader/read rdr))
(defn reset
  "Delegates to `reader/reset` for `rdr`."
  [rdr]
  (reader/reset rdr))
(defn sequential-read
  "Reads from the reader's stream via `reader/sequential-read-string`.

  The one-argument arity uses an empty option map."
  ([rdr]
   (sequential-read rdr {}))
  ([^FASTAReader rdr opts]
   ;; NOTE(review): the two numeric arguments look like buffer sizes
   ;; (10 MiB and 512 MiB) -- confirm against reader/sequential-read-string.
   (let [ten-mib (* 1024 1024 10)
         half-gib 536870912]
     (reader/sequential-read-string (.stream rdr) ten-mib half-gib opts))))
;; Protocol implementations for FASTAReader. Short arities supply an empty
;; option map and re-dispatch through the protocol; the rest delegate to the
;; functions of this namespace.
(extend-type FASTAReader
  protocols/IReader
  (reader-url [this]
    (.url this))
  (read
    ([this]
     (protocols/read this {}))
    ([this opts]
     (protocols/read-all-sequences this opts)))
  (indexed? [this]
    ;; Dereferencing the index delay succeeds only when the index can be
    ;; obtained; a FileNotFoundException means there is none.
    (try
      @(.index-delay this)
      true
      (catch FileNotFoundException _
        false)))

  protocols/IRegionReader
  (read-in-region
    ([this region]
     (protocols/read-in-region this region {}))
    ([this region opts]
     (protocols/read-sequence this region opts)))

  protocols/ISequenceReader
  (read-seq-summaries [this]
    (read-seq-summaries this))
  (read-indices [this]
    (read-indices this))
  (read-all-sequences
    ([this]
     (protocols/read-all-sequences this {}))
    ([this opts]
     (sequential-read this opts)))
  (read-sequence
    ([this region]
     (protocols/read-sequence this region {}))
    ([this region opts]
     (read-sequence this region opts))))