Functions to read and write the FASTQ format. | (ns cljam.io.fastq
(:require [clojure.java.io :as cio]
[clojure.string :as string]
[cljam.io.protocols :as protocols]
[cljam.util :as util])
(:import [java.io Closeable]
[java.nio Buffer CharBuffer])
(:refer-clojure :exclude [name sequence])) |
;; Forward declarations: FASTQReader's `read` methods below call
;; read-sequences before its definition appears in this namespace.
(declare read-sequences write-sequences) | |
;; Reader handle for FASTQ input. Holds the underlying `reader` (closed via
;; Closeable) and the `url` it was opened from. Reading is delegated to
;; read-sequences; `indexed?` always reports false.
(deftype FASTQReader [reader url]
  Closeable
  (close [_]
    (.close ^Closeable reader))
  protocols/IReader
  (reader-url [_] url)
  (read [this]
    (read-sequences this))
  (read [this opts]
    (read-sequences this opts))
  (indexed? [_]
    false))
;; Writer handle for FASTQ output. Holds the underlying `writer` (closed via
;; Closeable) and the `url` it was opened for.
(deftype FASTQWriter [writer url]
  Closeable
  (close [_]
    (.close ^Closeable writer))
  protocols/IWriter
  (writer-url [_] url))
(defn reader
  "Returns an open cljam.io.fastq.FASTQReader of f. Should be used inside
  with-open to ensure the reader is properly closed.

  The input stream is obtained from util/compressor-input-stream and wrapped
  with clojure.java.io/reader; the reader's URL comes from util/as-url."
  ^FASTQReader
  [f]
  (let [in  (util/compressor-input-stream f)
        rdr (cio/reader in)]
    (FASTQReader. rdr (util/as-url f))))
(defn writer
  "Returns an open cljam.io.fastq.FASTQWriter of f. Should be used inside
  with-open to ensure the writer is properly closed.

  The output stream is obtained from util/compressor-output-stream and wrapped
  with clojure.java.io/writer; the writer's URL comes from util/as-url."
  ^FASTQWriter
  [f]
  (let [out (util/compressor-output-stream f)
        wtr (cio/writer out)]
    (FASTQWriter. wtr (util/as-url f))))
;; A single FASTQ record.
(defrecord FASTQRead
  [^String name       ; read name, without the leading `@` marker
   ^String sequence   ; base sequence line
   quality])          ; decoded quality scores (seq of integers), or the raw
                      ; quality string when no Phred encoding is applied
(defn- deserialize-fastq
  "Deserialize a read from 4 lines of fastq file.

  The first argument is a sequential of four strings: the `@name` line, the
  sequence line, the `+` separator line (optionally repeating the name) and
  the quality line. Options:

    :decode-quality  :phred33 (default) or :phred64 to turn each quality
                     character into an integer score; any other value keeps
                     the quality line as the raw string.

  The record layout is validated through :pre/:post conditions, so malformed
  input raises an AssertionError when assertions are enabled."
  ^FASTQRead
  [[^String name-line ^String seq-line ^String plus-line ^String qual-line]
   {:keys [decode-quality] :or {decode-quality :phred33}}]
  {:pre [(not-empty name-line)
         (not-empty seq-line)
         (not-empty plus-line)
         (not-empty qual-line)
         (= (first name-line) \@)
         (= (first plus-line) \+)
         (not-empty (rest name-line))
         (or (empty? (rest plus-line))
             (= (rest plus-line) (rest name-line)))
         (= (count seq-line) (count qual-line))]
   :post [(every? (fn [q] (case decode-quality
                            :phred33 (<= 0 q 93)
                            :phred64 (<= 0 q 62)
                            true))
                  (:quality %))]}
  (let [;; ASCII offset of the selected Phred encoding; nil means "keep raw".
        offset  (case decode-quality
                  :phred33 33
                  :phred64 64
                  nil)
        quality (if offset
                  (map (fn [c] (- (int c) offset)) qual-line)
                  qual-line)]
    (FASTQRead. (subs name-line 1) seq-line quality)))
(defn read-sequences
  "Returns a lazy sequence of FASTQReads deserialized from given reader.

  Lines are trimmed, grouped four at a time and each group is passed to
  deserialize-fastq together with `opts` (defaults to an empty map)."
  ([rdr]
   (read-sequences rdr {}))
  ([^FASTQReader rdr opts]
   (let [parse-chunk (fn [chunk] (deserialize-fastq chunk opts))
         xform       (comp (map string/trim)
                           (partition-all 4)
                           (map parse-chunk))
         lines       (line-seq (.reader rdr))]
     (clojure.core/sequence xform lines))))
(defn- serialize-fastq
  "Serializes a FASTQRead to FASTQ format string.

  Produces four newline-terminated lines: `@name`, the sequence, a bare `+`
  and the quality line. Options:

    :encode-quality  :phred33 (default) or :phred64 to turn integer scores
                     into characters; any other value writes each quality
                     element as a character unchanged.

  A string `quality` is written verbatim. Inputs are validated through :pre
  conditions, so invalid reads raise an AssertionError when assertions are
  enabled."
  ^String
  [^FASTQRead {:keys [^String name ^String sequence quality]}
   {:keys [encode-quality] :or {encode-quality :phred33}}]
  {:pre [(not-empty name)
         (not-empty sequence)
         (not-empty quality)
         (= (count sequence) (count quality))
         (every? #(case encode-quality
                    :phred33 (<= 0 % 93)
                    :phred64 (<= 0 % 62)
                    true) quality)]}
  (let [seq-len (.length sequence)
        ;; 6 = '@' + '+' + four newlines; quality has the same length as
        ;; sequence (checked by the precondition above).
        capacity (+ 6 (.length name) seq-len seq-len)
        ^CharBuffer cb (CharBuffer/allocate capacity)
        ;; ASCII offset of the selected Phred encoding; nil means "as is".
        offset (case encode-quality
                 :phred33 33
                 :phred64 64
                 nil)]
    (doto cb
      (.put \@)
      (.put name)
      (.put \newline)
      (.put sequence)
      (.put \newline)
      (.put \+)
      (.put \newline))
    (if (string? quality)
      (.put cb ^String quality)
      (doseq [q quality]
        (.put cb (char (if offset
                         (+ (long q) offset)
                         q)))))
    (.put cb \newline)
    ;; Flip through the Buffer supertype, then render the written region.
    (.flip ^Buffer cb)
    (.toString cb)))
(defn write-sequences
  "Writes given sequence of reads to a FASTQ file.

  Each read is rendered with serialize-fastq using `opts` (defaults to an
  empty map) and written to the writer held by `wtr`. Returns nil."
  ([wtr sequences]
   (write-sequences wtr sequences {}))
  ([^FASTQWriter wtr sequences opts]
   (let [^java.io.Writer out (.writer wtr)]
     (run! (fn [rd]
             (.write out ^String (serialize-fastq rd opts)))
           sequences))))
(def ^:private casava-pattern
  ;; <instrument>:<lane>:<tile>:<x>:<y>#<index>/<pair>, with an optional
  ;; leading `@`. The final group is `(\d+)`: a quantifier placed outside the
  ;; group, `(\d)+`, would capture only the last digit of the pair number.
  #"^@?([^\s^:]+):(\d+):(\d+):(\d+):(\d+)#(\d+)/(\d+)$")

(defn deserialize-casava-name
  "Parses Casava-style name of fastq read.

  Expects `instrument:lane:tile:x:y#index/pair`, optionally prefixed with
  `@`. Returns a map with :instrument (string) and :lane, :tile, :x, :y,
  :index, :pair (integers), or nil when `name` does not match the format.
  Throws NumberFormatException if a numeric field does not fit in an int."
  [^String name]
  (let [[match instrument lane tile x y index pair]
        (re-matches casava-pattern name)]
    (when match
      {:instrument instrument
       :lane (Integer/parseInt lane)
       :tile (Integer/parseInt tile)
       :x (Integer/parseInt x)
       :y (Integer/parseInt y)
       :index (Integer/parseInt index)
       :pair (Integer/parseInt pair)})))
(defn serialize-casava-name
  "Encodes fastq name map to Casava-style string.

  Builds `instrument:lane:tile:x:y#index/pair` from the corresponding keys.
  Returns nil unless every one of those keys has a truthy value."
  ^String
  [{:keys [instrument lane tile x y index pair]}]
  (when (and instrument lane tile x y index pair)
    (str instrument ":" lane ":" tile ":" x ":" y "#" index "/" pair)))
(def ^:private casava-1_8-pattern
  ;; <instrument>:<run>:<flowcell>:<lane>:<tile>:<x>:<y>
  ;; <pair>:<Y|N>:<control>:<index>, with an optional leading `@` and
  ;; whitespace between the two halves.
  #"^@?([^\s^:]+):(\d+):([^\s^\:]+):(\d+):(\d+):(\d+):(\d+)\s+(\d+):(Y|N):(\d+):(\S+)$")

(defn deserialize-casava-1_8-name
  "Parses Casava1.8-style name of fastq read.

  Returns a map with :instrument, :flowcell and :index as strings, :run,
  :lane, :tile, :x, :y, :pair and :control as integers, and :filtered as a
  boolean (true for `Y`). Returns nil when `name` does not match the format."
  [^String name]
  (when-let [[_ instrument run flowcell lane tile x y pair filtered control index]
             (re-matches casava-1_8-pattern name)]
    (let [->int (fn [^String s] (Integer/parseInt s))]
      {:instrument instrument
       :run        (->int run)
       :flowcell   flowcell
       :lane       (->int lane)
       :tile       (->int tile)
       :x          (->int x)
       :y          (->int y)
       :pair       (->int pair)
       :filtered   (= filtered "Y")
       :control    (->int control)
       :index      index})))
(defn serialize-casava-1_8-name
  "Encodes fastq name map to Casava1.8-style string.

  Builds `instrument:run:flowcell:lane:tile:x:y pair:Y|N:control:index`.
  :filtered may be false (rendered as `N`) but must not be nil; every other
  key must have a truthy value. Returns nil otherwise."
  ^String
  [{:keys [instrument run flowcell lane tile x y pair filtered control index]}]
  (when (and instrument run flowcell lane tile x y pair
             (some? filtered)
             control index)
    (let [filter-flag (if filtered "Y" "N")]
      (str instrument ":" run ":" flowcell ":" lane ":" tile ":" x ":" y
           " "
           pair ":" filter-flag ":" control ":" index))))
(defn deserialize-name
  "Tries parsing name of fastq read.

  Applies the Casava 1.8 parser and then the older Casava parser, returning
  the first non-nil result, or nil when neither format matches."
  [^String name]
  (->> [deserialize-casava-1_8-name deserialize-casava-name]
       (keep (fn [parse] (parse name)))
       first))
(defn serialize-name
  "Tries encoding name of fastq read.

  Applies the Casava 1.8 encoder and then the older Casava encoder to the
  name map, returning the first non-nil string, or nil when the map lacks
  the keys either format requires."
  ^String
  [name]
  (->> [serialize-casava-1_8-name serialize-casava-name]
       (keep (fn [encode] (encode name)))
       first))