Functions to read and write the FASTQ format.

(ns cljam.io.fastq
  (:require [clojure.java.io :as cio]
            [clojure.string :as string]
            [cljam.io.protocols :as protocols]
            [cljam.util :as util])
  (:import [java.io Closeable]
           [java.nio Buffer CharBuffer])
  (:refer-clojure :exclude [name sequence]))
;; Forward declarations: read-sequences is called by FASTQReader's `read`
;; methods before it is defined later in this namespace; write-sequences is
;; declared alongside it.
(declare read-sequences write-sequences)
;; Reader handle for FASTQ input. `reader` is the underlying reader object
;; (closed by `close`); `url` is the value returned by `reader-url`.
(deftype FASTQReader [reader url]
  Closeable
  (close [_]
    ;; deftype fields are in scope inside method bodies.
    (.close ^Closeable reader))
  protocols/IReader
  (reader-url [_] url)
  ;; Both `read` arities delegate to read-sequences.
  (read [this] (read-sequences this))
  (read [this opts] (read-sequences this opts))
  ;; This reader always reports itself as not indexed.
  (indexed? [_] false))
;; Writer handle for FASTQ output. `writer` is the underlying writer object
;; (closed by `close`); `url` is the value returned by `writer-url`.
(deftype FASTQWriter [writer url]
  Closeable
  (close [_]
    ;; deftype fields are in scope inside method bodies.
    (.close ^Closeable writer))
  protocols/IWriter
  (writer-url [_] url))

Returns an open cljam.io.fastq.FASTQReader of f. Should be used inside with-open to ensure the reader is properly closed.

(defn reader
  "Opens `f` for reading and returns a FASTQReader.

  The stream obtained from `util/compressor-input-stream` is wrapped with
  `clojure.java.io/reader` and stored in the FASTQReader together with
  `(util/as-url f)`. Use inside `with-open` so the underlying reader is
  closed."
  ^FASTQReader
  [f]
  (let [in (cio/reader (util/compressor-input-stream f))
        url (util/as-url f)]
    (FASTQReader. in url)))

Returns an open cljam.io.fastq.FASTQWriter of f. Should be used inside with-open to ensure the writer is properly closed.

(defn writer
  "Opens `f` for writing and returns a FASTQWriter.

  The stream obtained from `util/compressor-output-stream` is wrapped with
  `clojure.java.io/writer` and stored in the FASTQWriter together with
  `(util/as-url f)`. Use inside `with-open` so the underlying writer is
  closed."
  ^FASTQWriter
  [f]
  (let [out (cio/writer (util/compressor-output-stream f))
        url (util/as-url f)]
    (FASTQWriter. out url)))
;; One FASTQ record: `name` (deserialize-fastq stores it without the leading
;; "@"), `sequence`, and `quality` (a seq of decoded scores or the raw quality
;; string, depending on the decode option used by deserialize-fastq).
(defrecord FASTQRead [^String name ^String sequence quality])

Deserializes a read from four lines of a FASTQ file.

(defn- deserialize-fastq
  "Builds a FASTQRead from the four lines of one record: name line, sequence
  line, plus line and quality line.

  `decode-quality` (default `:phred33`) selects how the quality line is
  decoded: `:phred33` subtracts 33 and `:phred64` subtracts 64 from each
  character code, giving a lazy seq of numbers; any other value keeps the
  quality line as the raw string.

  The pre- and postconditions fail (AssertionError, when assertions are
  enabled) if any line is missing or empty, the name line does not start with
  `@` followed by at least one character, the plus line does not start with
  `+`, a non-empty plus-line label differs from the name, the sequence and
  quality lengths differ, or a decoded score is outside 0-93 (phred33) or
  0-62 (phred64)."
  ^FASTQRead
  [[^String name-line ^String seq-line ^String plus-line ^String qual-line]
   {:keys [decode-quality] :or {decode-quality :phred33}}]
  {:pre [(not-empty name-line)
         (not-empty seq-line)
         (not-empty plus-line)
         (not-empty qual-line)
         (= (first name-line) \@)
         (= (first plus-line) \+)
         (not-empty (rest name-line))
         (or (empty? (rest plus-line))
             (= (rest plus-line) (rest name-line)))
         (= (count seq-line) (count qual-line))]
   :post [(every? (fn [q] (case decode-quality
                            :phred33 (<= 0 q 93)
                            :phred64 (<= 0 q 62)
                            true))
                  (:quality %))]}
  (let [read-name (subs name-line 1) ; drop the leading "@"
        ;; nil offset means: leave the quality line undecoded
        offset (case decode-quality
                 :phred33 33
                 :phred64 64
                 nil)
        quality (if offset
                  (map (fn [c] (- (int c) offset)) qual-line)
                  qual-line)]
    (FASTQRead. read-name seq-line quality)))

Returns a lazy sequence of FASTQReads deserialized from the given reader.

(defn read-sequences
  "Returns a sequence of FASTQReads parsed from the lines of `rdr`.

  Each line is trimmed, lines are grouped four at a time, and every group is
  passed to deserialize-fastq together with `opts` (defaults to `{}`)."
  ([rdr]
   (read-sequences rdr {}))
  ([^FASTQReader rdr opts]
   (let [parse-record (fn [record-lines] (deserialize-fastq record-lines opts))
         xform (comp (map string/trim)
                     (partition-all 4)
                     (map parse-record))
         lines (line-seq (.reader rdr))]
     ;; `sequence` is excluded from clojure.core in this namespace, hence the
     ;; fully qualified call.
     (clojure.core/sequence xform lines))))

Serializes a FASTQRead to FASTQ format string.

(defn- serialize-fastq
  "Renders one read (a map/record with `:name`, `:sequence` and `:quality`)
  as a four-line FASTQ string: `@name`, the sequence, a bare `+`, and the
  quality line, each terminated by a newline.

  `encode-quality` (default `:phred33`) controls how a non-string quality
  collection is written: `:phred33` adds 33 and `:phred64` adds 64 to each
  score before converting it to a character; any other value converts each
  element with `char` as is. A quality given as a string is written verbatim.

  Preconditions require non-empty name, sequence and quality, equal sequence
  and quality lengths, and scores within 0-93 (phred33) or 0-62 (phred64)."
  ^String
  [^FASTQRead {:keys [^String name ^String sequence quality]}
   {:keys [encode-quality] :or {encode-quality :phred33}}]
  {:pre [(not-empty name)
         (not-empty sequence)
         (not-empty quality)
         (= (count sequence) (count quality))
         (every? #(case encode-quality
                    :phred33 (<= 0 % 93)
                    :phred64 (<= 0 % 62)
                    true) quality)]}
  (let [seq-len (.length sequence)
        ;; 6 = "@" + "+" + four newlines; the quality line is as long as the
        ;; sequence (checked by the precondition), hence seq-len twice.
        buf (CharBuffer/allocate (+ 6 (.length name) seq-len seq-len))
        ;; nil offset means: write each quality element without shifting it
        offset (case encode-quality
                 :phred33 33
                 :phred64 64
                 nil)]
    (doto buf
      (.put \@)
      (.put name)
      (.put \newline)
      (.put sequence)
      (.put \newline)
      (.put \+)
      (.put \newline))
    (if (string? quality)
      (.put buf ^String quality)
      (doseq [q quality]
        (.put buf (char (if offset
                          (+ (long q) offset)
                          q)))))
    (.put buf \newline)
    ;; flip so that toString covers exactly what was written
    (.flip ^Buffer buf)
    (.toString buf)))

Writes the given sequence of reads to a FASTQ file.

(defn write-sequences
  "Serializes every read in `sequences` with serialize-fastq (passing `opts`,
  default `{}`) and writes the resulting strings, in order, to the writer held
  by `wtr`. Returns nil."
  ([wtr sequences]
   (write-sequences wtr sequences {}))
  ([^FASTQWriter wtr sequences opts]
   (let [^java.io.Writer out (.writer wtr)
         emit (fn [fastq-read]
                (.write out ^String (serialize-fastq fastq-read opts)))]
     (run! emit sequences))))

Parses a Casava-style name of a FASTQ read.

(def ^:private casava-pattern
  ;; Optional "@", then instrument:lane:tile:x:y#index/pair.
  ;; The pair group is (\d+): the previous form (\d)+ repeated a one-digit
  ;; group, so only the LAST digit after "/" was captured.
  ;; Note: the "^" inside [^\s^:] is a literal, so instrument names that
  ;; contain "^" are rejected as well as ones with whitespace or ":".
  #"^@?([^\s^:]+):(\d+):(\d+):(\d+):(\d+)#(\d+)/(\d+)$")
(defn deserialize-casava-name
  "Parses a Casava-style read name such as
  `@HWUSI-EAS100R:6:73:941:1973#0/1` (the leading `@` is optional).

  Returns a map with `:instrument` (string) and `:lane`, `:tile`, `:x`, `:y`,
  `:index`, `:pair` (integers parsed with Integer/parseInt), or nil when
  `name` does not match the pattern."
  [^String name]
  (when-let [[_ instrument lane tile x y index pair]
             (re-matches casava-pattern name)]
    {:instrument instrument
     :lane (Integer/parseInt lane)
     :tile (Integer/parseInt tile)
     :x (Integer/parseInt x)
     :y (Integer/parseInt y)
     :index (Integer/parseInt index)
     :pair (Integer/parseInt pair)}))

Encodes a FASTQ name map to a Casava-style string.

(defn serialize-casava-name
  "Formats a name map as `instrument:lane:tile:x:y#index/pair`.

  Returns nil unless `:instrument`, `:lane`, `:tile`, `:x`, `:y`, `:index`
  and `:pair` are all present with truthy values."
  ^String
  [{:keys [instrument lane tile x y index pair]}]
  (let [location [instrument lane tile x y]]
    (when (every? identity (conj location index pair))
      (str (apply str (interpose \: location)) \# index \/ pair))))

Parses a Casava 1.8-style name of a FASTQ read.

(def ^:private casava-1_8-pattern
  ;; Optional "@", then instrument:run:flowcell:lane:tile:x:y, whitespace,
  ;; then pair:filtered(Y|N):control:index.
  #"^@?([^\s^:]+):(\d+):([^\s^\:]+):(\d+):(\d+):(\d+):(\d+)\s+(\d+):(Y|N):(\d+):(\S+)$")
(defn deserialize-casava-1_8-name
  "Parses a Casava 1.8-style read name such as
  `@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG` (the leading `@`
  is optional).

  Returns a map with `:instrument`, `:flowcell` and `:index` as strings,
  `:run`, `:lane`, `:tile`, `:x`, `:y`, `:pair` and `:control` as integers
  parsed with Integer/parseInt, and `:filtered` as a boolean (true for `Y`).
  Returns nil when `name` does not match the pattern."
  [^String name]
  (when-let [[_ instrument run flowcell lane tile x y pair filtered control index]
             (re-matches casava-1_8-pattern name)]
    (let [parse-int (fn [^String digits] (Integer/parseInt digits))]
      {:instrument instrument
       :run (parse-int run)
       :flowcell flowcell
       :lane (parse-int lane)
       :tile (parse-int tile)
       :x (parse-int x)
       :y (parse-int y)
       :pair (parse-int pair)
       :filtered (= filtered "Y")
       :control (parse-int control)
       :index index})))

Encodes a FASTQ name map to a Casava 1.8-style string.

(defn serialize-casava-1_8-name
  "Formats a name map as
  `instrument:run:flowcell:lane:tile:x:y pair:Y|N:control:index`, writing `Y`
  when `:filtered` is truthy and `N` otherwise.

  Returns nil unless `:filtered` is non-nil and all of `:instrument`, `:run`,
  `:flowcell`, `:lane`, `:tile`, `:x`, `:y`, `:pair`, `:control` and `:index`
  are present with truthy values."
  ^String
  [{:keys [instrument run flowcell lane tile x y pair filtered control index]}]
  (let [location [instrument run flowcell lane tile x y]
        flag (if filtered \Y \N)]
    (when (and (every? identity (conj location pair control index))
               (some? filtered))
      (str (apply str (interpose \: location))
           " "
           (apply str (interpose \: [pair flag control index]))))))

Tries parsing the name of a FASTQ read, first as a Casava 1.8-style name and then as a Casava-style name.

(defn deserialize-name
  "Parses a read name by trying deserialize-casava-1_8-name first and
  deserialize-casava-name second.

  Returns the first parser's non-nil result, or nil when neither parser
  accepts `name`."
  [^String name]
  ;; `some` stops at the first parser that succeeds; `keep` over a vector is
  ;; chunked and would run every parser even after a match.
  (some #(% name) [deserialize-casava-1_8-name deserialize-casava-name]))

Tries encoding the name of a FASTQ read, first as a Casava 1.8-style string and then as a Casava-style string.

(defn serialize-name
  "Encodes a name map by trying serialize-casava-1_8-name first and
  serialize-casava-name second.

  Returns the first encoder's non-nil string, or nil when the map lacks the
  keys required by both formats."
  ^String
  [name]
  ;; `some` stops at the first encoder that succeeds; `keep` over a vector is
  ;; chunked and would run every encoder even after a match.
  (some #(% name) [serialize-casava-1_8-name serialize-casava-name]))