Makes a sequence dictionary from FASTA data, and writes it to a file.

(ns cljam.io.dict.writer
  (:require [digest]
            [cljam.io.sam.common :refer [sam-version]]
            [cljam.util :refer [string->bytes graph?]])
  (:import java.io.BufferedWriter))

DICTWriter

;; Handle used when writing a sequence dictionary. It holds the underlying
;; BufferedWriter in `writer` and stores `url` as given (`url` is not read
;; anywhere inside this type).
(deftype DICTWriter [^java.io.BufferedWriter writer url]
  java.io.Closeable
  ;; Closing the handle closes the wrapped BufferedWriter.
  (close [_this]
    (.close writer)))

Making dict

Equal to (- (byte (int \A)) (byte (int \a))), i.e. -32: the offset added to a lower-case ASCII byte to obtain its upper-case counterpart.

(def ^{:const true, :private true} upper-case-offset
  "Offset added to a lower-case ASCII byte to obtain the matching
  upper-case byte. Equal to `(- (byte (int \\A)) (byte (int \\a)))`."
  -32)
(defn- upper-case
  "Returns the upper-case form of the ASCII byte value `b`.
  Values in the range \\a..\\z are shifted by `upper-case-offset` and
  returned as a byte; every other value is returned unchanged."
  [^long b]
  (if (<= (int \a) b (int \z))
    ;; lower-case ASCII letter: shift it into the A..Z range
    (byte (+ b upper-case-offset))
    b))

Normalizes the sequence string, calculates its MD5 hash, and returns it.

(defn- make-hash
  "Normalizes `sequence'` and returns its hash.
  The string is converted to bytes with `string->bytes`, every lower-case
  ASCII letter is upper-cased in place via `upper-case`, and the resulting
  byte array is passed to `digest/md5`, whose result is returned."
  [sequence']
  (let [^bytes bases (string->bytes sequence')]
    ;; Use primitive array access (alength/aget/aset) rather than the generic
    ;; count/nth, which go through the slow generic collection path on arrays
    ;; and would be re-evaluated for every base.
    (dotimes [i (alength bases)]
      (aset bases i (byte (upper-case (long (aget bases i))))))
    (digest/md5 bases)))
(defn- init-dict-status
  "Returns the initial accumulator for one sequence: an empty sequence
  string under `:sequence` and a length of zero under `:len`."
  []
  ;; The value for :sequence must be present; `{:sequence , :len 0}` has an
  ;; odd number of forms (commas are whitespace) and cannot be read.
  {:sequence "" :len 0})
(defn- update-dict-status
  "Returns a new status map with `sequence'` appended to the accumulated
  `:sequence` string and `:len` increased by the number of characters in
  `sequence'` for which `graph?` is truthy."
  [dict-status sequence']
  (let [{so-far :sequence, total :len} dict-status
        ;; count only the characters accepted by graph?
        total' (reduce (fn [n ch] (if (graph? ch) (inc n) n))
                       (long total)
                       sequence')]
    {:sequence (str so-far sequence')
     :len total'}))

Calculates sequence dictionary from the headers and sequences, returning it as a map.

(defn make-dict
  "Calculates the sequence dictionary for `sequences` and returns it as a map
  from sequence name to `{:blen ..., :ur ..., :m5 ...}`.

  `sequences` is a sequential collection of maps with `:name` and `:sequence`
  keys; consecutive elements sharing the same `:name` are accumulated into a
  single entry. `ur` is stored unchanged under `:ur` in every entry.
  `_headers` is accepted but not used.

  Returns an empty map when `sequences` is empty, instead of an entry keyed
  by nil."
  [_headers sequences ur]
  (let [entry-for (fn [status]
                    ;; dictionary value for one fully accumulated sequence
                    {:blen (:len status)
                     :ur ur
                     :m5 (make-hash (:sequence status))})]
    (if (empty? sequences)
      {}
      (loop [[chunk & more] sequences
             current (:name chunk)
             status (init-dict-status)
             dicts {}]
        (if chunk
          (let [chunk-name (:name chunk)
                ;; a name change means the previous sequence is complete
                new? (not= chunk-name current)
                status' (update-dict-status
                         (if new? (init-dict-status) status)
                         (:sequence chunk))
                dicts' (if new?
                         (assoc dicts current (entry-for status))
                         dicts)]
            (recur more chunk-name status' dicts'))
          ;; input exhausted: flush the sequence still being accumulated
          (assoc dicts current (entry-for status)))))))

Writing

(defn- write-header!
  "Writes the `@HD` header line to `wtr`, with `VN` taken from `sam-version`
  and the sort order given as `unsorted`, followed by a line separator."
  [^BufferedWriter wtr]
  (let [line (str "@HD\tVN:" sam-version "\tSO:unsorted")]
    (.write wtr line)
    (.newLine wtr)))
(defn- write-sequence!
  "Writes one `@SQ` line to `wtr` with the tab-separated fields `SN` (name'),
  `LN` (blen), `M5` (m5) and `UR` (ur), in that order, followed by a line
  separator."
  [^BufferedWriter wtr name' blen ur m5]
  (let [fields ["@SQ"
                (str "SN:" name')
                (str "LN:" blen)
                (str "M5:" m5)
                (str "UR:" ur)]
        ^String line (apply str (interpose "\t" fields))]
    (.write wtr line)
    (.newLine wtr)))
(defn- write-dict*!
  "Builds the dictionary with `make-dict`, then writes the `@HD` line followed
  by one `@SQ` line per element of `headers`, in `headers` order. Each entry
  is looked up in the dictionary by the header's `:name`."
  [wtr headers sequences ur]
  ;; Build the whole dictionary before anything is written.
  (let [dicts (make-dict headers sequences ur)]
    (write-header! wtr)
    (doseq [{seq-name :name} headers
            :let [{entry-len :blen, entry-ur :ur, entry-m5 :m5} (get dicts seq-name)]]
      (write-sequence! wtr seq-name entry-len entry-ur entry-m5))))

Writes the sequence dictionary for the given FASTA headers and sequences to wtr.

(defn write-dict!
  "Writes the sequence dictionary for `headers` and `sequences` through the
  BufferedWriter held by the DICTWriter `wtr`; `ur` is passed on unchanged
  to the dictionary builder."
  [^DICTWriter wtr headers sequences ur]
  (let [out (.writer wtr)]
    (write-dict*! out headers sequences ur)))