Functions to remove duplications. | (ns cljam.algo.dedupe (:refer-clojure :exclude [dedupe]) (:require [com.climate.claypoole :as cp] [cljam.io.sam :as sam] [cljam.io.sam.util.flag :as flag])) |
(defn- refs->regions [refs] (for [{:keys [len] name' :name} refs] {:chr name' :start 1 :end len})) | |
(defn- sum-quals [a] (when-let [q ^String (:qual a)] (when-not (and (= (.length q) 1) (= \* (.charAt q 0))) (reduce (fn [^long r x] (+ r (- (int x) 33))) 0 q)))) | |
Returns a transducer which removes PCR duplications. | (defn dedupe-xform [& {:keys [remove-dups] :or {remove-dups true}}] (let [removed (atom #{}) cp (juxt sum-quals :mapq)] (comp (partition-by (juxt :rname :pos)) (mapcat (fn [alns] (loop [[{:keys [pos tlen qname flag] :as x} :as xs] alns heads {} tails [] dups []] (if x (cond (not (and (flag/multiple? flag) (not (flag/both-unmapped? flag)) (= (:rnext x) "="))) (recur (next xs) heads (conj tails x) dups) (pos? (long tlen)) (let [k [pos tlen]] (if-let [v (get heads k)] (let [[good bad] (if (pos? (compare (cp v) (cp x))) [v x] [x v])] (swap! removed conj (:qname bad)) (recur (next xs) (assoc heads k good) tails (conj dups bad))) (recur (next xs) (assoc heads k x) tails dups))) (@removed qname) (do (swap! removed disj qname) (recur (next xs) heads tails (conj dups x))) :else (recur (next xs) heads (conj tails x) dups)) (concat (vals heads) tails (when-not remove-dups (map (fn [a] (update a :flag #(bit-or (int %) 1024))) dups)))))))))) |
Removes PCR duplications from paired-end alignments. | (defn dedupe [in out & {:keys [remove-dups] :or {remove-dups true}}] (cp/with-shutdown! [pool (cp/ncpus)] (let [[header refs] (with-open [r (sam/bam-reader in)] [(sam/read-header r) (sam/read-refs r)])] (with-open [w (sam/bam-writer out)] (sam/write-header w header) (sam/write-refs w header) (sam/write-alignments w (->> refs refs->regions (cp/pmap pool (fn [region] (with-open [r (sam/bam-reader in)] (->> (sam/read-alignments r region) (sequence (dedupe-xform :remove-dups remove-dups)) doall)))) (sequence cat)) header))))) |