| |
A type of VCF reader and internal functions to read VCF contents. See
https://samtools.github.io/hts-specs/ for the detail VCF specifications.
| ( ns cljam.io.vcf.reader
( :require [ clojure.string :as cstr ]
[ clojure.tools.logging :as logging ]
[ cljam.io.protocols :as protocols ]
[ camel-snake-kebab.core :refer [ ->kebab-case-keyword ] ]
[ proton.core :refer [ as-long ] ]
[ cljam.io.util.bin :as util-bin ]
[ cljam.io.vcf.util :as vcf-util ] )
( :import [ java.io Closeable BufferedReader FileNotFoundException ]
[ clojure.lang LazilyPersistentVector ]
bgzf4j.BGZFInputStream ) )
|
|
VCFReader
| |
| ( declare read-variants read-variants-randomly read-file-offsets )
|
|
| ( deftype VCFReader [ url meta-info header reader index-delay ]
Closeable
( close [ this ]
( .close ^ Closeable ( .reader this ) ) )
protocols/IReader
( reader-url [ this ] ( .url this ) )
( read [ this ] ( read-variants this ) )
( read [ this option ] ( read-variants this option ) )
( indexed? [ _ ]
( try
@ index-delay
true
( catch FileNotFoundException _
false ) ) )
protocols/IRegionReader
( read-in-region [ this region ]
( protocols/read-in-region this region { } ) )
( read-in-region [ this { :keys [ chr start end ] } option ]
( logging/warn "May cause degradation of performance." )
( filter
( fn [ v ]
( and ( if chr ( = ( :chr v ) chr ) true )
( if start ( <= ( long start ) ( long ( :pos v ) ) ) true )
( if end ( <= ( + ( long ( :pos v ) ) ( count ( :ref v ) ) ) ( long end ) ) true ) ) )
( read-variants this option ) ) ) )
|
|
need dynamic extension for namespace issue.
| ( extend-type VCFReader
protocols/IVariantReader
( meta-info [ this ] ( .meta-info this ) )
( header [ this ] ( .header this ) )
( read-variants
( [ this ] ( protocols/read-variants this { } ) )
( [ this option ] ( read-variants this option ) ) )
( read-variants-randomly [ this region-option deep-option ]
( read-variants-randomly this region-option deep-option ) )
( read-file-offsets [ this ]
( read-file-offsets this ) ) )
|
|
Utilities
| |
| ( defn- dot->nil
[ ^ String s ]
( when-not ( and ( = 1 ( .length s ) ) ( = \. ( .charAt s 0 ) ) ) s ) )
|
|
Loading meta-information
| |
| ( defn- meta-line?
[ line ]
( = ( subs line 0 2 ) "##" ) )
|
|
e.g. (parse-structured-line "ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\"")
=> {:id "NS" :number "1" :type "Integer"
:description "Number of Samples With Data"}
| ( defn- parse-structured-line
"e.g. (parse-structured-line \"ID=NS,Number=1,Type=Integer,Description=\\\"Number of Samples With Data\\\"\")
=> {:id \"NS\" :number \"1\" :type \"Integer\"
:description \"Number of Samples With Data\"}"
[ s ]
( letfn [ ( parse-string-field [ s ]
( loop [ s s , acc [ ] ]
( if-let [ [ _ pre c post ] ( re-matches #"([^\\\"]*?)([\\\"])(.*)" s ) ]
( if ( = c "\\" )
( if ( #{ \\ \" } ( first post ) )
( recur ( subs post 1 ) ( conj acc pre ( first post ) ) )
( let [ msg ( str "Either '\\' or '\"' was expected immediately after '\\', "
"but got '" ( first post ) "'" ) ]
( throw ( RuntimeException. msg ) ) ) )
[ nil ( cstr/join ( conj acc pre ) ) ( cstr/replace post #"^," "" ) ] )
( throw ( RuntimeException. "Unexpected end of string field" ) ) ) ) ) ]
( loop [ s s , m { } ]
( if-let [ [ _ k more ] ( some->> s ( re-matches #"([^=]+?)=(.*)" ) ) ]
( let [ [ _ v more ] ( if ( = ( first more ) \" )
( parse-string-field ( subs more 1 ) )
( re-matches #"([^,]*?)(?:,(.*))?" more ) ) ]
( recur more ( assoc m ( ->kebab-case-keyword k ) ( dot->nil v ) ) ) )
m ) ) ) )
|
|
| ( defn- parse-meta-info-contig
[ m ]
( update m :length as-long ) )
|
|
| ( defn- parse-meta-info-info
[ m ]
( update m :number ( fn [ s ]
( if ( #{ "A" "R" "G" } s )
s
( as-long s ) ) ) ) )
|
|
Parses a single line string as meta-info.
| ( defn parse-meta-info-line
[ line ]
( let [ [ _ k* v ] ( re-find #"^##([\w:/\.\?\-]*)=(.*)$" line )
k ( ->kebab-case-keyword k* ) ]
[ k ( if-let [ [ _ s ] ( re-find #"^<(.+)>$" v ) ]
( cond-> ( parse-structured-line s )
( #{ :info :format } k ) parse-meta-info-info
( = k :contig ) parse-meta-info-contig )
v ) ] ) )
|
|
Reads from rdr and parses them as meta-info.
| ( defn load-meta-info
[ ^ BufferedReader rdr ]
( loop [ line ( .readLine rdr ) , meta-info { } ]
( if ( meta-line? line )
( let [ [ k v ] ( parse-meta-info-line line ) ]
( recur ( .readLine rdr )
( if ( #{ :contig :info :filter :format :alt :sample :pedigree } k )
( if ( get meta-info k )
( update meta-info k conj v )
( assoc meta-info k [ v ] ) )
( assoc meta-info k v ) ) ) )
meta-info ) ) )
|
|
Loading header
| |
| ( defn- header-line?
[ line ]
( not ( nil? ( re-find #"^#[^#]*$" line ) ) ) )
|
|
Parses a single line string as header
| ( defn parse-header-line
[ line ]
( cstr/split ( subs line 1 ) #"\t" ) )
|
|
Reads from rdr and parses them as header.
| ( defn load-header
[ ^ BufferedReader rdr ]
( loop [ line ( .readLine rdr ) ]
( if ( header-line? line )
( parse-header-line line )
( recur ( .readLine rdr ) ) ) ) )
|
|
Reading data lines
| |
| ( defn- parse-data-line
[ line kws ]
( let [ [ fields gt-fields ] ( ->> ( .split ^ String line "\t" )
LazilyPersistentVector/createOwning
( map dot->nil )
( split-at 8 ) ) ]
( ->> gt-fields
( interleave kws )
( concat [ :chr ( first fields )
:pos ( as-long ( nth fields 1 ) )
:id ( nth fields 2 )
:ref ( nth fields 3 )
:alt ( some-> ( nth fields 4 ) ( cstr/split #"," ) )
:qual ( some-> ( nth fields 5 ) Double/parseDouble )
:filter ( nth fields 6 )
:info ( nth fields 7 ) ] )
( apply hash-map ) ) ) )
|
|
| ( defn- read-data-lines
[ rdr header kws ]
( when-let [ line ( if ( instance? BufferedReader rdr )
( .readLine ^ BufferedReader rdr )
( .readLine ^ BGZFInputStream rdr ) ) ]
( if-not ( or ( meta-line? line ) ( header-line? line ) )
( cons ( parse-data-line line kws )
( lazy-seq ( read-data-lines rdr header kws ) ) )
( recur rdr header kws ) ) ) )
|
|
Reads variants of the VCF/BCF file, returning them as a lazy sequence.
| ( defn read-variants
( [ rdr ]
( read-variants rdr { } ) )
( [ ^ VCFReader rdr { :keys [ depth ] :or { depth :deep } } ]
( let [ kws ( mapv keyword ( drop 8 ( .header rdr ) ) )
parse-fn ( case depth
:deep ( vcf-util/variant-parser ( .meta-info rdr ) ( .header rdr ) )
:vcf identity ) ]
( map parse-fn ( read-data-lines ( .reader rdr ) ( .header rdr ) kws ) ) ) ) )
|
|
| ( defn- make-lazy-variants [ f s ]
( when-first [ fs s ]
( lazy-cat
( f fs )
( make-lazy-variants f ( rest s ) ) ) ) )
|
|
Reads variants of the bgzip compressed VCF file randomly using tabix/csi file
Returning them as a lazy sequence.
| ( defn read-variants-randomly
[ ^ VCFReader rdr
{ :keys [ chr ^ long start ^ long end ] :or { start 1 end 4294967296 } }
{ :keys [ depth ] :or { depth :deep } } ]
( let [ kws ( mapv keyword ( drop 8 ( .header rdr ) ) )
index-data @ ( .index-delay rdr )
chr-names ( or ( util-bin/get-chr-names index-data )
( ->> ( .meta-info rdr ) :contig ( mapv :id ) ) )
ref-idx ( .indexOf ^ clojure.lang.PersistentVector chr-names chr )
spans ( when-not ( neg? ref-idx )
( util-bin/get-spans index-data ref-idx start end ) )
input-stream ^ BGZFInputStream ( .reader rdr )
parse-fn ( case depth
:deep ( vcf-util/variant-parser ( .meta-info rdr )
( .header rdr ) )
:vcf identity ) ]
( make-lazy-variants
( fn [ [ chunk-beg ^ long chunk-end ] ]
( .seek input-stream chunk-beg )
( ->> #( when ( < ( .getFilePointer input-stream ) chunk-end )
( -> input-stream
.readLine
( parse-data-line kws )
parse-fn ) )
repeatedly
( take-while identity )
( filter
( fn [ { chr' :chr
ref' :ref
:keys [ ^ long pos info ] } ]
( and ( = chr' chr )
( <= pos end )
( <= start
( long ( get info :END ( dec ( + pos ( count ref' ) ) ) ) ) ) ) ) ) ) )
spans ) ) )
|
|
Reads file offsets and a genomic position of variants from bgzip compressed
VCF and returns them as a lazy sequence. Each element is a map containing
:chr, :chr-index, :beg, :end, :file-beg, :file-end.
| ( defn read-file-offsets
[ ^ VCFReader rdr ]
( let [ ^ BGZFInputStream input-stream ( .reader rdr )
meta-info-contigs ( ->> ( :contig ( .meta-info rdr ) )
( map-indexed ( fn [ index contig ]
[ ( :id contig ) index ] ) )
( into { } ) )
parse ( comp ( vcf-util/variant-parser ( .meta-info rdr )
( take 8 ( .header rdr ) ) )
#( parse-data-line % nil ) ) ]
( letfn [ ( step [ contigs beg-pointer ]
( when-let [ line ( .readLine input-stream ) ]
( let [ end-pointer ( .getFilePointer input-stream ) ]
( if ( or ( meta-line? line ) ( header-line? line ) )
( lazy-seq ( step contigs end-pointer ) )
( let [ { :keys [ chr pos info ] ref' :ref } ( parse line )
contigs' ( if ( contains? contigs chr )
contigs
( assoc contigs chr ( count contigs ) ) ) ]
( cons { :file-beg beg-pointer , :file-end end-pointer
:chr-index ( contigs' chr ) , :beg pos , :chr chr ,
:end ( or ( :END info )
( dec ( + ( long pos ) ( count ref' ) ) ) ) }
( lazy-seq ( step contigs' end-pointer ) ) ) ) ) ) ) ) ]
( step meta-info-contigs 0 ) ) ) )
|
|
| |