From 2dc9d10e441a979c742877491094ad4cdb5957d0 Mon Sep 17 00:00:00 2001 From: shenwei356 Date: Sat, 17 Sep 2016 11:10:27 +0800 Subject: [PATCH] v0.3.4 --- README.md | 31 +++++++++++++++++-------------- doc/docs/download.md | 9 ++++++--- doc/docs/usage.md | 8 +++++--- doc/site | 2 +- seqkit/cmd/fx2tab.go | 29 ++++++++++++++++++++++++++++- seqkit/cmd/helper.go | 2 +- seqkit/cmd/replace.go | 11 +++++++++-- seqkit/download_all_binaries.sh | 2 +- 8 files changed, 68 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 965df275..b56658ba 100644 --- a/README.md +++ b/README.md @@ -10,19 +10,22 @@ Latest version: [![Latest Version](https://img.shields.io/github/release/shenwei ## Introduction -FASTA and FASTQ are basic formats for storing nucleotide and protein sequences. -The manipulations of FASTA/Q file include converting, clipping, searching, -filtering, deduplication, splitting, shuffling, sampling and so on. -Existed tools only implemented parts of the functions, -and some of them are only available for specific operating systems. -Furthermore, the complicated installation process of dependencies packages and -running environment also make them less friendly to common users. - -SeqKit is a cross-platform, ultrafast, and practical FASTA/Q manipulations tool -that is friendly for researchers to complete wide ranges of FASTA/Q file processing. -The toolkit supports plain or gzip-compressed input and output -from either standard stream or files, -therefore, it could be easily used in command-line pipe. +FASTA and FASTQ are basic and ubiquitous formats for storing nucleotide and +protein sequences. Common manipulations of FASTA/Q file include converting, +searching, filtering, deduplication, splitting, shuffling, and sampling. +Existing tools only implement some of these manipulations, +and not particularly efficiently, and some are only available for certain +operating systems. 
Furthermore, the complicated installation process of +required packages and running environments can render these programs less +user friendly. + +This project describes a cross-platform ultrafast comprehensive +toolkit for FASTA/Q processing. SeqKit provides executable binary files for +all major operating systems, including Windows, Linux, and Mac OS X, and can +be directly used without any dependencies or pre-configurations. +SeqKit demonstrates competitive performance in execution time and memory +usage compared to similar tools. The efficiency and usability of SeqKit +enable researchers to rapidly accomplish common FASTA/Q file manipulations. ## Features @@ -203,7 +206,7 @@ when input files are (plain or gzipped) FASTA files, FASTA index would be optional used for rapid access of sequences and reducing memory occupation. -ATTENTION: the `.seqkit.fai` file created by SeqKit is a little different from `.fai` file +ATTENTION: the `.seqkit.fai` file created by SeqKit is slightly different from `.fai` file created by `samtools`. SeqKit uses full sequence head instead of just ID as key. ### Parallelization of CPU intensive jobs diff --git a/doc/docs/download.md b/doc/docs/download.md index ff5a747c..0cd2ebd3 100644 --- a/doc/docs/download.md +++ b/doc/docs/download.md @@ -6,8 +6,10 @@ SeqKit is implemented in [Golang](https://golang.org/) programming language, ## Latest Version -[SeqKit v0.3.3](https://github.com/shenwei356/seqkit/releases/tag/v0.3.3) - +[SeqKit v0.3.4](https://github.com/shenwei356/seqkit/releases/tag/v0.3.4) +- new feature: `fx2tab` can output alphabet letters of a sequence +- new feature: new flag `-K` (`--keep-key`) for `replace`: when replacing +with a key-value file, one can choose whether to keep the key as the value when no value is found for it. 
***64-bit versions are highly recommended.*** @@ -76,7 +78,8 @@ For Go developer, just one command: go get -u github.com/shenwei356/seqkit/seqkit ## Release History - +- [SeqKit v0.3.4](https://github.com/shenwei356/seqkit/releases/tag/v0.3.4) + - add feature: `seqkit fx2tab` can print the alphabet letters of every sequence with flag `-a` (`--alphabet`) - [SeqKit v0.3.3](https://github.com/shenwei356/seqkit/releases/tag/v0.3.3) - fix bug of `seqkit replace`, wrongly starting from 2 when using `{nr}` in `-r` (`--replacement`) diff --git a/doc/docs/usage.md b/doc/docs/usage.md index 75de42bf..dc930185 100644 --- a/doc/docs/usage.md +++ b/doc/docs/usage.md @@ -1038,7 +1038,7 @@ Or use the \ escape character. more on: http://bioinf.shenwei.me/seqkit/usage/#replace -Special repalcement symbols: +Special replacement symbols (only for replacing name not sequence): {nr} Record number, starting from 1 {kv} Corresponding value of the key ($1) by key-value file @@ -1048,7 +1048,8 @@ Usage: Flags: -s, --by-seq replace seq - -i, --ignore-case ignore case + -i, --ignore-case ignore case + -K, --keep-key keep the key as value when no value found for the key -k, --kv-file string tab-delimited key-value file for replacing key with value when using "{kv}" in -r (--replacement) -p, --pattern string search regular expression -r, --replacement string replacement. supporting capture variables. e.g. $1 represents the text of the first submatch. ATTENTION: use SINGLE quote NOT double quotes in *nix OS or use the \ escape character. Record number is also supported by "{nr}" @@ -1095,12 +1096,13 @@ Examples 1. Rename with number of record - echo -e ">abc\nACTG\n>123\nATTT" | seqkit replace -p .+ -r "seq_{NR}" + echo -e ">abc\nACTG\n>123\nATTT" | seqkit replace -p .+ -r "seq_{nr}" >seq_1 ACTG >seq_2 ATTT +1. 
Replace key ## shuffle diff --git a/doc/site b/doc/site index f7f86e8b..e4beb142 160000 --- a/doc/site +++ b/doc/site @@ -1 +1 @@ -Subproject commit f7f86e8bcf80c0b21de7c2a3bceb65ff04faca42 +Subproject commit e4beb1421165cf23ddc74d97ff2078b62ec698fe diff --git a/seqkit/cmd/fx2tab.go b/seqkit/cmd/fx2tab.go index 3c554fe8..4f37489f 100644 --- a/seqkit/cmd/fx2tab.go +++ b/seqkit/cmd/fx2tab.go @@ -24,10 +24,12 @@ import ( "fmt" "io" "runtime" + "sort" + "strings" - "github.com/shenwei356/xopen" "github.com/shenwei356/bio/seq" "github.com/shenwei356/bio/seqio/fastx" + "github.com/shenwei356/xopen" "github.com/spf13/cobra" ) @@ -57,6 +59,7 @@ like sequence length, GC content/GC skew. baseContents := getFlagStringSlice(cmd, "base-content") onlyName := getFlagBool(cmd, "name") printTitle := getFlagBool(cmd, "header-line") + printAlphabet := getFlagBool(cmd, "alphabet") outfh, err := xopen.Wopen(outFile) checkError(err) @@ -78,6 +81,10 @@ like sequence length, GC content/GC skew. outfh.WriteString(fmt.Sprintf("\t%s", bc)) } } + if printAlphabet { + outfh.WriteString("\talphabet") + } + outfh.WriteString("\n") } @@ -132,6 +139,10 @@ like sequence length, GC content/GC skew. 
outfh.WriteString(fmt.Sprintf("\t%.2f", record.Seq.BaseContent(bc)*100)) } } + + if printAlphabet { + outfh.WriteString(fmt.Sprintf("\t%s", alphabetStr(record.Seq.Seq))) + } outfh.WriteString("\n") } } @@ -148,4 +159,20 @@ func init() { fx2tabCmd.Flags().BoolP("only-id", "i", false, "print ID instead of full head") fx2tabCmd.Flags().BoolP("name", "n", false, "only print names (no sequences and qualities)") fx2tabCmd.Flags().BoolP("header-line", "H", false, "print header line") + fx2tabCmd.Flags().BoolP("alphabet", "a", false, "print alphabet letters") +} + +func alphabetStr(s []byte) string { + m := make(map[byte]struct{}) + for _, b := range s { + m[b] = struct{}{} + } + alphabet := make([]string, len(m)) + i := 0 + for a := range m { + alphabet[i] = string([]byte{a}) + i++ + } + sort.Strings(alphabet) + return strings.Join(alphabet, "") } diff --git a/seqkit/cmd/helper.go b/seqkit/cmd/helper.go index 5fdd8840..a579ef89 100644 --- a/seqkit/cmd/helper.go +++ b/seqkit/cmd/helper.go @@ -43,7 +43,7 @@ import ( ) // VERSION of seqkit -const VERSION = "0.3.3" +const VERSION = "0.3.4" func checkError(err error) { if err != nil { diff --git a/seqkit/cmd/replace.go b/seqkit/cmd/replace.go index 1aaeccc3..e4fbfa62 100644 --- a/seqkit/cmd/replace.go +++ b/seqkit/cmd/replace.go @@ -54,7 +54,7 @@ Or use the \ escape character. 
more on: http://bioinf.shenwei.me/seqkit/usage/#replace -Special repalcement symbols: +Special replacement symbols (only for replacing name not sequence): {nr} Record number, starting from 1 {kv} Corresponding value of the key ($1) by key-value file @@ -73,6 +73,7 @@ Special repalcement symbols: pattern := getFlagString(cmd, "pattern") replacement := []byte(getFlagString(cmd, "replacement")) kvFile := getFlagString(cmd, "kv-file") + keepKey := getFlagBool(cmd, "keep-key") bySeq := getFlagBool(cmd, "by-seq") // byName := getFlagBool(cmd, "by-name") @@ -100,6 +101,9 @@ Special repalcement symbols: if !regexp.MustCompile(`\(.+\)`).MatchString(pattern) { checkError(fmt.Errorf(`value of -p (--pattern) must contains "(" and ")" to capture data which is used specify the KEY`)) } + if bySeq { + checkError(fmt.Errorf(`replaceing with key-value pairs was not supported for sequence`)) + } if kvFile == "" { checkError(fmt.Errorf(`since repalcement symbol "{kv}"/"{KV}" found in value of flag -r (--replacement), tab-delimited key-value file should be given by flag -k (--kv-file)`)) } @@ -167,8 +171,10 @@ Special repalcement symbols: } if _, ok = kvs[k]; ok { r = reKV.ReplaceAll(r, []byte(kvs[k])) - } else { + } else if keepKey { r = reKV.ReplaceAll(r, found[1]) + } else { + r = reKV.ReplaceAll(r, []byte("")) } } } @@ -196,6 +202,7 @@ func init() { replaceCmd.Flags().BoolP("ignore-case", "i", false, "ignore case") replaceCmd.Flags().StringP("kv-file", "k", "", `tab-delimited key-value file for replacing key with value when using "{kv}" in -r (--replacement)`) + replaceCmd.Flags().BoolP("keep-key", "K", false, "keep the key as value when no value found for the key") } var reNR = regexp.MustCompile(`\{(NR|nr)\}`) diff --git a/seqkit/download_all_binaries.sh b/seqkit/download_all_binaries.sh index 47a3185b..61ed7080 100755 --- a/seqkit/download_all_binaries.sh +++ b/seqkit/download_all_binaries.sh @@ -1,7 +1,7 @@ #!/bin/sh rm seqkit_*.tar.gz -version="0.3.2" +version="0.3.4" wget 
https://github.com/shenwei356/seqkit/releases/download/v$version/seqkit_linux_386.tar.gz wget https://github.com/shenwei356/seqkit/releases/download/v$version/seqkit_linux_amd64.tar.gz