diff --git a/doc/docs/index.md b/doc/docs/index.md deleted file mode 120000 index fe84005..0000000 --- a/doc/docs/index.md +++ /dev/null @@ -1 +0,0 @@ -../../README.md \ No newline at end of file diff --git a/doc/docs/index.md b/doc/docs/index.md new file mode 100644 index 0000000..75e9cad --- /dev/null +++ b/doc/docs/index.md @@ -0,0 +1,141 @@ +# SeqKit - a cross-platform and ultrafast toolkit for FASTA/Q file manipulation + + +- **Documents:** [http://bioinf.shenwei.me/seqkit](http://bioinf.shenwei.me/seqkit) +([**Usage**](http://bioinf.shenwei.me/seqkit/usage/), +[**FAQ**](http://bioinf.shenwei.me/seqkit/faq/), +[**Tutorial**](http://bioinf.shenwei.me/seqkit/tutorial/), +and +[**Benchmark**](http://bioinf.shenwei.me/seqkit/benchmark/)) +- **Source code:** [https://github.com/shenwei356/seqkit](https://github.com/shenwei356/seqkit) +[![GitHub stars](https://img.shields.io/github/stars/shenwei356/seqkit.svg?style=social&label=Star&maxAge=2592000)](https://github.com/shenwei356/seqkit) +[![license](https://img.shields.io/github/license/shenwei356/seqkit.svg?maxAge=2592000)](https://github.com/shenwei356/seqkit/blob/master/LICENSE) +- **Latest version:** [![Latest Version](https://img.shields.io/github/release/shenwei356/seqkit.svg?style=flat&maxAge=86400)](https://github.com/shenwei356/seqkit/releases) +[![Github Releases](https://img.shields.io/github/downloads/shenwei356/seqkit/latest/total.svg?maxAge=3600)](http://bioinf.shenwei.me/seqkit/download/) +[![Cross-platform](https://img.shields.io/badge/platform-any-ec2eb4.svg?style=flat)](http://bioinf.shenwei.me/seqkit/download/) +[![Anaconda Cloud](https://anaconda.org/bioconda/seqkit/badges/version.svg)](https://anaconda.org/bioconda/seqkit) +- **[Please cite](#citation):** [![doi](https://img.shields.io/badge/doi-10.1371%2Fjournal.pone.0163962-blue.svg?style=flat)](https://doi.org/10.1371/journal.pone.0163962) +[![Citation
Badge](https://api.juleskreuer.eu/citation-badge.php?doi=10.1371/journal.pone.0163962)](https://scholar.google.com/citations?view_op=view_citation&hl=en&user=wHF3Lm8AAAAJ&citation_for_view=wHF3Lm8AAAAJ:zYLM7Y9cAGgC) +- **Others**: [![check in Biotreasury](https://img.shields.io/badge/Biotreasury-collected-brightgreen)](https://biotreasury.rjmart.cn/#/tool?id=10081) + +## Features + +- **Easy to install** ([download](http://bioinf.shenwei.me/seqkit/download/)) + - Providing statically linked executable binaries for multiple platforms (Linux/Windows/macOS, amd64/arm64) + - Light weight and out-of-the-box, no dependencies, no compilation, no configuration + - `conda install -c bioconda seqkit` +- **Easy to use** + - Ultrafast (see [technical-details](http://bioinf.shenwei.me/seqkit/usage/#technical-details-and-guides-for-use) and [benchmark](http://bioinf.shenwei.me/seqkit/benchmark)) + - Seamlessly parsing both FASTA and FASTQ formats + - Supporting (`gzip`/`xz`/`zstd`/`bzip2` compressed) STDIN/STDOUT and input/output file, easily integrated in pipe + - Reproducible results (configurable rand seed in `sample` and `shuffle`) + - Supporting custom sequence ID via regular expression + - Supporting [Bash/Zsh autocompletion](http://bioinf.shenwei.me/seqkit/download/#shell-completion) +- **Versatile commands** ([usages and examples](http://bioinf.shenwei.me/seqkit/usage/)) + - Practical functions supported by [38 subcommands](#subcommands) + + +## Installation + +Go to [Download Page](http://bioinf.shenwei.me/seqkit/download) for more download options and changelogs, or +install via conda: + + conda install -c bioconda seqkit + +## Subcommands + +|Category |Command |Function |Input |Strand-sensitivity|Multi-threads| +|:----------------|:-------------------------------------------------------------------|:--------------------------------------------------------------------------------------------|:--------------|:-----------------|:------------| +|Basic operation 
|[seq](https://bioinf.shenwei.me/seqkit/usage/#seq) |Transform sequences: extract ID/seq, filter by length/quality, remove gaps… |FASTA/Q | | | +| |[stats](https://bioinf.shenwei.me/seqkit/usage/#stats) |Simple statistics: #seqs, min/max_len, N50, Q20%, Q30%… |FASTA/Q | |✓ | +| |[subseq](https://bioinf.shenwei.me/seqkit/usage/#subseq) |Get subsequences by region/gtf/bed, including flanking sequences |FASTA/Q |+ or/and - | | +| |[sliding](https://bioinf.shenwei.me/seqkit/usage/#sliding) |Extract subsequences in sliding windows |FASTA/Q |+ only | | +| |[faidx](https://bioinf.shenwei.me/seqkit/usage/#faidx) |Create the FASTA index file and extract subsequences (with more features than samtools faidx)|FASTA |+ or/and - | | +| |[translate](https://bioinf.shenwei.me/seqkit/usage/#translate) |Translate DNA/RNA to protein sequence |FASTA/Q |+ or/and - | | +| |[watch](https://bioinf.shenwei.me/seqkit/usage/#watch) |Monitoring and online histograms of sequence features |FASTA/Q | | | +| |[scat](https://bioinf.shenwei.me/seqkit/usage/#scat) |Real time concatenation and streaming of fastx files |FASTA/Q | |✓ | +|Format conversion|[fq2fa](https://bioinf.shenwei.me/seqkit/usage/#fq2fa) |Convert FASTQ to FASTA format |FASTQ | | | +| |[fx2tab](https://bioinf.shenwei.me/seqkit/usage/#fx2tab) |Convert FASTA/Q to tabular format |FASTA/Q | | | +| |[fa2fq](https://bioinf.shenwei.me/seqkit/usage/#fa2fq) |Retrieve corresponding FASTQ records by a FASTA file |FASTA/Q |+ only | | +| |[tab2fx](https://bioinf.shenwei.me/seqkit/usage/#tab2fx) |Convert tabular format to FASTA/Q format |TSV | | | +| |[convert](https://bioinf.shenwei.me/seqkit/usage/#convert) |Convert FASTQ quality encoding between Sanger, Solexa and Illumina |FASTA/Q | | | +|Searching |[grep](https://bioinf.shenwei.me/seqkit/usage/#grep) |Search sequences by ID/name/sequence/sequence motifs, mismatch allowed |FASTA/Q |+ and - |partly, -m | +| |[locate](https://bioinf.shenwei.me/seqkit/usage/#locate) |Locate
subsequences/motifs, mismatch allowed |FASTA/Q |+ and - |partly, -m | +| |[amplicon](https://bioinf.shenwei.me/seqkit/usage/#amplicon) |Extract amplicon (or specific region around it), mismatch allowed |FASTA/Q |+ and - |partly, -m | +| |[fish](https://bioinf.shenwei.me/seqkit/usage/#fish) |Look for short sequences in larger sequences |FASTA/Q |+ and - | | +|Set operation |[sample](https://bioinf.shenwei.me/seqkit/usage/#sample) |Sample sequences by number or proportion |FASTA/Q | | | +| |[rmdup](https://bioinf.shenwei.me/seqkit/usage/#rmdup) |Remove duplicated sequences by ID/name/sequence |FASTA/Q |+ and - | | +| |[common](https://bioinf.shenwei.me/seqkit/usage/#common) |Find common sequences of multiple files by id/name/sequence |FASTA/Q |+ and - | | +| |[duplicate](https://bioinf.shenwei.me/seqkit/usage/#duplicate) |Duplicate sequences N times |FASTA/Q | | | +| |[split](https://bioinf.shenwei.me/seqkit/usage/#split) |Split sequences into files by id/seq region/size/parts (mainly for FASTA) |FASTA preferred| | | +| |[split2](https://bioinf.shenwei.me/seqkit/usage/#split2) |Split sequences into files by size/parts (FASTA, PE/SE FASTQ) |FASTA/Q | | | +| |[head](https://bioinf.shenwei.me/seqkit/usage/#head) |Print first N FASTA/Q records |FASTA/Q | | | +| |[head-genome](https://bioinf.shenwei.me/seqkit/usage/#head-genome) |Print sequences of the first genome with common prefixes in name |FASTA/Q | | | +| |[range](https://bioinf.shenwei.me/seqkit/usage/#range) |Print FASTA/Q records in a range (start:end) |FASTA/Q | | | +| |[pair](https://bioinf.shenwei.me/seqkit/usage/#pair) |Patch up paired-end reads from two fastq files |FASTA/Q | | | +|Edit |[replace](https://bioinf.shenwei.me/seqkit/usage/#replace) |Replace name/sequence by regular expression |FASTA/Q |+ only | | +| |[rename](https://bioinf.shenwei.me/seqkit/usage/#rename) |Rename duplicated IDs |FASTA/Q | | | +| |[concat](https://bioinf.shenwei.me/seqkit/usage/#concat) |Concatenate sequences with same ID from
multiple files |FASTA/Q |+ only | | +| |[restart](https://bioinf.shenwei.me/seqkit/usage/#restart) |Reset start position for circular genome |FASTA/Q |+ only | | +| |[mutate](https://bioinf.shenwei.me/seqkit/usage/#mutate) |Edit sequence (point mutation, insertion, deletion) |FASTA/Q |+ only | | +| |[sana](https://bioinf.shenwei.me/seqkit/usage/#sana) |Sanitize broken single line FASTQ files |FASTQ | | | +|Ordering |[sort](https://bioinf.shenwei.me/seqkit/usage/#sort) |Sort sequences by id/name/sequence/length |FASTA preferred| | | +| |[shuffle](https://bioinf.shenwei.me/seqkit/usage/#shuffle) |Shuffle sequences |FASTA preferred| | | +|BAM processing |[bam](https://bioinf.shenwei.me/seqkit/usage/#bam) |Monitoring and online histograms of BAM record features |BAM | | | +|Miscellaneous |[sum](https://bioinf.shenwei.me/seqkit/usage/#sum) |Compute message digest for all sequences in FASTA/Q files |FASTA/Q | |✓ | +| |[merge-slides](https://bioinf.shenwei.me/seqkit/usage/#merge-slides)|Merge sliding windows generated from seqkit sliding |TSV | | | + +Notes: + +- Strand-sensitivity: + - `+ only`: only processing on the positive/forward strand. + - `+ and -`: searching on both strands. + - `+ or/and -`: depends on users' flags/options/arguments. +- Multi-threads: Using the default 4 threads is fast enough for most commands; some commands can benefit from extra threads. + +## Citation + +**W Shen**, S Le, Y Li\*, F Hu\*. SeqKit: a cross-platform and ultrafast toolkit for FASTA/Q file manipulation. +***PLOS ONE***. [doi:10.1371/journal.pone.0163962](https://doi.org/10.1371/journal.pone.0163962). + + +## Contributors + +- [Wei Shen](https://github.com/shenwei356) +- [Botond Sipos](https://github.com/botond-sipos): `bam`, `scat`, `fish`, `sana`, `watch`.
+- [others](https://github.com/shenwei356/seqkit/graphs/contributors) + +## Acknowledgements + +We thank [Lei Zhang](https://github.com/jameslz) for testing SeqKit, +and also thank [Jim Hester](https://github.com/jimhester/), +author of [fasta_utilities](https://github.com/jimhester/fasta_utilities), +for advice on early performance improvements of FASTA parsing, +and [Brian Bushnell](https://twitter.com/BBToolsBio), +author of [BBMap](https://sourceforge.net/projects/bbmap/), +for advice on naming SeqKit and adding accuracy evaluation in benchmarks. +We also thank Nicholas C. Wu from the Scripps Research Institute, +USA for commenting on the manuscript +and [Guangchuang Yu](http://guangchuangyu.github.io/) +from State Key Laboratory of Emerging Infectious Diseases, +The University of Hong Kong, HK for advice on the manuscript. + +We thank [Li Peng](https://github.com/penglbio) for reporting many bugs. + +We appreciate [Klaus Post](https://github.com/klauspost) for his fantastic packages ( +[compress](https://github.com/klauspost/compress) and [pgzip](https://github.com/klauspost/pgzip) +) which accelerate gzip file reading and writing. + +## Contact + +[Create an issue](https://github.com/shenwei356/seqkit/issues) to report bugs, +propose new functions or ask for help. + +## License + +[MIT License](https://github.com/shenwei356/seqkit/blob/master/LICENSE) + +## Starchart + +Stargazers over time + diff --git a/doc/docs/usage.md b/doc/docs/usage.md index 6b687a0..2eaaf32 100644 --- a/doc/docs/usage.md +++ b/doc/docs/usage.md @@ -468,7 +468,7 @@ Usage ``` text get subsequences by region/gtf/bed, including flanking sequences. -Attentions: +Attention: 1. Use "seqkit grep" for extract subsets of sequences. "seqtk subseq seqs.fasta id.txt" equals to "seqkit grep -f id.txt seqs.fasta" @@ -696,7 +696,7 @@ Columns: 17. AvgQual average quality 18. GC(%) percentage of GC content -Attentions: +Attention: 1.
Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3) count the number of gaps or spaces. You can remove them with "seqkit seq -g": seqkit seq -g input.fasta | seqkit stats @@ -826,7 +826,7 @@ Usage ```text compute message digest for all sequences in FASTA/Q files -Attentions: +Attention: 1. Sequence headers and qualities are skipped, only sequences matter. 2. The order of sequences records does not matter. 3. Circular complete genomes are supported with the flag -c/--circular. @@ -955,7 +955,7 @@ This command is similar with "samtools faidx" but has some extra features: 3. if you have large number of IDs, you can use: seqkit faidx seqs.fasta -l IDs.txt -Attentions: +Attention: 1. The flag -U/--update-faidx is recommended to ensure the .fai file matches the FASTA file. The definition of region is 1-based and with some custom design. @@ -1636,7 +1636,7 @@ Usage ``` text search sequences by ID/name/sequence/sequence motifs, mismatch allowed -Attentions: +Attention: 0. By default, we match sequence ID with patterns, use "-n/--by-name" for matching full name instead of just ID. @@ -1824,7 +1824,7 @@ Usage ``` text locate subsequences/motifs, mismatch allowed -Attentions: +Attention: 1. Motifs could be EITHER plain sequence containing "ACTGN" OR regular expression like "A[TU]G(?:.{3})+?[TU](?:AG|AA|GA)" for ORFs. @@ -2056,7 +2056,7 @@ Usage ``` text extract amplicon (or specific region around it) via primer(s). -Attentions: +Attention: 1. Only one (the longest) matching location is returned for every primer pair. 2. Mismatch is allowed, but the mismatch location (5' or 3') is not controled. You can increase the value of "-j/--threads" to accelerate processing. @@ -2305,7 +2305,7 @@ Usage ``` text remove duplicated sequences by ID/name/sequence -Attentions: +Attention: 1. When comparing by sequences, both positive and negative strands are compared. Switch on -P/--only-positive-strand for considering the positive strand only. 
@@ -2530,7 +2530,7 @@ If you want to cut a sequence into multiple segments. E.g., cutting into segments of 40 bp and keeping the last segment which can be shorter than 40 bp. seqkit sliding -g -s 40 -W 40 input.fasta -o out.fasta -Attentions: +Attention: 1. For the two-pass mode (-2/--two-pass), The flag -U/--update-faidx is recommended to ensure the .fai file matches the FASTA file. @@ -2791,7 +2791,7 @@ Usage ```text match up paired-end reads from two fastq files -Attentions: +Attention: 1. Orders of headers in the two files better be the same (not shuffled), otherwise, it consumes a huge number of memory for buffering reads in memory. 2. Unpaired reads are optional outputted with the flag -u/--save-unpaired. @@ -3418,7 +3418,7 @@ Usage ``` text concatenate sequences with same ID from multiple files -Attentions: +Attention: 1. By default, only sequences with IDs that appear in all files are outputted. use -f/--full to output all sequences. 2. If there are more than one sequences of the same ID, we output the Cartesian @@ -3481,7 +3481,7 @@ Usage ``` text edit sequence (point mutation, insertion, deletion) -Attentions: +Attention: 1. Mutiple point mutations (-p/--point) are allowed, but only single insertion (-i/--insertion) OR single deletion (-d/--deletion) is allowed. @@ -3694,7 +3694,7 @@ seqkit will write the sequences to temporary files, and create FASTA index. Secondly, seqkit shuffles sequence IDs and extract sequences by FASTA index. -Attentions: +Attention: 1. For the two-pass mode (-2/--two-pass), The flag -U/--update-faidx is recommended to ensure the .fai file matches the FASTA file. @@ -3757,7 +3757,7 @@ seqkit will write the sequences to temporary files, and create FASTA index. Secondly, seqkit sorts sequence by head and length information and extracts sequences by FASTA index. -Attentions: +Attention: 1. For the two-pass mode (-2/--two-pass), The flag -U/--update-faidx is recommended to ensure the .fai file matches the FASTA file. 
diff --git a/seqkit/cmd/amplicon.go b/seqkit/cmd/amplicon.go index 6e2bf61..f1b5cf5 100644 --- a/seqkit/cmd/amplicon.go +++ b/seqkit/cmd/amplicon.go @@ -52,7 +52,7 @@ var ampliconCmd = &cobra.Command{ Short: "extract amplicon (or specific region around it) via primer(s)", Long: `extract amplicon (or specific region around it) via primer(s). -Attentions: +Attention: 1. Only one (the longest) matching location is returned for every primer pair. 2. Mismatch is allowed, but the mismatch location (5' or 3') is not controlled. You can increase the value of "-j/--threads" to accelerate processing. diff --git a/seqkit/cmd/concat.go b/seqkit/cmd/concat.go index 448082e..62a1b49 100644 --- a/seqkit/cmd/concat.go +++ b/seqkit/cmd/concat.go @@ -43,7 +43,7 @@ var concateCmd = &cobra.Command{ Short: "concatenate sequences with the same ID from multiple files", Long: `concatenate sequences with same ID from multiple files -Attentions: +Attention: 1. By default, only sequences with IDs that appear in all files are outputted. use -f/--full to output all sequences. 2. If there are more than one sequences of the same ID, we output the Cartesian diff --git a/seqkit/cmd/faidx.go b/seqkit/cmd/faidx.go index d72d01d..5ce75ee 100644 --- a/seqkit/cmd/faidx.go +++ b/seqkit/cmd/faidx.go @@ -52,7 +52,7 @@ This command is similar with "samtools faidx" but has some extra features: 3. if you have large number of IDs, you can use: seqkit faidx seqs.fasta -l IDs.txt -Attentions: +Attention: 1. The flag -U/--update-faidx is recommended to ensure the .fai file matches the FASTA file. The definition of region is 1-based and with some custom design. 
diff --git a/seqkit/cmd/grep.go b/seqkit/cmd/grep.go index be1c05c..feab5f4 100644 --- a/seqkit/cmd/grep.go +++ b/seqkit/cmd/grep.go @@ -50,7 +50,7 @@ var grepCmd = &cobra.Command{ Short: "search sequences by ID/name/sequence/sequence motifs, mismatch allowed", Long: fmt.Sprintf(`search sequences by ID/name/sequence/sequence motifs, mismatch allowed -Attentions: +Attention: 0. By default, we match sequence ID with patterns, use "-n/--by-name" for matching full name instead of just ID. diff --git a/seqkit/cmd/locate.go b/seqkit/cmd/locate.go index e08a5dd..6398e71 100644 --- a/seqkit/cmd/locate.go +++ b/seqkit/cmd/locate.go @@ -45,7 +45,7 @@ var locateCmd = &cobra.Command{ Short: "locate subsequences/motifs, mismatch allowed", Long: `locate subsequences/motifs, mismatch allowed -Attentions: +Attention: 1. Motifs could be EITHER plain sequence containing "ACTGN" OR regular expression like "A[TU]G(?:.{3})+?[TU](?:AG|AA|GA)" for ORFs. diff --git a/seqkit/cmd/mutate.go b/seqkit/cmd/mutate.go index 651e4ea..edf29f8 100644 --- a/seqkit/cmd/mutate.go +++ b/seqkit/cmd/mutate.go @@ -43,7 +43,7 @@ var mutateCmd = &cobra.Command{ Short: "edit sequence (point mutation, insertion, deletion)", Long: fmt.Sprintf(`edit sequence (point mutation, insertion, deletion) -Attentions: +Attention: 1. Multiple point mutations (-p/--point) are allowed, but only single insertion (-i/--insertion) OR single deletion (-d/--deletion) is allowed. diff --git a/seqkit/cmd/pair.go b/seqkit/cmd/pair.go index fd73941..ebb08bc 100644 --- a/seqkit/cmd/pair.go +++ b/seqkit/cmd/pair.go @@ -47,7 +47,7 @@ var pairCmd = &cobra.Command{ Short: "match up paired-end reads from two fastq files", Long: `match up paired-end reads from two fastq files -Attentions: +Attention: 1. Orders of headers in the two files better be the same (not shuffled), otherwise, it consumes a huge number of memory for buffering reads in memory. 2. Unpaired reads are optional outputted with the flag -u/--save-unpaired. 
diff --git a/seqkit/cmd/rmdup.go b/seqkit/cmd/rmdup.go index 7269d86..679fc2d 100644 --- a/seqkit/cmd/rmdup.go +++ b/seqkit/cmd/rmdup.go @@ -43,7 +43,7 @@ var rmdupCmd = &cobra.Command{ Short: "remove duplicated sequences by ID/name/sequence", Long: `remove duplicated sequences by ID/name/sequence -Attentions: +Attention: 1. When comparing by sequences, both positive and negative strands are compared. Switch on -P/--only-positive-strand for considering the positive strand only. diff --git a/seqkit/cmd/shuffle.go b/seqkit/cmd/shuffle.go index af28818..abc844e 100644 --- a/seqkit/cmd/shuffle.go +++ b/seqkit/cmd/shuffle.go @@ -52,7 +52,7 @@ seqkit will write the sequences to temporary files, and create FASTA index. Secondly, seqkit shuffles sequence IDs and extract sequences by FASTA index. -Attentions: +Attention: 1. For the two-pass mode (-2/--two-pass), The flag -U/--update-faidx is recommended to ensure the .fai file matches the FASTA file. diff --git a/seqkit/cmd/sort.go b/seqkit/cmd/sort.go index 0c8da0b..c9c8f0f 100644 --- a/seqkit/cmd/sort.go +++ b/seqkit/cmd/sort.go @@ -57,7 +57,7 @@ seqkit will write the sequences to temporary files, and create FASTA index. Secondly, seqkit sorts sequence by head and length information and extracts sequences by FASTA index. -Attentions: +Attention: 1. For the two-pass mode (-2/--two-pass), The flag -U/--update-faidx is recommended to ensure the .fai file matches the FASTA file. diff --git a/seqkit/cmd/split.go b/seqkit/cmd/split.go index d0ed5c7..94bafc1 100644 --- a/seqkit/cmd/split.go +++ b/seqkit/cmd/split.go @@ -60,7 +60,7 @@ If you want to cut a sequence into multiple segments. E.g., cutting into segments of 40 bp and keeping the last segment which can be shorter than 40 bp. seqkit sliding -g -s 40 -W 40 input.fasta -o out.fasta -Attentions: +Attention: 1. For the two-pass mode (-2/--two-pass), The flag -U/--update-faidx is recommended to ensure the .fai file matches the FASTA file. 
diff --git a/seqkit/cmd/stat.go b/seqkit/cmd/stat.go index 80800ed..83c49d9 100644 --- a/seqkit/cmd/stat.go +++ b/seqkit/cmd/stat.go @@ -77,7 +77,7 @@ Columns: 17. AvgQual average quality 18. GC(%) percentage of GC content -Attentions: +Attention: 1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3) count the number of gaps or spaces. You can remove them with "seqkit seq -g": seqkit seq -g input.fasta | seqkit stats diff --git a/seqkit/cmd/subseq.go b/seqkit/cmd/subseq.go index b29b95f..e3c1073 100644 --- a/seqkit/cmd/subseq.go +++ b/seqkit/cmd/subseq.go @@ -46,7 +46,7 @@ var subseqCmd = &cobra.Command{ Short: "get subsequences by region/gtf/bed, including flanking sequences", Long: fmt.Sprintf(`get subsequences by region/gtf/bed, including flanking sequences. -Attentions: +Attention: 1. Use "seqkit grep" for extract subsets of sequences. "seqtk subseq seqs.fasta id.txt" equals to "seqkit grep -f id.txt seqs.fasta" diff --git a/seqkit/cmd/sum.go b/seqkit/cmd/sum.go index 04ba6dd..22c45da 100644 --- a/seqkit/cmd/sum.go +++ b/seqkit/cmd/sum.go @@ -51,7 +51,7 @@ var sumCmd = &cobra.Command{ Short: "compute message digest for all sequences in FASTA/Q files", Long: `compute message digest for all sequences in FASTA/Q files -Attentions: +Attention: 1. Sequence headers and qualities are skipped, only sequences matter. 2. The order of sequences records does not matter. 3. Circular complete genomes are supported with the flag -c/--circular.