-
Notifications
You must be signed in to change notification settings - Fork 26
/
esl_alphabet.h
131 lines (119 loc) · 7.35 KB
/
esl_alphabet.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* Digital representation of biosequence symbols in Easel.
*/
#ifndef eslALPHABET_INCLUDED
#define eslALPHABET_INCLUDED
#include <esl_config.h>
#include <ctype.h> /* isascii() */
#include "easel.h"
#ifdef __cplusplus // magic to make C++ compilers happy
extern "C" {
#endif
/* Flags for alphabet types.
 *
 * Integer codes that name the kind of alphabet; the <type> field of an
 * ESL_ALPHABET holds one of these.
 *
 * Do not change existing values, only add new ones, because these codes
 * are used in file formats.
 */
#define eslUNKNOWN 0 /* 0=unknown is easel-wide convention; don't change */
#define eslRNA 1
#define eslDNA 2
#define eslAMINO 3
#define eslCOINS 4 /* for toy examples */
#define eslDICE 5 /* also for toy examples */
#define eslNONSTANDARD 6
/* ... if you add a new code here, change esl_abc_ValidateType() too. */
/* Structure: ESL_ALPHABET
 *
 * Describes one digital alphabet: its size, the mapping between text
 * symbols and digital codes, and which canonical residues each code
 * stands for.
 *
 * Digital codes run 0..Kp-1 and are laid out in a fixed order:
 *   0..K-1      canonical residues
 *   K           gap
 *   K+1..Kp-4   degeneracies
 *   Kp-3        any/unknown residue
 *   Kp-2        nonresidue
 *   Kp-1        missing data
 * For example DNA is "ACGT-RYMKSWHBVDN*~" with K=4, Kp=18.
 */
typedef struct {
int type; /* alphabet type flag: eslDNA, eslRNA, eslAMINO, eslNONSTANDARD, etc. */
int K; /* number of unique canonical symbols: e.g. 4 (DNA/RNA) or 20 (protein) */
int Kp; /* total number of codes: canonical + gap + degeneracies + any + nonresidue + missing */
char *sym; /* text symbol for each digital code, [0..Kp-1]; "ACGT-RYMKSWHBVDN*~", for instance */
ESL_DSQ inmap[128]; /* text-to-digital map, indexed by character code 0..127: inmap['A'] = 0, etc. */
char **degen; /* 1/0 flags, [0..Kp-1][0..K-1]: which canonical residues each code includes */
int *ndegen; /* # of canonical residues each code stands for, [0..Kp-1] */
ESL_DSQ *complement; /* maps each code to its complement, [0..Kp-1]; NULL if <type> is not DNA/RNA */
} ESL_ALPHABET;
/* 1. An ESL_ALPHABET object.
 *
 * Prototypes only; the implementations are not part of this header, so
 * the notes below are limited to what the signatures themselves show.
 *
 * - esl_alphabet_Create() and esl_alphabet_CreateCustom() return a pointer
 *   to an ESL_ALPHABET; esl_alphabet_Destroy() takes one and returns nothing.
 *   Presumably Destroy() is the matching deallocator -- confirm.
 * - The routines returning <int> presumably return an Easel status
 *   code -- confirm against the implementation.
 *
 * NOTE(review): esl_alphabet_SetDegeneracy() takes <ds> as non-const
 *   <char *>, whereas esl_alphabet_CreateCustom() and
 *   esl_alphabet_SetIgnored() take their strings as <const char *>.
 *   Confirm whether <ds> is ever written through before tightening it.
 * NOTE(review): esl_alphabet_Sizeof() takes a non-const <ESL_ALPHABET *>;
 *   confirm whether it could take a const pointer.
 */
extern ESL_ALPHABET *esl_alphabet_Create(int type);
extern ESL_ALPHABET *esl_alphabet_CreateCustom(const char *alphabet, int K, int Kp);
extern int esl_alphabet_SetEquiv(ESL_ALPHABET *a, char sym, char c);
extern int esl_alphabet_SetCaseInsensitive(ESL_ALPHABET *a);
extern int esl_alphabet_SetDegeneracy(ESL_ALPHABET *a, char c, char *ds);
extern int esl_alphabet_SetIgnored(ESL_ALPHABET *a, const char *ignoredchars);
extern size_t esl_alphabet_Sizeof(ESL_ALPHABET *a);
extern void esl_alphabet_Destroy(ESL_ALPHABET *a);
/* 2. Digitized sequences.
 *
 * Prototypes for routines operating on digital sequences (arrays of
 * ESL_DSQ codes). The implementations are not part of this header, so
 * the notes below are limited to what the signatures themselves show.
 *
 * - Sequence lengths are passed as <int64_t> (<L>, <opt_rlen>), and
 *   esl_abc_dsqlen() / esl_abc_dsqrlen() return <int64_t>.
 * - esl_abc_dsqcat() and esl_abc_dsqcat_noalloc() take a bare <inmap>
 *   array rather than an ESL_ALPHABET. They differ in that dsqcat()
 *   receives <ESL_DSQ **dsq> while dsqcat_noalloc() receives
 *   <ESL_DSQ *dsq>; presumably only the former may (re)allocate -- confirm.
 * - The routines returning <int> presumably return an Easel status
 *   code -- confirm against the implementation.
 *
 * NOTE(review): esl_abc_revcomp() takes its length as <int n>, unlike the
 *   <int64_t> lengths used by the other routines here. Confirm whether
 *   sequences longer than INT_MAX need to be supported.
 */
extern int esl_abc_CreateDsq(const ESL_ALPHABET *a, const char *seq, ESL_DSQ **ret_dsq);
extern int esl_abc_Digitize (const ESL_ALPHABET *a, const char *seq, ESL_DSQ *dsq);
extern int esl_abc_Textize (const ESL_ALPHABET *a, const ESL_DSQ *dsq, int64_t L, char *seq);
extern int esl_abc_TextizeN (const ESL_ALPHABET *a, const ESL_DSQ *dptr, int64_t L, char *buf);
extern int esl_abc_dsqcpy(const ESL_DSQ *dsq, int64_t L, ESL_DSQ *dcopy);
extern int esl_abc_dsqdup(const ESL_DSQ *dsq, int64_t L, ESL_DSQ **ret_dup);
extern int esl_abc_dsqcat (const ESL_DSQ *inmap, ESL_DSQ **dsq, int64_t *L, const char *s, esl_pos_t n);
extern int esl_abc_dsqcat_noalloc(const ESL_DSQ *inmap, ESL_DSQ *dsq, int64_t *L, const char *s, esl_pos_t n);
extern int64_t esl_abc_dsqlen(const ESL_DSQ *dsq);
extern int64_t esl_abc_dsqrlen(const ESL_ALPHABET *a, const ESL_DSQ *dsq);
extern int esl_abc_CDealign(const ESL_ALPHABET *abc, char *s, const ESL_DSQ *ref_ax, int64_t *opt_rlen);
extern int esl_abc_XDealign(const ESL_ALPHABET *abc, ESL_DSQ *x, const ESL_DSQ *ref_ax, int64_t *opt_rlen);
extern int esl_abc_ConvertDegen2X(const ESL_ALPHABET *abc, ESL_DSQ *dsq);
extern int esl_abc_revcomp(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int n);
/* 3. Other routines in the API.
 *
 * Prototypes only; the implementations are not part of this header, so
 * the notes below are limited to what the signatures themselves show.
 *
 * - The *AvgScore, *ExpectScore, *AvgScVec, *ExpectScVec and *Count
 *   routines come in parallel variants whose leading letter matches the
 *   numeric type of their score/count arguments: I = int, F = float,
 *   D = double.
 * - The scalar *Score routines take <sc> (and <p>) as const and return a
 *   value of the score type; the *ScVec and *Count routines take a
 *   non-const <sc> / <ct> array and return <int>, so presumably they
 *   write into that array and return a status code -- confirm.
 *
 * NOTE(review): esl_abc_IExpectScore() and esl_abc_IExpectScVec() take
 *   <p> as <const float *>, the same as the F variants (the D variants
 *   take <const double *>).
 * NOTE(review): esl_abc_EncodeType() and esl_abc_EncodeTypeMem() take
 *   their string as non-const <char *>, and esl_abc_DecodeType() returns
 *   non-const <char *>. Confirm whether these can be const-qualified.
 * NOTE(review): esl_abc_ValidateSeq() takes <errbuf> with no size
 *   argument; the required capacity is not visible in this header --
 *   confirm before passing a caller-sized buffer.
 */
extern int esl_abc_ValidateType(int type);
extern int esl_abc_GuessAlphabet(const int64_t *ct, int *ret_type);
extern double esl_abc_Match (const ESL_ALPHABET *a, ESL_DSQ x, ESL_DSQ y, double *p);
extern int esl_abc_IAvgScore (const ESL_ALPHABET *a, ESL_DSQ x, const int *sc);
extern float esl_abc_FAvgScore (const ESL_ALPHABET *a, ESL_DSQ x, const float *sc);
extern double esl_abc_DAvgScore (const ESL_ALPHABET *a, ESL_DSQ x, const double *sc);
extern int esl_abc_IExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const int *sc, const float *p);
extern float esl_abc_FExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const float *sc, const float *p);
extern double esl_abc_DExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const double *sc, const double *p);
extern int esl_abc_IAvgScVec (const ESL_ALPHABET *a, int *sc);
extern int esl_abc_FAvgScVec (const ESL_ALPHABET *a, float *sc);
extern int esl_abc_DAvgScVec (const ESL_ALPHABET *a, double *sc);
extern int esl_abc_IExpectScVec(const ESL_ALPHABET *a, int *sc, const float *p);
extern int esl_abc_FExpectScVec(const ESL_ALPHABET *a, float *sc, const float *p);
extern int esl_abc_DExpectScVec(const ESL_ALPHABET *a, double *sc, const double *p);
extern int esl_abc_FCount (const ESL_ALPHABET *a, float *ct, ESL_DSQ x, float wt);
extern int esl_abc_DCount (const ESL_ALPHABET *a, double *ct, ESL_DSQ x, double wt);
extern int esl_abc_EncodeType (char *typestring);
extern int esl_abc_EncodeTypeMem(char *type, int n);
extern char *esl_abc_DecodeType (int type);
extern int esl_abc_ValidateSeq(const ESL_ALPHABET *a, const char *seq, int64_t L, char *errbuf);
/* Symbol classification and lookup macros.
 *
 * In the tests below, remember the rules of order in internal alphabets:
 *
 *   Canonical alphabet    Gap  Degeneracies  Any     None    Missing
 *   0..K-1                K    K+1..Kp-4     (Kp-3)  (Kp-2)  (Kp-1)
 *   ACGT                  -    RYMKSWHBVD    N       *       ~        DNA:     K=4  Kp=18
 *   ACDEFGHIKLMNPQRSTVWY  -    BJZOU         X       *       ~        protein: K=20 Kp=29
 *
 * Naming:
 *   esl_abc_X*(a, x)  take a digital code <x>.
 *   esl_abc_C*(a, c)  take a text character <c>, translated via a->inmap[].
 *   esl_abc_XGet*(a)  yield the digital code of a special symbol.
 *   esl_abc_CGet*(a)  yield the text character of a special symbol, from a->sym[].
 * <a> is a pointer to an ESL_ALPHABET.
 *
 * "Residue" means canonical or degenerate (including the any/unknown
 * code Kp-3), i.e. everything except gap, nonresidue and missing.
 * "Degenerate" likewise includes the any/unknown code.
 *
 * ESL_DSQ is an unsigned 8-bit type, so don't test for >= 0 or compilers will complain.
 *
 * Caveats:
 *  - These are macros: <a>, <x> and <c> may be evaluated more than once,
 *    so do not pass arguments with side effects (e.g. *p++).
 *  - Apart from esl_abc_CIsValid(), which tests isascii(c) first, the
 *    character macros index a->inmap[128] with <c> unchecked. The caller
 *    must guarantee 0 <= c <= 127; a negative (signed char) or larger
 *    value reads out of bounds.
 *  - <c> is parenthesized inside the (int) cast so that an expression
 *    argument is converted as a whole, not just its first operand.
 */
#define esl_abc_DigitizeSymbol(a, c) ((a)->inmap[(int)(c)])

/* Tests on a digital code <x>. */
#define esl_abc_XIsValid(a, x)       ((x) < (a)->Kp)
#define esl_abc_XIsResidue(a, x)     ((x) < (a)->K || ((x) > (a)->K && (x) < (a)->Kp-2))
#define esl_abc_XIsCanonical(a, x)   ((x) < (a)->K)
#define esl_abc_XIsGap(a, x)         ((x) == (a)->K)
#define esl_abc_XIsDegenerate(a, x)  ((x) > (a)->K && (x) < (a)->Kp-2)
#define esl_abc_XIsUnknown(a, x)     ((x) == (a)->Kp-3)
#define esl_abc_XIsNonresidue(a, x)  ((x) == (a)->Kp-2)
#define esl_abc_XIsMissing(a, x)     ((x) == (a)->Kp-1)

/* Digital codes of the special symbols. */
#define esl_abc_XGetGap(a)           ((a)->K)
#define esl_abc_XGetUnknown(a)       ((a)->Kp-3)
#define esl_abc_XGetNonresidue(a)    ((a)->Kp-2)
#define esl_abc_XGetMissing(a)       ((a)->Kp-1)

/* Tests on a text character <c>; only CIsValid() range-checks <c>. */
#define esl_abc_CIsValid(a, c)       (isascii(c) && (a)->inmap[(int)(c)] < (a)->Kp)
#define esl_abc_CIsResidue(a, c)     ((a)->inmap[(int)(c)] < (a)->K || ((a)->inmap[(int)(c)] > (a)->K && (a)->inmap[(int)(c)] < (a)->Kp-2))
#define esl_abc_CIsCanonical(a, c)   ((a)->inmap[(int)(c)] < (a)->K)
#define esl_abc_CIsGap(a, c)         ((a)->inmap[(int)(c)] == (a)->K)
#define esl_abc_CIsDegenerate(a, c)  ((a)->inmap[(int)(c)] > (a)->K && (a)->inmap[(int)(c)] < (a)->Kp-2)
#define esl_abc_CIsUnknown(a, c)     ((a)->inmap[(int)(c)] == (a)->Kp-3)
#define esl_abc_CIsNonresidue(a, c)  ((a)->inmap[(int)(c)] == (a)->Kp-2)
#define esl_abc_CIsMissing(a, c)     ((a)->inmap[(int)(c)] == (a)->Kp-1)

/* Text characters of the special symbols. */
#define esl_abc_CGetGap(a)           ((a)->sym[(a)->K])
#define esl_abc_CGetUnknown(a)       ((a)->sym[(a)->Kp-3])
#define esl_abc_CGetNonresidue(a)    ((a)->sym[(a)->Kp-2])
#define esl_abc_CGetMissing(a)       ((a)->sym[(a)->Kp-1])
#ifdef __cplusplus // magic to make C++ compilers happy
}
#endif
#endif /*eslALPHABET_INCLUDED*/