1 /* 2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 tre-internal.h - TRE internal definitions 31 */ 32 33 #ifndef _TRE_INTERNAL_H 34 #define _TRE_INTERNAL_H 35 36 #include <sys/sysmacros.h> 37 38 #include <bitstring.h> 39 #include <wchar.h> 40 #include <wctype.h> 41 42 #include "tre.h" 43 44 #define TRE_REGEX_T_FIELD value 45 46 #ifdef TRE_DEBUG 47 #include <stdio.h> 48 #define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/0) 49 #else /* !TRE_DEBUG */ 50 #define DPRINT(msg) do { } while(/*CONSTCOND*/0) 51 #endif /* !TRE_DEBUG */ 52 53 #define elementsof(x) ( sizeof(x) / sizeof(x[0]) ) 54 55 #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps))) 56 #define tre_mbrtowc_l(pwc,s,n,ps,l) (mbrtowc_l((pwc), (s), (n), (ps), (l))) 57 58 typedef wint_t tre_cint_t; 59 #define TRE_CHAR_MAX WCHAR_MAX 60 61 #define TRE_MB_CUR_MAX MB_CUR_MAX 62 #define TRE_MB_CUR_MAX_L MB_CUR_MAX_L 63 64 #define tre_isalnum iswalnum 65 #define tre_isalpha iswalpha 66 #define tre_isblank iswblank 67 #define tre_iscntrl iswcntrl 68 #define tre_isdigit iswdigit 69 #define tre_isgraph iswgraph 70 #define tre_islower iswlower 71 #define tre_isprint iswprint 72 #define tre_ispunct iswpunct 73 #define tre_isspace iswspace 74 #define tre_isupper iswupper 75 #define tre_isxdigit iswxdigit 76 77 #define tre_tolower towlower 78 #define tre_toupper towupper 79 #define tre_strlen wcslen 80 81 #define tre_isalnum_l iswalnum_l 82 #define tre_isdigit_l iswdigit_l 83 #define tre_islower_l iswlower_l 84 #define tre_isupper_l iswupper_l 85 #define tre_isxdigit_l iswxdigit_l 86 #define tre_tolower_l towlower_l 87 #define tre_toupper_l towupper_l 88 89 typedef wctype_t tre_ctype_t; 90 #define tre_isctype iswctype 91 #define tre_ctype wctype 92 #define tre_isctype_l iswctype_l 93 #define tre_ctype_l wctype_l 94 95 typedef enum { STR_WIDE, STR_BYTE, STR_MBS } tre_str_type_t; 96 97 /* Returns number of bytes to add to (char *)ptr to make it 98 properly aligned for the type. */ 99 #define ALIGN(ptr, type) \ 100 ((((long)ptr) % sizeof(type)) \ 101 ? (sizeof(type) - (((long)ptr) % sizeof(type))) \ 102 : 0) 103 104 #define STRF "ls" 105 106 /* Types to handle bracket expressions. */ 107 typedef enum { 108 TRE_BRACKET_MATCH_TYPE_UNUSED = 0, 109 TRE_BRACKET_MATCH_TYPE_CHAR, /* Single character value */ 110 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, /* Collation range begin */ 111 TRE_BRACKET_MATCH_TYPE_RANGE_END, /* Collation range end */ 112 TRE_BRACKET_MATCH_TYPE_CLASS, /* Character class */ 113 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, /* Collation equivalence value */ 114 } tre_bracket_match_type_t; 115 116 typedef struct { 117 tre_bracket_match_type_t type; 118 tre_cint_t value; 119 } tre_bracket_match_t; 120 121 #define TRE_BRACKET_MATCH_FLAG_NEGATE 1 122 123 typedef struct { 124 int num_bracket_matches; 125 int flags; 126 tre_bracket_match_t bracket_matches[0]; 127 } tre_bracket_match_list_t; 128 129 #define SIZEOF_BRACKET_MATCH_LIST_N(n) (sizeof(tre_bracket_match_list_t) + \ 130 sizeof(tre_bracket_match_t) * (n)) 131 #define SIZEOF_BRACKET_MATCH_LIST(l) SIZEOF_BRACKET_MATCH_LIST_N( \ 132 (l)->num_bracket_matches) 133 134 /* The "count" field is the number of time the tag was set, initially zero. 135 The "first" field contains the first set value (when "count" equals 1). 136 The "value" field contains the current value of the tag, if "count" is 137 greater than zero (the tag's current value is -1 if "count" is zero). 138 The "touch" field is the touch value, a montonically increasing value 139 (maintained by the caller) set each time the tag itself is set. */ 140 typedef struct { 141 int count; 142 int first; 143 int value; 144 int touch; 145 } tre_tag_t; 146 147 /* TNFA transition type. A TNFA state is an array of transitions, 148 the terminator is a transition with NULL `state'. */ 149 typedef struct tnfa_transition tre_tnfa_transition_t; 150 151 struct tnfa_transition { 152 /* Range of accepted characters. */ 153 tre_cint_t code_min; 154 tre_cint_t code_max; 155 /* Pointer to the destination state. */ 156 tre_tnfa_transition_t *state; 157 /* ID number of the destination state. */ 158 int state_id; 159 /* -1 terminated array of tags (or NULL). */ 160 int *tags; 161 /* Matching parameters settings (or NULL). */ 162 int *params; 163 /* Assertion bitmap. */ 164 int assertions; 165 /* Assertion parameters. */ 166 union { 167 /* Bracket matches. */ 168 tre_bracket_match_list_t *bracket_match_list; 169 /* Back reference assertion. */ 170 int backref; 171 } u; 172 }; 173 174 /* Assertions. */ 175 #define ASSERT_AT_BOL 1 /* Beginning of line. */ 176 #define ASSERT_AT_EOL 2 /* End of line. */ 177 #define ASSERT_BRACKET_MATCH 4 /* Matches in `bracket_match_list'. */ 178 #define ASSERT_AT_BOW 8 /* Beginning of word. */ 179 #define ASSERT_AT_EOW 16 /* End of word. */ 180 #define ASSERT_AT_WB 32 /* Word boundary. */ 181 #define ASSERT_AT_WB_NEG 64 /* Not a word boundary. */ 182 #define ASSERT_BACKREF 128 /* A back reference in `backref'. */ 183 #define ASSERT_LAST 128 184 185 /* Tag directions. */ 186 typedef enum { 187 TRE_TAG_MINIMIZE = 0, 188 TRE_TAG_MAXIMIZE, 189 TRE_TAG_LEFT_MAXIMIZE, 190 } tre_tag_direction_t; 191 192 /* Parameters that can be changed dynamically while matching. */ 193 typedef enum { 194 TRE_PARAM_COST_INS = 0, 195 TRE_PARAM_COST_DEL = 1, 196 TRE_PARAM_COST_SUBST = 2, 197 TRE_PARAM_COST_MAX = 3, 198 TRE_PARAM_MAX_INS = 4, 199 TRE_PARAM_MAX_DEL = 5, 200 TRE_PARAM_MAX_SUBST = 6, 201 TRE_PARAM_MAX_ERR = 7, 202 TRE_PARAM_DEPTH = 8, 203 TRE_PARAM_LAST = 9 204 } tre_param_t; 205 206 /* Unset matching parameter */ 207 #define TRE_PARAM_UNSET -1 208 209 /* Signifies the default matching parameter value. */ 210 #define TRE_PARAM_DEFAULT -2 211 212 /* Instructions to compute submatch register values from tag values 213 after a successful match. */ 214 struct tre_submatch_data { 215 /* Tag that gives the value for rm_so (submatch start offset). */ 216 int so_tag; 217 /* Tag that gives the value for rm_eo (submatch end offset). */ 218 int eo_tag; 219 }; 220 221 typedef struct tre_submatch_data tre_submatch_data_t; 222 223 /* LAST MATCHED structures */ 224 struct __previous_type; 225 struct __previous_branch_type; 226 struct __current_type; 227 struct __current_branch_type; 228 229 typedef struct __previous_branch_type { 230 struct __previous_branch_type *next; 231 struct __previous_type *last_matched; int n_last_matched; int cmp_tag; int n_tags; 232 int tot_branches; 233 int tot_last_matched; 234 int tot_tags; 235 bitstr_t tags[0]; 236 } tre_last_matched_branch_pre_t; 237 238 typedef struct __previous_type { 239 struct __previous_type *next; 240 tre_last_matched_branch_pre_t *branches; int n_branches; int start_tag; 241 int tot_branches; 242 int tot_last_matched; 243 int tot_tags; 244 } tre_last_matched_pre_t; 245 246 typedef struct __current_branch_type { 247 int *tags; 248 struct __current_type *last_matched; int n_last_matched; int cmp_tag; int n_tags; 249 } tre_last_matched_branch_t; 250 251 typedef struct __current_type { 252 tre_last_matched_branch_t *branches; int n_branches; int start_tag; 253 } tre_last_matched_t; 254 255 /* TNFA definition. */ 256 typedef struct tnfa tre_tnfa_t; 257 258 struct tnfa { 259 tre_tnfa_transition_t *transitions; 260 tre_tnfa_transition_t *initial; 261 tre_tnfa_transition_t *final; 262 tre_submatch_data_t *submatch_data; 263 tre_tag_direction_t *tag_directions; 264 int *minimal_tags; 265 tre_last_matched_branch_t *last_matched_branch; 266 locale_t loc; 267 unsigned int num_transitions; 268 int first_char; 269 unsigned int num_submatches; 270 unsigned int num_submatches_invisible; 271 int num_tags; 272 int num_minimals; 273 int end_tag; 274 int num_states; 275 int cflags; 276 int have_backrefs; 277 int num_reorder_tags; 278 int have_approx; 279 int params_depth; 280 }; 281 282 int 283 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags, 284 locale_t loc); 285 286 void 287 tre_free(regex_t *preg); 288 289 reg_errcode_t 290 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, 291 const tre_tnfa_t *tnfa, const tre_tag_t *tags, int match_eo); 292 293 reg_errcode_t 294 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 295 tre_str_type_t type, tre_tag_t *match_tags, int eflags, 296 int *match_end_ofs); 297 298 reg_errcode_t 299 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, 300 int len, tre_str_type_t type, tre_tag_t *match_tags, 301 int eflags, int *match_end_ofs); 302 303 #endif /* _TRE_INTERNAL_H */