1 /*
2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 tre-internal.h - TRE internal definitions
31 */
32
33 #ifndef _TRE_INTERNAL_H
34 #define _TRE_INTERNAL_H
35
36 #include <sys/sysmacros.h>
37
38 #include <bitstring.h>
39 #include <wchar.h>
40 #include <wctype.h>
41
42 #include "tre.h"
43
44 #define TRE_REGEX_T_FIELD value
45
46 #ifdef TRE_DEBUG
47 #include <stdio.h>
48 #define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/0)
49 #else /* !TRE_DEBUG */
50 #define DPRINT(msg) do { } while(/*CONSTCOND*/0)
51 #endif /* !TRE_DEBUG */
52
53 #define elementsof(x) ( sizeof(x) / sizeof(x[0]) )
54
55 #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
56 #define tre_mbrtowc_l(pwc,s,n,ps,l) (mbrtowc_l((pwc), (s), (n), (ps), (l)))
57
58 typedef wint_t tre_cint_t;
59 #define TRE_CHAR_MAX WCHAR_MAX
60
61 #define TRE_MB_CUR_MAX MB_CUR_MAX
62 #define TRE_MB_CUR_MAX_L MB_CUR_MAX_L
63
64 #define tre_isalnum iswalnum
65 #define tre_isalpha iswalpha
66 #define tre_isblank iswblank
67 #define tre_iscntrl iswcntrl
68 #define tre_isdigit iswdigit
69 #define tre_isgraph iswgraph
70 #define tre_islower iswlower
71 #define tre_isprint iswprint
72 #define tre_ispunct iswpunct
73 #define tre_isspace iswspace
74 #define tre_isupper iswupper
75 #define tre_isxdigit iswxdigit
76
77 #define tre_tolower towlower
78 #define tre_toupper towupper
79 #define tre_strlen wcslen
80
81 #define tre_isalnum_l iswalnum_l
82 #define tre_isdigit_l iswdigit_l
83 #define tre_islower_l iswlower_l
84 #define tre_isupper_l iswupper_l
85 #define tre_isxdigit_l iswxdigit_l
86 #define tre_tolower_l towlower_l
87 #define tre_toupper_l towupper_l
88
89 typedef wctype_t tre_ctype_t;
90 #define tre_isctype iswctype
91 #define tre_ctype wctype
92 #define tre_isctype_l iswctype_l
93 #define tre_ctype_l wctype_l
94
95 typedef enum { STR_WIDE, STR_BYTE, STR_MBS } tre_str_type_t;
96
97 /* Returns number of bytes to add to (char *)ptr to make it
98 properly aligned for the type. */
99 #define ALIGN(ptr, type) \
100 ((((long)ptr) % sizeof(type)) \
101 ? (sizeof(type) - (((long)ptr) % sizeof(type))) \
102 : 0)
103
104 #define STRF "ls"
105
106 /* Types to handle bracket expressions. */
107 typedef enum {
108 TRE_BRACKET_MATCH_TYPE_UNUSED = 0,
109 TRE_BRACKET_MATCH_TYPE_CHAR, /* Single character value */
110 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, /* Collation range begin */
111 TRE_BRACKET_MATCH_TYPE_RANGE_END, /* Collation range end */
112 TRE_BRACKET_MATCH_TYPE_CLASS, /* Character class */
113 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, /* Collation equivalence value */
114 } tre_bracket_match_type_t;
115
116 typedef struct {
117 tre_bracket_match_type_t type;
118 tre_cint_t value;
119 } tre_bracket_match_t;
120
121 #define TRE_BRACKET_MATCH_FLAG_NEGATE 1
122
123 typedef struct {
124 int num_bracket_matches;
125 int flags;
126 tre_bracket_match_t bracket_matches[0];
127 } tre_bracket_match_list_t;
128
129 #define SIZEOF_BRACKET_MATCH_LIST_N(n) (sizeof(tre_bracket_match_list_t) + \
130 sizeof(tre_bracket_match_t) * (n))
131 #define SIZEOF_BRACKET_MATCH_LIST(l) SIZEOF_BRACKET_MATCH_LIST_N( \
132 (l)->num_bracket_matches)
133
134 /* The "count" field is the number of time the tag was set, initially zero.
135 The "first" field contains the first set value (when "count" equals 1).
136 The "value" field contains the current value of the tag, if "count" is
137 greater than zero (the tag's current value is -1 if "count" is zero).
138 The "touch" field is the touch value, a montonically increasing value
139 (maintained by the caller) set each time the tag itself is set. */
140 typedef struct {
141 int count;
142 int first;
143 int value;
144 int touch;
145 } tre_tag_t;
146
147 /* TNFA transition type. A TNFA state is an array of transitions,
148 the terminator is a transition with NULL `state'. */
149 typedef struct tnfa_transition tre_tnfa_transition_t;
150
151 struct tnfa_transition {
152 /* Range of accepted characters. */
153 tre_cint_t code_min;
154 tre_cint_t code_max;
155 /* Pointer to the destination state. */
156 tre_tnfa_transition_t *state;
157 /* ID number of the destination state. */
158 int state_id;
159 /* -1 terminated array of tags (or NULL). */
160 int *tags;
161 /* Matching parameters settings (or NULL). */
162 int *params;
163 /* Assertion bitmap. */
164 int assertions;
165 /* Assertion parameters. */
166 union {
167 /* Bracket matches. */
168 tre_bracket_match_list_t *bracket_match_list;
169 /* Back reference assertion. */
170 int backref;
171 } u;
172 };
173
174 /* Assertions. */
175 #define ASSERT_AT_BOL 1 /* Beginning of line. */
176 #define ASSERT_AT_EOL 2 /* End of line. */
177 #define ASSERT_BRACKET_MATCH 4 /* Matches in `bracket_match_list'. */
178 #define ASSERT_AT_BOW 8 /* Beginning of word. */
179 #define ASSERT_AT_EOW 16 /* End of word. */
180 #define ASSERT_AT_WB 32 /* Word boundary. */
181 #define ASSERT_AT_WB_NEG 64 /* Not a word boundary. */
182 #define ASSERT_BACKREF 128 /* A back reference in `backref'. */
183 #define ASSERT_LAST 128
184
185 /* Tag directions. */
186 typedef enum {
187 TRE_TAG_MINIMIZE = 0,
188 TRE_TAG_MAXIMIZE,
189 TRE_TAG_LEFT_MAXIMIZE,
190 } tre_tag_direction_t;
191
192 /* Parameters that can be changed dynamically while matching. */
193 typedef enum {
194 TRE_PARAM_COST_INS = 0,
195 TRE_PARAM_COST_DEL = 1,
196 TRE_PARAM_COST_SUBST = 2,
197 TRE_PARAM_COST_MAX = 3,
198 TRE_PARAM_MAX_INS = 4,
199 TRE_PARAM_MAX_DEL = 5,
200 TRE_PARAM_MAX_SUBST = 6,
201 TRE_PARAM_MAX_ERR = 7,
202 TRE_PARAM_DEPTH = 8,
203 TRE_PARAM_LAST = 9
204 } tre_param_t;
205
206 /* Unset matching parameter */
207 #define TRE_PARAM_UNSET -1
208
209 /* Signifies the default matching parameter value. */
210 #define TRE_PARAM_DEFAULT -2
211
212 /* Instructions to compute submatch register values from tag values
213 after a successful match. */
214 struct tre_submatch_data {
215 /* Tag that gives the value for rm_so (submatch start offset). */
216 int so_tag;
217 /* Tag that gives the value for rm_eo (submatch end offset). */
218 int eo_tag;
219 };
220
221 typedef struct tre_submatch_data tre_submatch_data_t;
222
223 /* LAST MATCHED structures */
224 struct __previous_type;
225 struct __previous_branch_type;
226 struct __current_type;
227 struct __current_branch_type;
228
229 typedef struct __previous_branch_type {
230 struct __previous_branch_type *next;
231 struct __previous_type *last_matched; int n_last_matched; int cmp_tag; int n_tags;
232 int tot_branches;
233 int tot_last_matched;
234 int tot_tags;
235 bitstr_t tags[0];
236 } tre_last_matched_branch_pre_t;
237
238 typedef struct __previous_type {
239 struct __previous_type *next;
240 tre_last_matched_branch_pre_t *branches; int n_branches; int start_tag;
241 int tot_branches;
242 int tot_last_matched;
243 int tot_tags;
244 } tre_last_matched_pre_t;
245
246 typedef struct __current_branch_type {
247 int *tags;
248 struct __current_type *last_matched; int n_last_matched; int cmp_tag; int n_tags;
249 } tre_last_matched_branch_t;
250
251 typedef struct __current_type {
252 tre_last_matched_branch_t *branches; int n_branches; int start_tag;
253 } tre_last_matched_t;
254
255 /* TNFA definition. */
256 typedef struct tnfa tre_tnfa_t;
257
258 struct tnfa {
259 tre_tnfa_transition_t *transitions;
260 tre_tnfa_transition_t *initial;
261 tre_tnfa_transition_t *final;
262 tre_submatch_data_t *submatch_data;
263 tre_tag_direction_t *tag_directions;
264 int *minimal_tags;
265 tre_last_matched_branch_t *last_matched_branch;
266 locale_t loc;
267 unsigned int num_transitions;
268 int first_char;
269 unsigned int num_submatches;
270 unsigned int num_submatches_invisible;
271 int num_tags;
272 int num_minimals;
273 int end_tag;
274 int num_states;
275 int cflags;
276 int have_backrefs;
277 int num_reorder_tags;
278 int have_approx;
279 int params_depth;
280 };
281
282 int
283 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags,
284 locale_t loc);
285
286 void
287 tre_free(regex_t *preg);
288
289 reg_errcode_t
290 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
291 const tre_tnfa_t *tnfa, const tre_tag_t *tags, int match_eo);
292
293 reg_errcode_t
294 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
295 tre_str_type_t type, tre_tag_t *match_tags, int eflags,
296 int *match_end_ofs);
297
298 reg_errcode_t
299 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
300 int len, tre_str_type_t type, tre_tag_t *match_tags,
301 int eflags, int *match_end_ofs);
302
303 #endif /* _TRE_INTERNAL_H */