1 /*
   2  * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  *
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  20  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28 
  29 /*
  30   tre-internal.h - TRE internal definitions
  31 */
  32 
  33 #ifndef _TRE_INTERNAL_H
  34 #define _TRE_INTERNAL_H
  35 
  36 #include <sys/sysmacros.h>
  37 
  38 #include <bitstring.h>
  39 #include <wchar.h>
  40 #include <wctype.h>
  41 
  42 #include "tre.h"
  43 
  44 #define TRE_REGEX_T_FIELD value
  45 
  46 #ifdef TRE_DEBUG
  47 #include <stdio.h>
  48 #define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/0)
  49 #else /* !TRE_DEBUG */
  50 #define DPRINT(msg) do { } while(/*CONSTCOND*/0)
  51 #endif /* !TRE_DEBUG */
  52 
  53 #define elementsof(x)   ( sizeof(x) / sizeof(x[0]) )
  54 
  55 #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
  56 #define tre_mbrtowc_l(pwc,s,n,ps,l) (mbrtowc_l((pwc), (s), (n), (ps), (l)))
  57 
  58 typedef wint_t tre_cint_t;
  59 #define TRE_CHAR_MAX WCHAR_MAX
  60 
  61 #define TRE_MB_CUR_MAX MB_CUR_MAX
  62 #define TRE_MB_CUR_MAX_L MB_CUR_MAX_L
  63 
  64 #define tre_isalnum iswalnum
  65 #define tre_isalpha iswalpha
  66 #define tre_isblank iswblank
  67 #define tre_iscntrl iswcntrl
  68 #define tre_isdigit iswdigit
  69 #define tre_isgraph iswgraph
  70 #define tre_islower iswlower
  71 #define tre_isprint iswprint
  72 #define tre_ispunct iswpunct
  73 #define tre_isspace iswspace
  74 #define tre_isupper iswupper
  75 #define tre_isxdigit iswxdigit
  76 
  77 #define tre_tolower towlower
  78 #define tre_toupper towupper
  79 #define tre_strlen  wcslen
  80 
  81 #define tre_isalnum_l iswalnum_l
  82 #define tre_isdigit_l iswdigit_l
  83 #define tre_islower_l iswlower_l
  84 #define tre_isupper_l iswupper_l
  85 #define tre_isxdigit_l iswxdigit_l
  86 #define tre_tolower_l towlower_l
  87 #define tre_toupper_l towupper_l
  88 
  89 typedef wctype_t tre_ctype_t;
  90 #define tre_isctype iswctype
  91 #define tre_ctype   wctype
  92 #define tre_isctype_l iswctype_l
  93 #define tre_ctype_l   wctype_l
  94 
  95 typedef enum { STR_WIDE, STR_BYTE, STR_MBS } tre_str_type_t;
  96 
  97 /* Returns number of bytes to add to (char *)ptr to make it
  98    properly aligned for the type. */
  99 #define ALIGN(ptr, type) \
 100   ((((long)ptr) % sizeof(type)) \
 101    ? (sizeof(type) - (((long)ptr) % sizeof(type))) \
 102    : 0)
 103 
 104 #define STRF "ls"
 105 
 106 /* Types to handle bracket expressions. */
 107 typedef enum {
 108   TRE_BRACKET_MATCH_TYPE_UNUSED = 0,
 109   TRE_BRACKET_MATCH_TYPE_CHAR,          /* Single character value */
 110   TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,   /* Collation range begin */
 111   TRE_BRACKET_MATCH_TYPE_RANGE_END,     /* Collation range end */
 112   TRE_BRACKET_MATCH_TYPE_CLASS,         /* Character class */
 113   TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,   /* Collation equivalence value */
 114 } tre_bracket_match_type_t;
 115 
 116 typedef struct {
 117   tre_bracket_match_type_t type;
 118   tre_cint_t value;
 119 } tre_bracket_match_t;
 120 
 121 #define TRE_BRACKET_MATCH_FLAG_NEGATE   1
 122 
 123 typedef struct {
 124   int num_bracket_matches;
 125   int flags;
 126   tre_bracket_match_t bracket_matches[0];
 127 } tre_bracket_match_list_t;
 128 
 129 #define SIZEOF_BRACKET_MATCH_LIST_N(n)  (sizeof(tre_bracket_match_list_t) + \
 130                                          sizeof(tre_bracket_match_t) * (n))
 131 #define SIZEOF_BRACKET_MATCH_LIST(l)    SIZEOF_BRACKET_MATCH_LIST_N( \
 132                                          (l)->num_bracket_matches)
 133 
 134 /* The "count" field is the number of time the tag was set, initially zero.
 135    The "first" field contains the first set value (when "count" equals 1).
 136    The "value" field contains the current value of the tag, if "count" is
 137    greater than zero (the tag's current value is -1 if "count" is zero).
 138    The "touch" field is the touch value, a montonically increasing value
 139    (maintained by the caller) set each time the tag itself is set. */
 140 typedef struct {
 141   int count;
 142   int first;
 143   int value;
 144   int touch;
 145 } tre_tag_t;
 146 
 147 /* TNFA transition type. A TNFA state is an array of transitions,
 148    the terminator is a transition with NULL `state'. */
 149 typedef struct tnfa_transition tre_tnfa_transition_t;
 150 
 151 struct tnfa_transition {
 152   /* Range of accepted characters. */
 153   tre_cint_t code_min;
 154   tre_cint_t code_max;
 155   /* Pointer to the destination state. */
 156   tre_tnfa_transition_t *state;
 157   /* ID number of the destination state. */
 158   int state_id;
 159   /* -1 terminated array of tags (or NULL). */
 160   int *tags;
 161   /* Matching parameters settings (or NULL). */
 162   int *params;
 163   /* Assertion bitmap. */
 164   int assertions;
 165   /* Assertion parameters. */
 166   union {
 167     /* Bracket matches. */
 168     tre_bracket_match_list_t *bracket_match_list;
 169     /* Back reference assertion. */
 170     int backref;
 171   } u;
 172 };
 173 
 174 /* Assertions. */
 175 #define ASSERT_AT_BOL             1   /* Beginning of line. */
 176 #define ASSERT_AT_EOL             2   /* End of line. */
 177 #define ASSERT_BRACKET_MATCH      4   /* Matches in `bracket_match_list'. */
 178 #define ASSERT_AT_BOW             8   /* Beginning of word. */
 179 #define ASSERT_AT_EOW            16   /* End of word. */
 180 #define ASSERT_AT_WB             32   /* Word boundary. */
 181 #define ASSERT_AT_WB_NEG         64   /* Not a word boundary. */
 182 #define ASSERT_BACKREF          128   /* A back reference in `backref'. */
 183 #define ASSERT_LAST             128
 184 
 185 /* Tag directions. */
 186 typedef enum {
 187   TRE_TAG_MINIMIZE = 0,
 188   TRE_TAG_MAXIMIZE,
 189   TRE_TAG_LEFT_MAXIMIZE,
 190 } tre_tag_direction_t;
 191 
 192 /* Parameters that can be changed dynamically while matching. */
 193 typedef enum {
 194   TRE_PARAM_COST_INS        = 0,
 195   TRE_PARAM_COST_DEL        = 1,
 196   TRE_PARAM_COST_SUBST      = 2,
 197   TRE_PARAM_COST_MAX        = 3,
 198   TRE_PARAM_MAX_INS         = 4,
 199   TRE_PARAM_MAX_DEL         = 5,
 200   TRE_PARAM_MAX_SUBST       = 6,
 201   TRE_PARAM_MAX_ERR         = 7,
 202   TRE_PARAM_DEPTH           = 8,
 203   TRE_PARAM_LAST            = 9
 204 } tre_param_t;
 205 
 206 /* Unset matching parameter */
 207 #define TRE_PARAM_UNSET -1
 208 
 209 /* Signifies the default matching parameter value. */
 210 #define TRE_PARAM_DEFAULT -2
 211 
 212 /* Instructions to compute submatch register values from tag values
 213    after a successful match.  */
 214 struct tre_submatch_data {
 215   /* Tag that gives the value for rm_so (submatch start offset). */
 216   int so_tag;
 217   /* Tag that gives the value for rm_eo (submatch end offset). */
 218   int eo_tag;
 219 };
 220 
 221 typedef struct tre_submatch_data tre_submatch_data_t;
 222 
 223 /* LAST MATCHED structures */
 224 struct __previous_type;
 225 struct __previous_branch_type;
 226 struct __current_type;
 227 struct __current_branch_type;
 228 
 229 typedef struct __previous_branch_type {
 230   struct __previous_branch_type *next;
 231   struct __previous_type *last_matched; int n_last_matched; int cmp_tag; int n_tags;
 232   int tot_branches;
 233   int tot_last_matched;
 234   int tot_tags;
 235   bitstr_t tags[0];
 236 } tre_last_matched_branch_pre_t;
 237 
 238 typedef struct __previous_type {
 239   struct __previous_type *next;
 240   tre_last_matched_branch_pre_t *branches; int n_branches; int start_tag;
 241   int tot_branches;
 242   int tot_last_matched;
 243   int tot_tags;
 244 } tre_last_matched_pre_t;
 245 
 246 typedef struct __current_branch_type {
 247   int *tags;
 248   struct __current_type *last_matched; int n_last_matched; int cmp_tag; int n_tags;
 249 } tre_last_matched_branch_t;
 250 
 251 typedef struct __current_type {
 252   tre_last_matched_branch_t *branches; int n_branches; int start_tag;
 253 } tre_last_matched_t;
 254 
 255 /* TNFA definition. */
 256 typedef struct tnfa tre_tnfa_t;
 257 
 258 struct tnfa {
 259   tre_tnfa_transition_t *transitions;
 260   tre_tnfa_transition_t *initial;
 261   tre_tnfa_transition_t *final;
 262   tre_submatch_data_t *submatch_data;
 263   tre_tag_direction_t *tag_directions;
 264   int *minimal_tags;
 265   tre_last_matched_branch_t *last_matched_branch;
 266   locale_t loc;
 267   unsigned int num_transitions;
 268   int first_char;
 269   unsigned int num_submatches;
 270   unsigned int num_submatches_invisible;
 271   int num_tags;
 272   int num_minimals;
 273   int end_tag;
 274   int num_states;
 275   int cflags;
 276   int have_backrefs;
 277   int num_reorder_tags;
 278   int have_approx;
 279   int params_depth;
 280 };
 281 
 282 int
 283 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags,
 284             locale_t loc);
 285 
 286 void
 287 tre_free(regex_t *preg);
 288 
 289 reg_errcode_t
 290 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 291                 const tre_tnfa_t *tnfa, const tre_tag_t *tags, int match_eo);
 292 
 293 reg_errcode_t
 294 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
 295                       tre_str_type_t type, tre_tag_t *match_tags, int eflags,
 296                       int *match_end_ofs);
 297 
 298 reg_errcode_t
 299 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 300                        int len, tre_str_type_t type, tre_tag_t *match_tags,
 301                        int eflags, int *match_end_ofs);
 302 
 303 #endif  /* _TRE_INTERNAL_H */