1 /*
   2  * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  *
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  20  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28 
  29 /*
  30   tre-ast.h - Abstract syntax tree (AST) definitions
  31 */
  32 
  33 #ifndef _TRE_AST_H
  34 #define _TRE_AST_H
  35 
  36 #include <limits.h>
  37 
  38 #include "tre-mem.h"
  39 #include "tre-internal.h"
  40 #include "tre-compile.h"
  41 
  42 /* The different AST node types. */
  43 typedef enum {
  44   LITERAL,
  45   CATENATION,
  46   ITERATION,
  47   UNION
  48 } tre_ast_type_t;
  49 
  50 /* Special subtypes of TRE_LITERAL. */
  51 #define EMPTY     -1   /* Empty leaf (denotes empty string). */
  52 #define ASSERTION -2   /* Assertion leaf. */
  53 #define TAG       -3   /* Tag leaf. */
  54 #define BACKREF   -4   /* Back reference leaf. */
  55 #define PARAMETER -5   /* Parameter. */
  56 
  57 #define IS_SPECIAL(x)   ((x)->code_min < 0)
  58 #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
  59 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
  60 #define IS_TAG(x)       ((x)->code_min == TAG)
  61 #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
  62 #define IS_PARAMETER(x) ((x)->code_min == PARAMETER)
  63 
  64 #define SUBMATCH_ID_INVISIBLE_START     (INT_MAX / 2 + 1)
  65 
  66 /* A generic AST node.  All AST nodes consist of this node on the top
  67    level with `obj' pointing to the actual content. */
  68 typedef struct _tre_ast_node {
  69   void *obj;             /* Pointer to actual node. */
  70   tre_last_matched_branch_pre_t *last_matched_branch;
  71   tre_last_matched_pre_t *last_matched_in_progress;
  72   tre_pos_and_tags_t *firstpos;
  73   tre_pos_and_tags_t *lastpos;
  74   /* The original pointer is used to point to the original node, when a
  75    * CATENATION and tag are added.  If NULL, this is node is the original */
  76   struct _tre_ast_node *original;
  77   tre_ast_type_t type;   /* Type of the node. */
  78   int submatch_id;
  79   int num_submatches;
  80   int num_tags;
  81   short nullable;
  82   short make_branches;
  83 } tre_ast_node_t;
  84 
  85 /* A "literal" node.  These are created for assertions, back references,
  86    tags, matching parameter settings, and all expressions that match one
  87    character. */
  88 typedef struct {
  89   tre_cint_t code_min;
  90   tre_cint_t code_max;
  91   int position;
  92   union {
  93     tre_bracket_match_list_t *bracket_match_list;
  94     int *params;
  95   } u;
  96 } tre_literal_t;
  97 
  98 /* A "catenation" node.  These are created when two regexps are concatenated.
  99    If there are more than one subexpressions in sequence, the `left' part
 100    holds all but the last, and `right' part holds the last subexpression
 101    (catenation is left associative). */
 102 typedef struct {
 103   tre_ast_node_t *left;
 104   tre_ast_node_t *right;
 105 } tre_catenation_t;
 106 
 107 /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
 108    operators. */
 109 typedef struct {
 110   /* Subexpression to match. */
 111   tre_ast_node_t *arg;
 112   /* Minimum number of consecutive matches. */
 113   int min;
 114   /* Maximum number of consecutive matches. */
 115   int max;
 116   /* If 0, match as many characters as possible, if 1 match as few as
 117      possible.  Note that this does not always mean the same thing as
 118      matching as many/few repetitions as possible. */
 119   unsigned int minimal:1;
 120   /* Approximate matching parameters (or NULL). */
 121   int *params;
 122 } tre_iteration_t;
 123 
 124 /* An "union" node.  These are created for the "|" operator. */
 125 typedef struct {
 126   tre_ast_node_t *left;
 127   tre_ast_node_t *right;
 128   /* The left end right end tags if non-zero */
 129   int left_tag;
 130   int right_tag;
 131 } tre_union_t;
 132 
 133 tre_ast_node_t *
 134 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
 135 
 136 tre_ast_node_t *
 137 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
 138 
 139 tre_ast_node_t *
 140 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
 141                  int minimal);
 142 
 143 tre_ast_node_t *
 144 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
 145 
 146 tre_ast_node_t *
 147 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
 148                        tre_ast_node_t *right);
 149 
 150 #ifdef TRE_DEBUG
 151 void
 152 tre_ast_print(tre_ast_node_t *tree);
 153 
 154 /* XXX - rethink AST printing API */
 155 void
 156 tre_print_params(int *params);
 157 #endif /* TRE_DEBUG */
 158 
 159 #endif  /* _TRE_AST_H */