/* vi: set sw=4 ts=4: */ /* * awk implementation for busybox * * Copyright (C) 2002 by Dmitry Zakharov * * Licensed under GPLv2 or later, see file LICENSE in this source tree. */ //config:config AWK //config: bool "awk (23 kb)" //config: default y //config: help //config: Awk is used as a pattern scanning and processing language. //config: //config:config FEATURE_AWK_LIBM //config: bool "Enable math functions (requires libm)" //config: default y //config: depends on AWK //config: help //config: Enable math functions of the Awk programming language. //config: NOTE: This requires libm to be present for linking. //config: //config:config FEATURE_AWK_GNU_EXTENSIONS //config: bool "Enable a few GNU extensions" //config: default y //config: depends on AWK //config: help //config: Enable a few features from gawk: //config: * command line option -e AWK_PROGRAM //config: * simultaneous use of -f and -e on the command line. //config: This enables the use of awk library files. //config: Example: awk -f mylib.awk -e '{print myfunction($1);}' ... //applet:IF_AWK(APPLET_NOEXEC(awk, awk, BB_DIR_USR_BIN, BB_SUID_DROP, awk)) //kbuild:lib-$(CONFIG_AWK) += awk.o //usage:#define awk_trivial_usage //usage: "[OPTIONS] [AWK_PROGRAM] [FILE]..." //usage:#define awk_full_usage "\n\n" //usage: " -v VAR=VAL Set variable" //usage: "\n -F SEP Use SEP as field separator" //usage: "\n -f FILE Read program from FILE" //usage: IF_FEATURE_AWK_GNU_EXTENSIONS( //usage: "\n -e AWK_PROGRAM" //usage: ) #include "libbb.h" #include "xregex.h" #include /* This is a NOEXEC applet. Be very careful! */ /* If you comment out one of these below, it will be #defined later * to perform debug printfs to stderr: */ #define debug_printf_walker(...) do {} while (0) #define debug_printf_eval(...) do {} while (0) #define debug_printf_parse(...) do {} while (0) #ifndef debug_printf_walker # define debug_printf_walker(...) (fprintf(stderr, __VA_ARGS__)) #endif #ifndef debug_printf_eval # define debug_printf_eval(...) 
(fprintf(stderr, __VA_ARGS__)) #endif #ifndef debug_printf_parse # define debug_printf_parse(...) (fprintf(stderr, __VA_ARGS__)) #else # define debug_parse_print_tc(...) ((void)0) #endif /* "+": stop on first non-option: * $ awk 'BEGIN { for(i=1; i >> */ #define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ #define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ #define TC_BINOPX (1 << 6) /* two-opnd operator */ #define TC_IN (1 << 7) /* 'in' */ #define TC_COMMA (1 << 8) /* , */ #define TC_PIPE (1 << 9) /* input redirection pipe | */ #define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ #define TC_ARRTERM (1 << 11) /* ] */ #define TC_GRPSTART (1 << 12) /* { */ #define TC_GRPTERM (1 << 13) /* } */ #define TC_SEMICOL (1 << 14) /* ; */ #define TC_NEWLINE (1 << 15) #define TC_STATX (1 << 16) /* ctl statement (for, next...) */ #define TC_WHILE (1 << 17) /* 'while' */ #define TC_ELSE (1 << 18) /* 'else' */ #define TC_BUILTIN (1 << 19) /* This costs ~50 bytes of code. * A separate class to support deprecated "length" form. If we don't need that * (i.e. if we demand that only "length()" with () is valid), then TC_LENGTH * can be merged with TC_BUILTIN: */ #define TC_LENGTH (1 << 20) /* 'length' */ #define TC_GETLINE (1 << 21) /* 'getline' */ #define TC_FUNCDECL (1 << 22) /* 'function' 'func' */ #define TC_BEGIN (1 << 23) /* 'BEGIN' */ #define TC_END (1 << 24) /* 'END' */ #define TC_EOF (1 << 25) #define TC_VARIABLE (1 << 26) /* name */ #define TC_ARRAY (1 << 27) /* name[ */ #define TC_FUNCTION (1 << 28) /* name( */ #define TC_STRING (1 << 29) /* "..." 
*/ #define TC_NUMBER (1 << 30) #ifndef debug_parse_print_tc #define debug_parse_print_tc(n) do { \ if ((n) & TC_LPAREN ) debug_printf_parse(" LPAREN" ); \ if ((n) & TC_RPAREN ) debug_printf_parse(" RPAREN" ); \ if ((n) & TC_REGEXP ) debug_printf_parse(" REGEXP" ); \ if ((n) & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); \ if ((n) & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); \ if ((n) & TC_UOPPRE1 ) debug_printf_parse(" UOPPRE1" ); \ if ((n) & TC_BINOPX ) debug_printf_parse(" BINOPX" ); \ if ((n) & TC_IN ) debug_printf_parse(" IN" ); \ if ((n) & TC_COMMA ) debug_printf_parse(" COMMA" ); \ if ((n) & TC_PIPE ) debug_printf_parse(" PIPE" ); \ if ((n) & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); \ if ((n) & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); \ if ((n) & TC_GRPSTART) debug_printf_parse(" GRPSTART"); \ if ((n) & TC_GRPTERM ) debug_printf_parse(" GRPTERM" ); \ if ((n) & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); \ if ((n) & TC_NEWLINE ) debug_printf_parse(" NEWLINE" ); \ if ((n) & TC_STATX ) debug_printf_parse(" STATX" ); \ if ((n) & TC_WHILE ) debug_printf_parse(" WHILE" ); \ if ((n) & TC_ELSE ) debug_printf_parse(" ELSE" ); \ if ((n) & TC_BUILTIN ) debug_printf_parse(" BUILTIN" ); \ if ((n) & TC_LENGTH ) debug_printf_parse(" LENGTH" ); \ if ((n) & TC_GETLINE ) debug_printf_parse(" GETLINE" ); \ if ((n) & TC_FUNCDECL) debug_printf_parse(" FUNCDECL"); \ if ((n) & TC_BEGIN ) debug_printf_parse(" BEGIN" ); \ if ((n) & TC_END ) debug_printf_parse(" END" ); \ if ((n) & TC_EOF ) debug_printf_parse(" EOF" ); \ if ((n) & TC_VARIABLE) debug_printf_parse(" VARIABLE"); \ if ((n) & TC_ARRAY ) debug_printf_parse(" ARRAY" ); \ if ((n) & TC_FUNCTION) debug_printf_parse(" FUNCTION"); \ if ((n) & TC_STRING ) debug_printf_parse(" STRING" ); \ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ } while (0) #endif /* combined token classes ("token [class] sets") */ #define TS_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) #define TS_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN) 
//#define TS_UNARYOP (TS_UOPPRE | TC_UOPPOST) #define TS_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ | TC_LPAREN | TC_STRING | TC_NUMBER) #define TS_LVALUE (TC_VARIABLE | TC_ARRAY) #define TS_STATEMNT (TC_STATX | TC_WHILE) #define TS_OPTERM (TC_SEMICOL | TC_NEWLINE) /* word tokens, cannot mean something else if not expected */ #define TS_WORD (TC_IN | TS_STATEMNT | TC_ELSE \ | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ | TC_FUNCDECL | TC_BEGIN | TC_END) /* discard newlines after these */ #define TS_NOTERM (TC_COMMA | TC_GRPSTART | TC_GRPTERM \ | TS_BINOP | TS_OPTERM) /* what can expression begin with */ #define TS_OPSEQ (TS_OPERAND | TS_UOPPRE | TC_REGEXP) /* what can group begin with */ #define TS_GRPSEQ (TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_GRPSTART) /* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ /* operator is inserted between them */ #define TS_CONCAT_L (TC_VARIABLE | TC_ARRTERM | TC_RPAREN \ | TC_STRING | TC_NUMBER | TC_UOPPOST \ | TC_LENGTH) #define TS_CONCAT_R (TS_OPERAND | TS_UOPPRE) #define OF_RES1 0x010000 #define OF_RES2 0x020000 #define OF_STR1 0x040000 #define OF_STR2 0x080000 #define OF_NUM1 0x100000 #define OF_CHECKED 0x200000 #define OF_REQUIRED 0x400000 /* combined operator flags */ #define xx 0 #define xV OF_RES2 #define xS (OF_RES2 | OF_STR2) #define Vx OF_RES1 #define Rx (OF_RES1 | OF_NUM1 | OF_REQUIRED) #define VV (OF_RES1 | OF_RES2) #define Nx (OF_RES1 | OF_NUM1) #define NV (OF_RES1 | OF_NUM1 | OF_RES2) #define Sx (OF_RES1 | OF_STR1) #define SV (OF_RES1 | OF_STR1 | OF_RES2) #define SS (OF_RES1 | OF_STR1 | OF_RES2 | OF_STR2) #define OPCLSMASK 0xFF00 #define OPNMASK 0x007F /* operator priority is a highest byte (even: r->l, odd: l->r grouping) * For builtins it has different meaning: n n s3 s2 s1 v3 v2 v1, * n - min. 
number of args, vN - resolve Nth arg to var, sN - resolve to string */ #undef P #undef PRIMASK #undef PRIMASK2 #define P(x) (x << 24) #define PRIMASK 0x7F000000 #define PRIMASK2 0x7E000000 /* Operation classes */ #define SHIFT_TIL_THIS 0x0600 #define RECUR_FROM_THIS 0x1000 enum { OC_DELETE = 0x0100, OC_EXEC = 0x0200, OC_NEWSOURCE = 0x0300, OC_PRINT = 0x0400, OC_PRINTF = 0x0500, OC_WALKINIT = 0x0600, OC_BR = 0x0700, OC_BREAK = 0x0800, OC_CONTINUE = 0x0900, OC_EXIT = 0x0a00, OC_NEXT = 0x0b00, OC_NEXTFILE = 0x0c00, OC_TEST = 0x0d00, OC_WALKNEXT = 0x0e00, OC_BINARY = 0x1000, OC_BUILTIN = 0x1100, OC_COLON = 0x1200, OC_COMMA = 0x1300, OC_COMPARE = 0x1400, OC_CONCAT = 0x1500, OC_FBLTIN = 0x1600, OC_FIELD = 0x1700, OC_FNARG = 0x1800, OC_FUNC = 0x1900, OC_GETLINE = 0x1a00, OC_IN = 0x1b00, OC_LAND = 0x1c00, OC_LOR = 0x1d00, OC_MATCH = 0x1e00, OC_MOVE = 0x1f00, OC_PGETLINE = 0x2000, OC_REGEXP = 0x2100, OC_REPLACE = 0x2200, OC_RETURN = 0x2300, OC_SPRINTF = 0x2400, OC_TERNARY = 0x2500, OC_UNARY = 0x2600, OC_VAR = 0x2700, OC_DONE = 0x2800, ST_IF = 0x3000, ST_DO = 0x3100, ST_FOR = 0x3200, ST_WHILE = 0x3300 }; /* simple builtins */ enum { F_in, F_rn, F_co, F_ex, F_lg, F_si, F_sq, F_sr, F_ti, F_le, F_sy, F_ff, F_cl }; /* builtins */ enum { B_a2, B_ix, B_ma, B_sp, B_ss, B_ti, B_mt, B_lo, B_up, B_ge, B_gs, B_su, B_an, B_co, B_ls, B_or, B_rs, B_xo, }; /* tokens and their corresponding info values */ #define NTC "\377" /* switch to next token class (tc<<1) */ #define NTCC '\377' static const char tokenlist[] ALIGN1 = "\1(" NTC /* TC_LPAREN */ "\1)" NTC /* TC_RPAREN */ "\1/" NTC /* TC_REGEXP */ "\2>>" "\1>" "\1|" NTC /* TC_OUTRDR */ "\2++" "\2--" NTC /* TC_UOPPOST */ "\2++" "\2--" "\1$" NTC /* TC_UOPPRE1 */ "\2==" "\1=" "\2+=" "\2-=" /* TC_BINOPX */ "\2*=" "\2/=" "\2%=" "\2^=" "\1+" "\1-" "\3**=" "\2**" "\1/" "\1%" "\1^" "\1*" "\2!=" "\2>=" "\2<=" "\1>" "\1<" "\2!~" "\1~" "\2&&" "\2||" "\1?" 
"\1:" NTC "\2in" NTC /* TC_IN */ "\1," NTC /* TC_COMMA */ "\1|" NTC /* TC_PIPE */ "\1+" "\1-" "\1!" NTC /* TC_UOPPRE2 */ "\1]" NTC /* TC_ARRTERM */ "\1{" NTC /* TC_GRPSTART */ "\1}" NTC /* TC_GRPTERM */ "\1;" NTC /* TC_SEMICOL */ "\1\n" NTC /* TC_NEWLINE */ "\2if" "\2do" "\3for" "\5break" /* TC_STATX */ "\10continue" "\6delete" "\5print" "\6printf" "\4next" "\10nextfile" "\6return" "\4exit" NTC "\5while" NTC /* TC_WHILE */ "\4else" NTC /* TC_ELSE */ "\3and" "\5compl" "\6lshift" "\2or" /* TC_BUILTIN */ "\6rshift" "\3xor" "\5close" "\6system" "\6fflush" "\5atan2" "\3cos" "\3exp" "\3int" "\3log" "\4rand" "\3sin" "\4sqrt" "\5srand" "\6gensub" "\4gsub" "\5index" /* "\6length" was here */ "\5match" "\5split" "\7sprintf" "\3sub" "\6substr" "\7systime" "\10strftime" "\6mktime" "\7tolower" "\7toupper" NTC "\6length" NTC /* TC_LENGTH */ "\7getline" NTC /* TC_GETLINE */ "\4func" "\10function" NTC /* TC_FUNCDECL */ "\5BEGIN" NTC /* TC_BEGIN */ "\3END" /* TC_END */ /* compiler adds trailing "\0" */ ; #define OC_B OC_BUILTIN static const uint32_t tokeninfo[] ALIGN4 = { 0, 0, OC_REGEXP, xS|'a', xS|'w', xS|'|', OC_UNARY|xV|P(9)|'p', OC_UNARY|xV|P(9)|'m', #define TI_PREINC (OC_UNARY|xV|P(9)|'P') #define TI_PREDEC (OC_UNARY|xV|P(9)|'M') TI_PREINC, TI_PREDEC, OC_FIELD|xV|P(5), OC_COMPARE|VV|P(39)|5, OC_MOVE|VV|P(74), OC_REPLACE|NV|P(74)|'+', OC_REPLACE|NV|P(74)|'-', OC_REPLACE|NV|P(74)|'*', OC_REPLACE|NV|P(74)|'/', OC_REPLACE|NV|P(74)|'%', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'/', OC_BINARY|NV|P(25)|'%', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'*', OC_COMPARE|VV|P(39)|4, OC_COMPARE|VV|P(39)|3, OC_COMPARE|VV|P(39)|0, OC_COMPARE|VV|P(39)|1, #define TI_LESS (OC_COMPARE|VV|P(39)|2) TI_LESS, OC_MATCH|Sx|P(45)|'!', OC_MATCH|Sx|P(45)|'~', OC_LAND|Vx|P(55), OC_LOR|Vx|P(59), OC_TERNARY|Vx|P(64)|'?', OC_COLON|xx|P(67)|':', OC_IN|SV|P(49), /* TC_IN */ OC_COMMA|SS|P(80), 
OC_PGETLINE|SV|P(37), OC_UNARY|xV|P(19)|'+', OC_UNARY|xV|P(19)|'-', OC_UNARY|xV|P(19)|'!', 0, /* ] */ 0, 0, 0, 0, /* \n */ ST_IF, ST_DO, ST_FOR, OC_BREAK, OC_CONTINUE, OC_DELETE|Rx, OC_PRINT, OC_PRINTF, OC_NEXT, OC_NEXTFILE, OC_RETURN|Vx, OC_EXIT|Nx, ST_WHILE, 0, /* else */ OC_B|B_an|P(0x83), OC_B|B_co|P(0x41), OC_B|B_ls|P(0x83), OC_B|B_or|P(0x83), OC_B|B_rs|P(0x83), OC_B|B_xo|P(0x83), OC_FBLTIN|Sx|F_cl, OC_FBLTIN|Sx|F_sy, OC_FBLTIN|Sx|F_ff, OC_B|B_a2|P(0x83), OC_FBLTIN|Nx|F_co, OC_FBLTIN|Nx|F_ex, OC_FBLTIN|Nx|F_in, OC_FBLTIN|Nx|F_lg, OC_FBLTIN|F_rn, OC_FBLTIN|Nx|F_si, OC_FBLTIN|Nx|F_sq, OC_FBLTIN|Nx|F_sr, OC_B|B_ge|P(0xd6), OC_B|B_gs|P(0xb6), OC_B|B_ix|P(0x9b), /* OC_FBLTIN|Sx|F_le, was here */ OC_B|B_ma|P(0x89), OC_B|B_sp|P(0x8b), OC_SPRINTF, OC_B|B_su|P(0xb6), OC_B|B_ss|P(0x8f), OC_FBLTIN|F_ti, OC_B|B_ti|P(0x0b), OC_B|B_mt|P(0x0b), OC_B|B_lo|P(0x49), OC_B|B_up|P(0x49), OC_FBLTIN|Sx|F_le, /* TC_LENGTH */ OC_GETLINE|SV|P(0), 0, 0, 0, 0 /* TC_END */ }; /* internal variable names and their initial values */ /* asterisk marks SPECIAL vars; $ is just no-named Field0 */ enum { CONVFMT, OFMT, FS, OFS, ORS, RS, RT, FILENAME, SUBSEP, F0, ARGIND, ARGC, ARGV, ERRNO, FNR, NR, NF, IGNORECASE, ENVIRON, NUM_INTERNAL_VARS }; static const char vNames[] ALIGN1 = "CONVFMT\0" "OFMT\0" "FS\0*" "OFS\0" "ORS\0" "RS\0*" "RT\0" "FILENAME\0" "SUBSEP\0" "$\0*" "ARGIND\0" "ARGC\0" "ARGV\0" "ERRNO\0" "FNR\0" "NR\0" "NF\0*" "IGNORECASE\0*" "ENVIRON\0" "\0"; static const char vValues[] ALIGN1 = "%.6g\0" "%.6g\0" " \0" " \0" "\n\0" "\n\0" "\0" "\0" "\034\0" "\0" "\377"; /* hash size may grow to these values */ #define FIRST_PRIME 61 static const uint16_t PRIMES[] ALIGN2 = { 251, 1021, 4093, 16381, 65521 }; /* Globals. Split in two parts so that first one is addressed * with (mostly short) negative offsets. * NB: it's unsafe to put members of type "double" * into globals2 (gcc may fail to align them). 
*/ struct globals { double t_double; chain beginseq, mainseq, endseq; chain *seq; node *break_ptr, *continue_ptr; rstream *iF; xhash *ahash; /* argument names, used only while parsing function bodies */ xhash *fnhash; /* function names, used only in parsing stage */ xhash *vhash; /* variables and arrays */ //xhash *fdhash; /* file objects, used only in execution stage */ //we are reusing ahash as fdhash, via define (see later) const char *g_progname; int g_lineno; int nfields; int maxfields; /* used in fsrealloc() only */ var *Fields; char *g_pos; char g_saved_ch; smallint icase; smallint exiting; smallint nextrec; smallint nextfile; smallint is_f0_split; smallint t_rollback; }; struct globals2 { uint32_t t_info; /* often used */ uint32_t t_tclass; char *t_string; int t_lineno; var *intvar[NUM_INTERNAL_VARS]; /* often used */ /* former statics from various functions */ char *split_f0__fstrings; uint32_t next_token__save_tclass; uint32_t next_token__save_info; smallint next_token__concat_inserted; smallint next_input_file__files_happen; rstream next_input_file__rsm; var *evaluate__fnargs; unsigned evaluate__seed; regex_t evaluate__sreg; var ptest__tmpvar; var awk_printf__tmpvar; var as_regex__tmpvar; tsplitter exec_builtin__tspl; /* biggest and least used members go last */ tsplitter fsplitter, rsplitter; char g_buf[MAXVARFMT + 1]; }; #define G1 (ptr_to_globals[-1]) #define G (*(struct globals2 *)ptr_to_globals) /* For debug. 
nm --size-sort awk.o | grep -vi ' [tr] ' */ /*char G1size[sizeof(G1)]; - 0x74 */ /*char Gsize[sizeof(G)]; - 0x1c4 */ /* Trying to keep most of members accessible with short offsets: */ /*char Gofs_seed[offsetof(struct globals2, evaluate__seed)]; - 0x90 */ #define t_double (G1.t_double ) #define beginseq (G1.beginseq ) #define mainseq (G1.mainseq ) #define endseq (G1.endseq ) #define seq (G1.seq ) #define break_ptr (G1.break_ptr ) #define continue_ptr (G1.continue_ptr) #define iF (G1.iF ) #define ahash (G1.ahash ) #define fnhash (G1.fnhash ) #define vhash (G1.vhash ) #define fdhash ahash //^^^^^^^^^^^^^^^^^^ ahash is cleared after every function parsing, // and ends up empty after parsing phase. Thus, we can simply reuse it // for fdhash in execution stage. #define g_progname (G1.g_progname ) #define g_lineno (G1.g_lineno ) #define nfields (G1.nfields ) #define maxfields (G1.maxfields ) #define Fields (G1.Fields ) #define g_pos (G1.g_pos ) #define g_saved_ch (G1.g_saved_ch ) #define icase (G1.icase ) #define exiting (G1.exiting ) #define nextrec (G1.nextrec ) #define nextfile (G1.nextfile ) #define is_f0_split (G1.is_f0_split ) #define t_rollback (G1.t_rollback ) #define t_info (G.t_info ) #define t_tclass (G.t_tclass ) #define t_string (G.t_string ) #define t_lineno (G.t_lineno ) #define intvar (G.intvar ) #define fsplitter (G.fsplitter ) #define rsplitter (G.rsplitter ) #define g_buf (G.g_buf ) #define INIT_G() do { \ SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ t_tclass = TS_OPTERM; \ G.evaluate__seed = 1; \ } while (0) /* function prototypes */ static void handle_special(var *); static node *parse_expr(uint32_t); static void chain_group(void); static var *evaluate(node *, var *); static rstream *next_input_file(void); static int fmt_num(char *, int, const char *, double, int); static int awk_exit(int) NORETURN; /* ---- error handling ---- */ static const char EMSG_UNEXP_EOS[] ALIGN1 = "Unexpected end of string"; static const char 
EMSG_UNEXP_TOKEN[] ALIGN1 = "Unexpected token"; static const char EMSG_DIV_BY_ZERO[] ALIGN1 = "Division by zero"; static const char EMSG_INV_FMT[] ALIGN1 = "Invalid format specifier"; static const char EMSG_TOO_FEW_ARGS[] ALIGN1 = "Too few arguments"; static const char EMSG_NOT_ARRAY[] ALIGN1 = "Not an array"; static const char EMSG_POSSIBLE_ERROR[] ALIGN1 = "Possible syntax error"; static const char EMSG_UNDEF_FUNC[] ALIGN1 = "Call to undefined function"; static const char EMSG_NO_MATH[] ALIGN1 = "Math support is not compiled in"; static const char EMSG_NEGATIVE_FIELD[] ALIGN1 = "Access to negative field"; static void zero_out_var(var *vp) { memset(vp, 0, sizeof(*vp)); } static void syntax_error(const char *message) NORETURN; static void syntax_error(const char *message) { bb_error_msg_and_die("%s:%i: %s", g_progname, g_lineno, message); } /* ---- hash stuff ---- */ static unsigned hashidx(const char *name) { unsigned idx = 0; while (*name) idx = *name++ + (idx << 6) - idx; return idx; } /* create new hash */ static xhash *hash_init(void) { xhash *newhash; newhash = xzalloc(sizeof(*newhash)); newhash->csize = FIRST_PRIME; newhash->items = xzalloc(FIRST_PRIME * sizeof(newhash->items[0])); return newhash; } static void hash_clear(xhash *hash) { unsigned i; hash_item *hi, *thi; for (i = 0; i < hash->csize; i++) { hi = hash->items[i]; while (hi) { thi = hi; hi = hi->next; //FIXME: this assumes that it's a hash of *variables*: free(thi->data.v.string); free(thi); } hash->items[i] = NULL; } hash->glen = hash->nel = 0; } #if 0 //UNUSED static void hash_free(xhash *hash) { hash_clear(hash); free(hash->items); free(hash); } #endif /* find item in hash, return ptr to data, NULL if not found */ static NOINLINE void *hash_search3(xhash *hash, const char *name, unsigned idx) { hash_item *hi; hi = hash->items[idx % hash->csize]; while (hi) { if (strcmp(hi->name, name) == 0) return &hi->data; hi = hi->next; } return NULL; } static void *hash_search(xhash *hash, const char *name) 
{ return hash_search3(hash, name, hashidx(name)); } /* grow hash if it becomes too big */ static void hash_rebuild(xhash *hash) { unsigned newsize, i, idx; hash_item **newitems, *hi, *thi; if (hash->nprime == ARRAY_SIZE(PRIMES)) return; newsize = PRIMES[hash->nprime++]; newitems = xzalloc(newsize * sizeof(newitems[0])); for (i = 0; i < hash->csize; i++) { hi = hash->items[i]; while (hi) { thi = hi; hi = thi->next; idx = hashidx(thi->name) % newsize; thi->next = newitems[idx]; newitems[idx] = thi; } } free(hash->items); hash->csize = newsize; hash->items = newitems; } /* find item in hash, add it if necessary. Return ptr to data */ static void *hash_find(xhash *hash, const char *name) { hash_item *hi; unsigned idx; int l; idx = hashidx(name); hi = hash_search3(hash, name, idx); if (!hi) { if (++hash->nel > hash->csize * 8) hash_rebuild(hash); l = strlen(name) + 1; hi = xzalloc(sizeof(*hi) + l); strcpy(hi->name, name); idx = idx % hash->csize; hi->next = hash->items[idx]; hash->items[idx] = hi; hash->glen += l; } return &hi->data; } #define findvar(hash, name) ((var*) hash_find((hash), (name))) #define newvar(name) ((var*) hash_find(vhash, (name))) #define newfile(name) ((rstream*)hash_find(fdhash, (name))) #define newfunc(name) ((func*) hash_find(fnhash, (name))) static void hash_remove(xhash *hash, const char *name) { hash_item *hi, **phi; phi = &hash->items[hashidx(name) % hash->csize]; while (*phi) { hi = *phi; if (strcmp(hi->name, name) == 0) { hash->glen -= (strlen(name) + 1); hash->nel--; *phi = hi->next; free(hi); break; } phi = &hi->next; } } /* ------ some useful functions ------ */ static char *skip_spaces(char *p) { for (;;) { if (*p == '\\' && p[1] == '\n') { p++; t_lineno++; } else if (*p != ' ' && *p != '\t') { break; } p++; } return p; } /* returns old *s, advances *s past word and terminating NUL */ static char *nextword(char **s) { char *p = *s; char *q = p; while (*q++ != '\0') continue; *s = q; return p; } static char nextchar(char **s) { char c, 
*pps; c = *(*s)++; pps = *s; if (c == '\\') c = bb_process_escape_sequence((const char**)s); /* Example awk statement: * s = "abc\"def" * we must treat \" as " */ if (c == '\\' && *s == pps) { /* unrecognized \z? */ c = *(*s); /* yes, fetch z */ if (c) (*s)++; /* advance unless z = NUL */ } return c; } /* TODO: merge with strcpy_and_process_escape_sequences()? */ static void unescape_string_in_place(char *s1) { char *s = s1; while ((*s1 = nextchar(&s)) != '\0') s1++; } static ALWAYS_INLINE int isalnum_(int c) { return (isalnum(c) || c == '_'); } static double my_strtod(char **pp) { char *cp = *pp; if (ENABLE_DESKTOP && cp[0] == '0') { /* Might be hex or octal integer: 0x123abc or 07777 */ char c = (cp[1] | 0x20); if (c == 'x' || isdigit(cp[1])) { unsigned long long ull = strtoull(cp, pp, 0); if (c == 'x') return ull; c = **pp; if (!isdigit(c) && c != '.') return ull; /* else: it may be a floating number. Examples: * 009.123 (*pp points to '9') * 000.123 (*pp points to '.') * fall through to strtod. */ } } return strtod(cp, pp); } /* -------- working with variables (set/get/copy/etc) -------- */ static xhash *iamarray(var *v) { var *a = v; while (a->type & VF_CHILD) a = a->x.parent; if (!(a->type & VF_ARRAY)) { a->type |= VF_ARRAY; a->x.array = hash_init(); } return a->x.array; } #define clear_array(array) hash_clear(array) /* clear a variable */ static var *clrvar(var *v) { if (!(v->type & VF_FSTR)) free(v->string); v->type &= VF_DONTTOUCH; v->type |= VF_DIRTY; v->string = NULL; return v; } /* assign string value to variable */ static var *setvar_p(var *v, char *value) { clrvar(v); v->string = value; handle_special(v); return v; } /* same as setvar_p but make a copy of string */ static var *setvar_s(var *v, const char *value) { return setvar_p(v, (value && *value) ? 
xstrdup(value) : NULL); } /* same as setvar_s but sets USER flag */ static var *setvar_u(var *v, const char *value) { v = setvar_s(v, value); v->type |= VF_USER; return v; } /* set array element to user string */ static void setari_u(var *a, int idx, const char *s) { var *v; v = findvar(iamarray(a), itoa(idx)); setvar_u(v, s); } /* assign numeric value to variable */ static var *setvar_i(var *v, double value) { clrvar(v); v->type |= VF_NUMBER; v->number = value; handle_special(v); return v; } static const char *getvar_s(var *v) { /* if v is numeric and has no cached string, convert it to string */ if ((v->type & (VF_NUMBER | VF_CACHED)) == VF_NUMBER) { fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[CONVFMT]), v->number, TRUE); v->string = xstrdup(g_buf); v->type |= VF_CACHED; } return (v->string == NULL) ? "" : v->string; } static double getvar_i(var *v) { char *s; if ((v->type & (VF_NUMBER | VF_CACHED)) == 0) { v->number = 0; s = v->string; if (s && *s) { debug_printf_eval("getvar_i: '%s'->", s); v->number = my_strtod(&s); debug_printf_eval("%f (s:'%s')\n", v->number, s); if (v->type & VF_USER) { //TODO: skip_spaces() also skips backslash+newline, is it intended here? s = skip_spaces(s); if (*s != '\0') v->type &= ~VF_USER; } } else { debug_printf_eval("getvar_i: '%s'->zero\n", s); v->type &= ~VF_USER; } v->type |= VF_CACHED; } debug_printf_eval("getvar_i: %f\n", v->number); return v->number; } /* Used for operands of bitwise ops */ static unsigned long getvar_i_int(var *v) { double d = getvar_i(v); /* Casting doubles to longs is undefined for values outside * of target type range. Try to widen it as much as possible */ if (d >= 0) return (unsigned long)d; /* Why? 
Think about d == -4294967295.0 (assuming 32bit longs) */ return - (long) (unsigned long) (-d); } static var *copyvar(var *dest, const var *src) { if (dest != src) { clrvar(dest); dest->type |= (src->type & ~(VF_DONTTOUCH | VF_FSTR)); debug_printf_eval("copyvar: number:%f string:'%s'\n", src->number, src->string); dest->number = src->number; if (src->string) dest->string = xstrdup(src->string); } handle_special(dest); return dest; } static var *incvar(var *v) { return setvar_i(v, getvar_i(v) + 1.0); } /* return true if v is number or numeric string */ static int is_numeric(var *v) { getvar_i(v); return ((v->type ^ VF_DIRTY) & (VF_NUMBER | VF_USER | VF_DIRTY)); } /* return 1 when value of v corresponds to true, 0 otherwise */ static int istrue(var *v) { if (is_numeric(v)) return (v->number != 0); return (v->string && v->string[0]); } /* ------- awk program text parsing ------- */ /* Parse next token pointed by global pos, place results into global t_XYZ variables. * If token isn't expected, print error message and die. * Return token class (also store it in t_tclass). 
*/ static uint32_t next_token(uint32_t expected) { #define concat_inserted (G.next_token__concat_inserted) #define save_tclass (G.next_token__save_tclass) #define save_info (G.next_token__save_info) char *p; const char *tl; const uint32_t *ti; uint32_t tc, last_token_class; last_token_class = t_tclass; /* t_tclass is initialized to TS_OPTERM */ debug_printf_parse("%s() expected(%x):", __func__, expected); debug_parse_print_tc(expected); debug_printf_parse("\n"); if (t_rollback) { debug_printf_parse("%s: using rolled-back token\n", __func__); t_rollback = FALSE; } else if (concat_inserted) { debug_printf_parse("%s: using concat-inserted token\n", __func__); concat_inserted = FALSE; t_tclass = save_tclass; t_info = save_info; } else { p = g_pos; if (g_saved_ch != '\0') { *p = g_saved_ch; g_saved_ch = '\0'; } readnext: p = skip_spaces(p); g_lineno = t_lineno; if (*p == '#') while (*p != '\n' && *p != '\0') p++; if (*p == '\0') { tc = TC_EOF; debug_printf_parse("%s: token found: TC_EOF\n", __func__); } else if (*p == '\"') { /* it's a string */ char *s = t_string = ++p; while (*p != '\"') { char *pp; if (*p == '\0' || *p == '\n') syntax_error(EMSG_UNEXP_EOS); pp = p; *s++ = nextchar(&pp); p = pp; } p++; *s = '\0'; tc = TC_STRING; debug_printf_parse("%s: token found:'%s' TC_STRING\n", __func__, t_string); } else if ((expected & TC_REGEXP) && *p == '/') { /* it's regexp */ char *s = t_string = ++p; while (*p != '/') { if (*p == '\0' || *p == '\n') syntax_error(EMSG_UNEXP_EOS); *s = *p++; if (*s++ == '\\') { char *pp = p; s[-1] = bb_process_escape_sequence((const char **)&pp); if (*p == '\\') *s++ = '\\'; if (pp == p) *s++ = *p++; else p = pp; } } p++; *s = '\0'; tc = TC_REGEXP; debug_printf_parse("%s: token found:'%s' TC_REGEXP\n", __func__, t_string); } else if (*p == '.' 
|| isdigit(*p)) { /* it's a number */ char *pp = p; t_double = my_strtod(&pp); p = pp; if (*p == '.') syntax_error(EMSG_UNEXP_TOKEN); tc = TC_NUMBER; debug_printf_parse("%s: token found:%f TC_NUMBER\n", __func__, t_double); } else { char *end_of_name; if (*p == '\n') t_lineno++; /* search for something known */ tl = tokenlist; tc = 0x00000001; ti = tokeninfo; while (*tl) { int l = (unsigned char) *tl++; if (l == (unsigned char) NTCC) { tc <<= 1; continue; } /* if token class is expected, * token matches, * and it's not a longer word, */ if ((tc & (expected | TS_WORD | TC_NEWLINE)) && strncmp(p, tl, l) == 0 && !((tc & TS_WORD) && isalnum_(p[l])) ) { /* then this is what we are looking for */ t_info = *ti; debug_printf_parse("%s: token found:'%.*s' t_info:%x\n", __func__, l, p, t_info); p += l; goto token_found; } ti++; tl += l; } /* not a known token */ /* is it a name? (var/array/function) */ if (!isalnum_(*p)) syntax_error(EMSG_UNEXP_TOKEN); /* no */ /* yes */ t_string = p; while (isalnum_(*p)) p++; end_of_name = p; if (last_token_class == TC_FUNCDECL) /* eat space in "function FUNC (...) {...}" declaration */ p = skip_spaces(p); else if (expected & TC_ARRAY) { /* eat space between array name and [ */ char *s = skip_spaces(p); if (*s == '[') /* array ref, not just a name? */ p = s; } /* else: do NOT consume whitespace after variable name! * gawk allows definition "function FUNC (p) {...}" - note space, * but disallows the call "FUNC (p)" because it isn't one - * expression "v (a)" should NOT be parsed as TC_FUNCTION: * it is a valid concatenation if "v" is a variable, * not a function name (and type of name is not known at parse time). 
*/ if (*p == '(') { p++; tc = TC_FUNCTION; debug_printf_parse("%s: token found:'%s' TC_FUNCTION\n", __func__, t_string); } else if (*p == '[') { p++; tc = TC_ARRAY; debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); } else { tc = TC_VARIABLE; debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string); if (end_of_name == p) { /* there is no space for trailing NUL in t_string! * We need to save the char we are going to NUL. * (we'll use it in future call to next_token()) */ g_saved_ch = *end_of_name; // especially pathological example is V="abc"; V.2 - it's V concatenated to .2 // (it evaluates to "abc0.2"). Because of this case, we can't simply cache // '.' and analyze it later: we also have to *store it back* in next // next_token(), in order to give my_strtod() the undamaged ".2" string. } } *end_of_name = '\0'; /* terminate t_string */ } token_found: g_pos = p; /* skipping newlines in some cases */ if ((last_token_class & TS_NOTERM) && (tc & TC_NEWLINE)) goto readnext; /* insert concatenation operator when needed */ debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, (last_token_class & TS_CONCAT_L), (tc & TS_CONCAT_R), (expected & TS_BINOP), !(last_token_class == TC_LENGTH && tc == TC_LPAREN)); if ((last_token_class & TS_CONCAT_L) && (tc & TS_CONCAT_R) && (expected & TS_BINOP) && !(last_token_class == TC_LENGTH && tc == TC_LPAREN) /* but not for "length(..." */ ) { concat_inserted = TRUE; save_tclass = tc; save_info = t_info; tc = TC_BINOPX; t_info = OC_CONCAT | SS | P(35); } t_tclass = tc; debug_printf_parse("%s: t_tclass=tc=%x\n", __func__, tc); } /* Are we ready for this? */ if (!(t_tclass & expected)) { syntax_error((last_token_class & (TC_NEWLINE | TC_EOF)) ? 
EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN);
	}
	debug_printf_parse("%s: returning, t_double:%f t_tclass:", __func__, t_double);
	debug_parse_print_tc(t_tclass);
	debug_printf_parse("\n");
	return t_tclass;
#undef concat_inserted
#undef save_tclass
#undef save_info
}

/* Set the global t_rollback flag.
 * NOTE(review): presumably makes the following next_token() call hand back
 * the current token again - next_token() is not fully visible here, confirm.
 */
static ALWAYS_INLINE void rollback_token(void)
{
	t_rollback = TRUE;
}

/* Allocate a zero-filled node, store 'info' in it and stamp it with
 * the current value of g_lineno. Returns the new node.
 */
static node *new_node(uint32_t info)
{
	node *n;

	n = xzalloc(sizeof(node));
	n->info = info;
	n->lineno = g_lineno;
	return n;
}

/* Turn node n into an OC_REGEXP node for pattern s.
 * 're' must point to storage for TWO regex_t objects: re[0] is compiled
 * with REG_EXTENDED, re[1] with REG_EXTENDED | REG_ICASE; n->l.re and
 * n->r.ire are pointed at them respectively.
 */
static void mk_re_node(const char *s, node *n, regex_t *re)
{
	n->info = OC_REGEXP;
	n->l.re = re;
	n->r.ire = re + 1;
	xregcomp(re, s, REG_EXTENDED);
	xregcomp(re + 1, s, REG_EXTENDED | REG_ICASE);
}

/* Require a TC_LPAREN token, then parse an expression up to (and
 * including) the matching TC_RPAREN. Returns the parsed subtree.
 */
static node *parse_lrparen_list(void)
{
	next_token(TC_LPAREN);
	return parse_expr(TC_RPAREN);
}

/* parse expression terminated by given argument, return ptr
 * to built subtree. Terminator is eaten by parse_expr */
static node *parse_expr(uint32_t term_tc)
{
	/* sn is an on-stack root: the tree being built hangs off sn.r.n,
	 * which is also what gets returned (NULL if nothing was parsed).
	 */
	node sn;
	node *cn = &sn;		/* node most recently attached */
	node *vn, *glptr;	/* glptr: pending TC_GETLINE node, if any */
	uint32_t tc, expected_tc;
	var *v;

	debug_printf_parse("%s() term_tc(%x):", __func__, term_tc);
	debug_parse_print_tc(term_tc);
	debug_printf_parse("\n");

	sn.info = PRIMASK;
	sn.r.n = sn.a.n = glptr = NULL;
	expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP | term_tc;

	/* expected_tc is recomputed after every token: it tells next_token()
	 * which token classes are acceptable at this point.
	 */
	while (!((tc = next_token(expected_tc)) & term_tc)) {

		if (glptr && (t_info == TI_LESS)) {
			/* input redirection (<) attached to glptr node */
			debug_printf_parse("%s: input redir\n", __func__);
			cn = glptr->l.n = new_node(OC_CONCAT | SS | P(37));
			cn->a.n = glptr;
			expected_tc = TS_OPERAND | TS_UOPPRE;
			glptr = NULL;
			continue;
		}
		if (tc & (TS_BINOP | TC_UOPPOST)) {
			debug_printf_parse("%s: TS_BINOP | TC_UOPPOST tc:%x\n", __func__, tc);
			/* for binary and postfix-unary operators, jump back over
			 * previous operators with higher priority */
			vn = cn;
			/* walk up via the a.n links (set below to the node each
			 * new node was attached under)
			 */
			while (((t_info & PRIMASK) > (vn->a.n->info & PRIMASK2))
			    || ((t_info == vn->info) && ((t_info & OPCLSMASK) == OC_COLON))
			) {
				vn = vn->a.n;
				if (!vn->a.n) syntax_error(EMSG_UNEXP_TOKEN);
			}
			if ((t_info & OPCLSMASK) == OC_TERNARY)
				t_info += P(6);
			/* splice the new operator node in place of vn */
			cn = vn->a.n->r.n = new_node(t_info);
			cn->a.n = vn->a.n;
			if (tc & TS_BINOP) {
				/* binary: what was parsed so far becomes the left operand */
				cn->l.n = vn;
				expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP;
				if ((t_info & OPCLSMASK) == OC_PGETLINE) {
					/* it's a pipe */
					next_token(TC_GETLINE);
					/* give maximum priority to this pipe */
					cn->info &= ~PRIMASK;
					expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc;
				}
			} else {
				/* postfix unary: the operand goes on the right link */
				cn->r.n = vn;
				expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc;
			}
			vn->a.n = cn;
			continue;
		}

		debug_printf_parse("%s: other, t_info:%x\n", __func__, t_info);
		/* for operands and prefix-unary operators, attach them
		 * to last node */
		vn = cn;
		cn = vn->r.n = new_node(t_info);
		cn->a.n = vn;

		expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP;
		if (t_info == TI_PREINC || t_info == TI_PREDEC)
			expected_tc = TS_LVALUE | TC_UOPPRE1;

		/* prefix operator: nothing more to do until its operand arrives */
		if (!(tc & (TS_OPERAND | TC_REGEXP)))
			continue;

		debug_printf_parse("%s: TS_OPERAND | TC_REGEXP\n", __func__);
		expected_tc = TS_UOPPRE | TC_UOPPOST | TS_BINOP | TS_OPERAND | term_tc;
		/* one should be very careful with switch on tclass -
		 * only simple tclasses should be used (TC_xyz, not TS_xyz) */
		switch (tc) {
		case TC_VARIABLE:
		case TC_ARRAY:
			debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__);
			cn->info = OC_VAR;
			/* a name found in ahash becomes an OC_FNARG reference by
			 * index; any other name goes through newvar()
			 */
			v = hash_search(ahash, t_string);
			if (v != NULL) {
				cn->info = OC_FNARG;
				cn->l.aidx = v->x.aidx;
			} else {
				cn->l.v = newvar(t_string);
			}
			if (tc & TC_ARRAY) {
				/* subscript expression is parsed recursively up to ']' */
				cn->info |= xS;
				cn->r.n = parse_expr(TC_ARRTERM);
			}
			break;

		case TC_NUMBER:
		case TC_STRING:
			debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__);
			/* constants are stored as anonymous variables */
			cn->info = OC_VAR;
			v = cn->l.v = xzalloc(sizeof(var));
			if (tc & TC_NUMBER)
				setvar_i(v, t_double);
			else {
				setvar_s(v, t_string);
				expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */
			}
			break;

		case TC_REGEXP:
			debug_printf_parse("%s: TC_REGEXP\n", __func__);
			/* room for two regex_t: see mk_re_node() */
			mk_re_node(t_string, cn, xzalloc(sizeof(regex_t)*2));
			break;

		case TC_FUNCTION:
			debug_printf_parse("%s: TC_FUNCTION\n", __func__);
			cn->info = OC_FUNC;
			cn->r.f = newfunc(t_string);
			cn->l.n = parse_expr(TC_RPAREN);
			break;

		case TC_LPAREN:
			debug_printf_parse("%s: TC_LPAREN\n", __func__);
			/* the parenthesized subtree replaces the node created above */
			cn = vn->r.n = parse_expr(TC_RPAREN);
			if (!cn)
				syntax_error("Empty sequence");
			cn->a.n = vn;
			break;

		case TC_GETLINE:
			debug_printf_parse("%s: TC_GETLINE\n", __func__);
			/* remember the node: a following '<' is attached to it
			 * (see top of the loop)
			 */
			glptr = cn;
			expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc;
			break;

		case TC_BUILTIN:
			debug_printf_parse("%s: TC_BUILTIN\n", __func__);
			cn->l.n = parse_lrparen_list();
			break;

		case TC_LENGTH:
			debug_printf_parse("%s: TC_LENGTH\n", __func__);
			/* peek at the next token, then push it back */
			tc = next_token(TC_LPAREN /* length(...) */
				| TS_OPTERM /* length; (or newline)*/
				| TC_GRPTERM /* length } */
				| TC_BINOPX /* length NUM */
				| TC_COMMA /* print length, 1 */
			);
			rollback_token();
			if (tc & TC_LPAREN) {
				/* It was a "(" token. Handle just like TC_BUILTIN */
				cn->l.n = parse_lrparen_list();
			}
			break;
		}
	} /* while() */

	debug_printf_parse("%s() returns %p\n", __func__, sn.r.n);
	return sn.r.n;
}

/* add node to chain. Return ptr to alloc'd node */
static node *chain_node(uint32_t info)
{
	node *n;

	/* first use of this sequence: create its initial node */
	if (!seq->first)
		seq->first = seq->last = new_node(0);

	/* program name changed since the last node was chained into this
	 * sequence: record it with an extra OC_NEWSOURCE node (the recursive
	 * call terminates because seq->programname is updated first)
	 */
	if (seq->programname != g_progname) {
		seq->programname = g_progname;
		n = chain_node(OC_NEWSOURCE);
		n->l.new_progname = xstrdup(g_progname);
	}

	/* the current last node receives 'info'; a fresh OC_DONE node
	 * is appended after it and becomes the new last node
	 */
	n = seq->last;
	n->info = info;
	seq->last = n->a.n = new_node(OC_DONE);

	return n;
}

/* Chain a node with the given info and parse its expression (terminated
 * by TS_OPTERM or TC_GRPTERM) into n->l.n. A missing expression is a
 * syntax error if info has OF_REQUIRED set. A terminating TC_GRPTERM
 * is rolled back rather than consumed.
 */
static void chain_expr(uint32_t info)
{
	node *n;

	n = chain_node(info);

	n->l.n = parse_expr(TS_OPTERM | TC_GRPTERM);
	if ((info & OF_REQUIRED) && !n->l.n)
		syntax_error(EMSG_TOO_FEW_ARGS);

	if (t_tclass & TC_GRPTERM)
		rollback_token();
}

/* Chain a loop: an OC_BR node, the loop body (parsed by chain_group()),
 * then an OC_EXEC node which carries 'nn' in l.n and links back (a.n)
 * to the OC_BR node. break_ptr/continue_ptr are replaced with fresh
 * OC_EXEC nodes for the duration of the body and restored afterwards.
 * Returns the OC_BR node; callers fill in its l.n.
 */
static node *chain_loop(node *nn)
{
	node *n, *n2, *save_brk, *save_cont;

	save_brk = break_ptr;
	save_cont = continue_ptr;

	n = chain_node(OC_BR | Vx);
	continue_ptr = new_node(OC_EXEC);
	break_ptr = new_node(OC_EXEC);
	chain_group();
	n2 = chain_node(OC_EXEC | Vx);
	n2->l.n = nn;
	n2->a.n = n;
	/* 'continue' jumps to the tail node, 'break' (and the OC_BR node's
	 * r.n link) to whatever follows the loop
	 */
	continue_ptr->a.n = n2;
	break_ptr->a.n = n->r.n = seq->last;

	continue_ptr = save_cont;
	break_ptr = save_brk;

	return n;
}

/* parse group and attach it to chain */
static void chain_group(void)
{
	uint32_t c;
	node
*n, *n2, *n3;

	/* skip leading newlines */
	do {
		c = next_token(TS_GRPSEQ);
	} while (c & TC_NEWLINE);

	if (c & TC_GRPSTART) {
		/* { ... }: chain every statement up to the closing brace */
		debug_printf_parse("%s: TC_GRPSTART\n", __func__);
		while ((c = next_token(TS_GRPSEQ | TC_GRPTERM)) != TC_GRPTERM) {
			debug_printf_parse("%s: !TC_GRPTERM\n", __func__);
			if (c & TC_NEWLINE)
				continue;
			rollback_token();
			chain_group();
		}
		debug_printf_parse("%s: TC_GRPTERM\n", __func__);
		return;
	}
	if (c & (TS_OPSEQ | TS_OPTERM)) {
		/* plain expression statement (possibly empty) */
		debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__);
		rollback_token();
		chain_expr(OC_EXEC | Vx);
		return;
	}

	/* TS_STATEMNT */
	debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__);
	switch (t_info & OPCLSMASK) {
	case ST_IF:
		debug_printf_parse("%s: ST_IF\n", __func__);
		n = chain_node(OC_BR | Vx);
		n->l.n = parse_lrparen_list();
		chain_group();
		n2 = chain_node(OC_EXEC);
		/* r.n of the branch node: first node after the "then" part */
		n->r.n = seq->last;
		if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) {
			chain_group();
			/* end of "then" part skips over the "else" part */
			n2->a.n = seq->last;
		} else {
			rollback_token();
		}
		break;

	case ST_WHILE:
		debug_printf_parse("%s: ST_WHILE\n", __func__);
		n2 = parse_lrparen_list();
		n = chain_loop(NULL);
		n->l.n = n2;
		break;

	case ST_DO:
		debug_printf_parse("%s: ST_DO\n", __func__);
		n2 = chain_node(OC_EXEC);
		n = chain_loop(NULL);
		/* entry node jumps straight to what follows the OC_BR node,
		 * so the condition is not evaluated before the first pass
		 */
		n2->a.n = n->a.n;
		next_token(TC_WHILE);
		n->l.n = parse_lrparen_list();
		break;

	case ST_FOR:
		debug_printf_parse("%s: ST_FOR\n", __func__);
		next_token(TC_LPAREN);
		n2 = parse_expr(TC_SEMICOL | TC_RPAREN);
		if (t_tclass & TC_RPAREN) {	/* for-in */
			if (!n2 || (n2->info & OPCLSMASK) != OC_IN)
				syntax_error(EMSG_UNEXP_TOKEN);
			n = chain_node(OC_WALKINIT | VV);
			n->l.n = n2->l.n;
			n->r.n = n2->r.n;
			n = chain_loop(NULL);
			n->info = OC_WALKNEXT | Vx;
			n->l.n = n2->l.n;
		} else {			/* for (;;) */
			n = chain_node(OC_EXEC | Vx);
			n->l.n = n2;
			n2 = parse_expr(TC_SEMICOL);
			n3 = parse_expr(TC_RPAREN);
			n = chain_loop(n3);
			n->l.n = n2;
			/* empty condition: loop node becomes a plain OC_EXEC */
			if (!n2)
				n->info = OC_EXEC;
		}
		break;

	case OC_PRINT:
	case OC_PRINTF:
		debug_printf_parse("%s: OC_PRINT[F]\n", __func__);
		n = chain_node(t_info);
		n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM);
		if (t_tclass & TC_OUTRDR) {
			/* output redirection: merge its t_info into the node and
			 * parse the target expression into r.n
			 */
			n->info |= t_info;
			n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM);
		}
		if (t_tclass & TC_GRPTERM)
			rollback_token();
		break;

	case OC_BREAK:
		debug_printf_parse("%s: OC_BREAK\n", __func__);
		n = chain_node(OC_EXEC);
		n->a.n = break_ptr;
		chain_expr(t_info);
		break;

	case OC_CONTINUE:
		debug_printf_parse("%s: OC_CONTINUE\n", __func__);
		n = chain_node(OC_EXEC);
		n->a.n = continue_ptr;
		chain_expr(t_info);
		break;

	/* delete, next, nextfile, return, exit */
	default:
		debug_printf_parse("%s: default\n", __func__);
		chain_expr(t_info);
	}
}

/* Parse the awk program text at p.
 * Top-level items are chained into beginseq (BEGIN), endseq (END),
 * a function's own body sequence, or mainseq (pattern/action rules).
 * Parsing stops at TC_EOF.
 */
static void parse_program(char *p)
{
	uint32_t tclass;
	node *cn;
	func *f;
	var *v;

	debug_printf_parse("%s()\n", __func__);

	g_pos = p;
	t_lineno = 1;
	while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_GRPSTART |
			TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) {

		/* stray terminators between rules are ignored */
		if (tclass & TS_OPTERM) {
			debug_printf_parse("%s: TS_OPTERM\n", __func__);
			continue;
		}

		seq = &mainseq;
		if (tclass & TC_BEGIN) {
			debug_printf_parse("%s: TC_BEGIN\n", __func__);
			seq = &beginseq;
			chain_group();
		} else if (tclass & TC_END) {
			debug_printf_parse("%s: TC_END\n", __func__);
			seq = &endseq;
			chain_group();
		} else if (tclass & TC_FUNCDECL) {
			debug_printf_parse("%s: TC_FUNCDECL\n", __func__);
			next_token(TC_FUNCTION);
			f = newfunc(t_string);
			if (f->defined)
				syntax_error("Duplicate function");
			f->defined = 1;
			//f->body.first = NULL; - already is
			//f->nargs = 0; - already is
			/* func arg list: comma sep list of args, and a close paren */
			for (;;) {
				if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) {
					if (f->nargs == 0)
						break; /* func() is ok */
					/* func(a,) is not ok */
					syntax_error(EMSG_UNEXP_TOKEN);
				}
				/* each parameter name is entered into ahash together
				 * with its position in the argument list
				 */
				v = findvar(ahash, t_string);
				v->x.aidx = f->nargs++;
				/* Arg followed either by end of arg list or 1 comma */
				if (next_token(TC_COMMA | TC_RPAREN) == TC_RPAREN)
					break;
				/* it was a comma, we ate it */
			}
			seq = &f->body;
			chain_group();
			/* parameter names are only known while parsing this body */
			hash_clear(ahash);
		} else if (tclass & TS_OPSEQ) {
			/* pattern, optionally followed by { action } */
			debug_printf_parse("%s: TS_OPSEQ\n", __func__);
			rollback_token();
			cn = chain_node(OC_TEST);
			cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_GRPSTART);
			if (t_tclass & TC_GRPSTART) {
				debug_printf_parse("%s: TC_GRPSTART\n", __func__);
				rollback_token();
				chain_group();
			} else {
				/* pattern without action: chain a bare OC_PRINT */
				debug_printf_parse("%s: !TC_GRPSTART\n", __func__);
				chain_node(OC_PRINT);
			}
			cn->r.n = mainseq.last;
		} else /* if (tclass & TC_GRPSTART) */ {
			debug_printf_parse("%s: TC_GRPSTART(?)\n", __func__);
			rollback_token();
			chain_group();
		}
	}
	debug_printf_parse("%s: TC_EOF\n", __func__);
}

/* -------- program execution part -------- */

/* temporary variables allocator */
/* Returns a zero-filled array of sz var objects (release with nvfree()). */
static var *nvalloc(int sz)
{
	return xzalloc(sz * sizeof(var));
}

/* Release an array of sz temporaries at v: for every element, free array
 * storage it has (VF_ARRAY set, VF_CHILD clear), free its whole chain
 * of walker lists if VF_WALK is set, and clrvar() it; finally free v itself.
 */
static void nvfree(var *v, int sz)
{
	var *p = v;

	while (--sz >= 0) {
		if ((p->type & (VF_ARRAY | VF_CHILD)) == VF_ARRAY) {
			clear_array(iamarray(p));
			free(p->x.array->items);
			free(p->x.array);
		}
		if (p->type & VF_WALK) {
			walker_list *n;
			walker_list *w = p->x.walker;
			debug_printf_walker("nvfree: freeing walker @%p\n", &p->x.walker);
			p->x.walker = NULL;
			/* walkers are linked through ->prev */
			while (w) {
				n = w->prev;
				debug_printf_walker(" free(%p)\n", w);
				free(w);
				w = n;
			}
		}
		clrvar(p);
		p++;
	}

	free(v);
}

/* (Re)initialize splitter spl from separator string s and return its node.
 * Regexes left in spl by a previous call are freed first. A separator
 * longer than one char is compiled as a regex (see mk_re_node());
 * otherwise n->info is set to the single char (0 for an empty string).
 */
static node *mk_splitter(const char *s, tsplitter *spl)
{
	regex_t *re, *ire;
	node *n;

	re = &spl->re[0];
	ire = &spl->re[1];
	n = &spl->n;
	if ((n->info & OPCLSMASK) == OC_REGEXP) {
		regfree(re);
		regfree(ire); // TODO: nuke ire, use re+1?
	}
	if (s[0] && s[1]) { /* strlen(s) > 1 */
		mk_re_node(s, n, re);
	} else {
		n->info = (uint32_t) s[0];
	}

	return n;
}

/* Use node as a regular expression. Supplied with node ptr and regex_t
 * storage space. Return ptr to regex (if result points to preg, it should
 * be later regfree'd manually).
 */
static regex_t *as_regex(node *op, regex_t *preg)
{
	int cflags;
	const char *s;

	/* already a compiled regexp node: pick the variant matching icase */
	if ((op->info & OPCLSMASK) == OC_REGEXP) {
		return icase ? op->r.ire : op->l.re;
	}

#define TMPVAR (&G.as_regex__tmpvar)
	//tmpvar = nvalloc(1);
	// We use a single "static" tmpvar (instead of on-stack or malloced one)
	// to decrease memory consumption in deeply-recursive awk programs.
// The rule to work safely is to never call evaluate() while our static // TMPVAR's value is still needed. s = getvar_s(evaluate(op, TMPVAR)); cflags = icase ? REG_EXTENDED | REG_ICASE : REG_EXTENDED; /* Testcase where REG_EXTENDED fails (unpaired '{'): * echo Hi | awk 'gsub("@(samp|code|file)\{","");' * gawk 3.1.5 eats this. We revert to ~REG_EXTENDED * (maybe gsub is not supposed to use REG_EXTENDED?). */ if (regcomp(preg, s, cflags)) { cflags &= ~REG_EXTENDED; xregcomp(preg, s, cflags); } //nvfree(tmpvar, 1); #undef TMPVAR return preg; } /* gradually increasing buffer. * note that we reallocate even if n == old_size, * and thus there is at least one extra allocated byte. */ static char* qrealloc(char *b, int n, int *size) { if (!b || n >= *size) { *size = n + (n>>1) + 80; b = xrealloc(b, *size); } return b; } /* resize field storage space */ static void fsrealloc(int size) { int i, newsize; if (size >= maxfields) { /* Sanity cap, easier than catering for overflows */ if (size > 0xffffff) bb_die_memory_exhausted(); i = maxfields; maxfields = size + 16; newsize = maxfields * sizeof(Fields[0]); debug_printf_eval("fsrealloc: xrealloc(%p, %u)\n", Fields, newsize); Fields = xrealloc(Fields, newsize); debug_printf_eval("fsrealloc: Fields=%p..%p\n", Fields, (char*)Fields + newsize - 1); /* ^^^ did Fields[] move? debug aid for L.v getting "upstaged" by R.v in evaluate() */ for (; i < maxfields; i++) { Fields[i].type = VF_SPECIAL; Fields[i].string = NULL; } } /* if size < nfields, clear extra field variables */ for (i = size; i < nfields; i++) { clrvar(Fields + i); } nfields = size; } static int regexec1_nonempty(const regex_t *preg, const char *s, regmatch_t pmatch[]) { int r = regexec(preg, s, 1, pmatch, 0); if (r == 0 && pmatch[0].rm_eo == 0) { /* For example, happens when FS can match * an empty string (awk -F ' *'). Logically, * this should split into one-char fields. 
* However, gawk 5.0.1 searches for first * _non-empty_ separator string match: */ size_t ofs = 0; do { ofs++; if (!s[ofs]) return REG_NOMATCH; regexec(preg, s + ofs, 1, pmatch, 0); } while (pmatch[0].rm_eo == 0); pmatch[0].rm_so += ofs; pmatch[0].rm_eo += ofs; } return r; } static int awk_split(const char *s, node *spl, char **slist) { int n; char c[4]; char *s1; /* in worst case, each char would be a separate field */ *slist = s1 = xzalloc(strlen(s) * 2 + 3); strcpy(s1, s); c[0] = c[1] = (char)spl->info; c[2] = c[3] = '\0'; if (*getvar_s(intvar[RS]) == '\0') c[2] = '\n'; n = 0; if ((spl->info & OPCLSMASK) == OC_REGEXP) { /* regex split */ if (!*s) return n; /* "": zero fields */ n++; /* at least one field will be there */ do { int l; regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... l = strcspn(s, c+2); /* len till next NUL or \n */ if (regexec1_nonempty(icase ? spl->r.ire : spl->l.re, s, pmatch) == 0 && pmatch[0].rm_so <= l ) { /* if (pmatch[0].rm_eo == 0) ... - impossible */ l = pmatch[0].rm_so; n++; /* we saw yet another delimiter */ } else { pmatch[0].rm_eo = l; if (s[l]) pmatch[0].rm_eo++; } s1 = mempcpy(s1, s, l); *s1++ = '\0'; s += pmatch[0].rm_eo; } while (*s); /* echo a-- | awk -F-- '{ print NF, length($NF), $NF }' * should print "2 0 ": */ *s1 = '\0'; return n; } if (c[0] == '\0') { /* null split */ while (*s) { *s1++ = *s++; *s1++ = '\0'; n++; } return n; } if (c[0] != ' ') { /* single-character split */ if (icase) { c[0] = toupper(c[0]); c[1] = tolower(c[1]); } if (*s1) n++; while ((s1 = strpbrk(s1, c)) != NULL) { *s1++ = '\0'; n++; } return n; } /* space split */ while (*s) { s = skip_whitespace(s); if (!*s) break; n++; while (*s && !isspace(*s)) *s1++ = *s++; *s1++ = '\0'; } return n; } static void split_f0(void) { /* static char *fstrings; */ #define fstrings (G.split_f0__fstrings) int i, n; char *s; if (is_f0_split) return; is_f0_split = TRUE; free(fstrings); fsrealloc(0); n = awk_split(getvar_s(intvar[F0]), &fsplitter.n, &fstrings); 
fsrealloc(n); s = fstrings; for (i = 0; i < n; i++) { Fields[i].string = nextword(&s); Fields[i].type |= (VF_FSTR | VF_USER | VF_DIRTY); } /* set NF manually to avoid side effects */ clrvar(intvar[NF]); intvar[NF]->type = VF_NUMBER | VF_SPECIAL; intvar[NF]->number = nfields; #undef fstrings } /* perform additional actions when some internal variables changed */ static void handle_special(var *v) { int n; char *b; const char *sep, *s; int sl, l, len, i, bsize; if (!(v->type & VF_SPECIAL)) return; if (v == intvar[NF]) { n = (int)getvar_i(v); if (n < 0) syntax_error("NF set to negative value"); fsrealloc(n); /* recalculate $0 */ sep = getvar_s(intvar[OFS]); sl = strlen(sep); b = NULL; len = 0; for (i = 0; i < n; i++) { s = getvar_s(&Fields[i]); l = strlen(s); if (b) { memcpy(b+len, sep, sl); len += sl; } b = qrealloc(b, len+l+sl, &bsize); memcpy(b+len, s, l); len += l; } if (b) b[len] = '\0'; setvar_p(intvar[F0], b); is_f0_split = TRUE; } else if (v == intvar[F0]) { is_f0_split = FALSE; } else if (v == intvar[FS]) { /* * The POSIX-2008 standard says that changing FS should have no effect on the * current input line, but only on the next one. The language is: * * > Before the first reference to a field in the record is evaluated, the record * > shall be split into fields, according to the rules in Regular Expressions, * > using the value of FS that was current at the time the record was read. * * So, split up current line before assignment to FS: */ split_f0(); mk_splitter(getvar_s(v), &fsplitter); } else if (v == intvar[RS]) { mk_splitter(getvar_s(v), &rsplitter); } else if (v == intvar[IGNORECASE]) { icase = istrue(v); } else { /* $n */ n = getvar_i(intvar[NF]); setvar_i(intvar[NF], n > v-Fields ? n : v-Fields+1); /* right here v is invalid. Just to note... 
*/ } } /* step through func/builtin/etc arguments */ static node *nextarg(node **pn) { node *n; n = *pn; if (n && (n->info & OPCLSMASK) == OC_COMMA) { *pn = n->r.n; n = n->l.n; } else { *pn = NULL; } return n; } static void hashwalk_init(var *v, xhash *array) { hash_item *hi; unsigned i; walker_list *w; walker_list *prev_walker; if (v->type & VF_WALK) { prev_walker = v->x.walker; } else { v->type |= VF_WALK; prev_walker = NULL; } debug_printf_walker("hashwalk_init: prev_walker:%p\n", prev_walker); w = v->x.walker = xzalloc(sizeof(*w) + array->glen + 1); /* why + 1? */ debug_printf_walker(" walker@%p=%p\n", &v->x.walker, w); w->cur = w->end = w->wbuf; w->prev = prev_walker; for (i = 0; i < array->csize; i++) { hi = array->items[i]; while (hi) { w->end = stpcpy(w->end, hi->name) + 1; hi = hi->next; } } } static int hashwalk_next(var *v) { walker_list *w = v->x.walker; if (w->cur >= w->end) { walker_list *prev_walker = w->prev; debug_printf_walker("end of iteration, free(walker@%p:%p), prev_walker:%p\n", &v->x.walker, w, prev_walker); free(w); v->x.walker = prev_walker; return FALSE; } setvar_s(v, nextword(&w->cur)); return TRUE; } /* evaluate node, return 1 when result is true, 0 otherwise */ static int ptest(node *pattern) { // We use a single "static" tmpvar (instead of on-stack or malloced one) // to decrease memory consumption in deeply-recursive awk programs. // The rule to work safely is to never call evaluate() while our static // TMPVAR's value is still needed. return istrue(evaluate(pattern, &G.ptest__tmpvar)); } /* read next record from stream rsm into a variable v */ static int awk_getline(rstream *rsm, var *v) { char *b; regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... 
int size, a, p, pp = 0; int fd, so, eo, r, rp; char c, *m, *s; debug_printf_eval("entered %s()\n", __func__); /* we're using our own buffer since we need access to accumulating * characters */ fd = fileno(rsm->F); m = rsm->buffer; a = rsm->adv; p = rsm->pos; size = rsm->size; c = (char) rsplitter.n.info; rp = 0; if (!m) m = qrealloc(m, 256, &size); do { b = m + a; so = eo = p; r = 1; if (p > 0) { if ((rsplitter.n.info & OPCLSMASK) == OC_REGEXP) { if (regexec(icase ? rsplitter.n.r.ire : rsplitter.n.l.re, b, 1, pmatch, 0) == 0) { so = pmatch[0].rm_so; eo = pmatch[0].rm_eo; if (b[eo] != '\0') break; } } else if (c != '\0') { s = strchr(b+pp, c); if (!s) s = memchr(b+pp, '\0', p - pp); if (s) { so = eo = s-b; eo++; break; } } else { while (b[rp] == '\n') rp++; s = strstr(b+rp, "\n\n"); if (s) { so = eo = s-b; while (b[eo] == '\n') eo++; if (b[eo] != '\0') break; } } } if (a > 0) { memmove(m, m+a, p+1); b = m; a = 0; } m = qrealloc(m, a+p+128, &size); b = m + a; pp = p; p += safe_read(fd, b+p, size-p-1); if (p < pp) { p = 0; r = 0; setvar_i(intvar[ERRNO], errno); } b[p] = '\0'; } while (p > pp); if (p == 0) { r--; } else { c = b[so]; b[so] = '\0'; setvar_s(v, b+rp); v->type |= VF_USER; b[so] = c; c = b[eo]; b[eo] = '\0'; setvar_s(intvar[RT], b+so); b[eo] = c; } rsm->buffer = m; rsm->adv = a + eo; rsm->pos = p - eo; rsm->size = size; debug_printf_eval("returning from %s(): %d\n", __func__, r); return r; } static int fmt_num(char *b, int size, const char *format, double n, int int_as_int) { int r = 0; char c; const char *s = format; if (int_as_int && n == (long long)n) { r = snprintf(b, size, "%lld", (long long)n); } else { do { c = *s; } while (c && *++s); if (strchr("diouxX", c)) { r = snprintf(b, size, format, (int)n); } else if (strchr("eEfgG", c)) { r = snprintf(b, size, format, n); } else { syntax_error(EMSG_INV_FMT); } } return r; } /* formatted output into an allocated buffer, return ptr to buffer */ #if !ENABLE_FEATURE_AWK_GNU_EXTENSIONS # define awk_printf(a, b) 
awk_printf(a) #endif static char *awk_printf(node *n, int *len) { char *b = NULL; char *fmt, *s, *f; const char *s1; int i, j, incr, bsize; char c, c1; var *arg; //tmpvar = nvalloc(1); #define TMPVAR (&G.awk_printf__tmpvar) // We use a single "static" tmpvar (instead of on-stack or malloced one) // to decrease memory consumption in deeply-recursive awk programs. // The rule to work safely is to never call evaluate() while our static // TMPVAR's value is still needed. fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), TMPVAR))); // ^^^^^^^^^ here we immediately strdup() the value, so the later call // to evaluate() potentially recursing into another awk_printf() can't // mangle the value. i = 0; while (*f) { s = f; while (*f && (*f != '%' || *++f == '%')) f++; while (*f && !isalpha(*f)) { if (*f == '*') syntax_error("%*x formats are not supported"); f++; } incr = (f - s) + MAXVARFMT; b = qrealloc(b, incr + i, &bsize); c = *f; if (c != '\0') f++; c1 = *f; *f = '\0'; arg = evaluate(nextarg(&n), TMPVAR); j = i; if (c == 'c' || !c) { i += sprintf(b+i, s, is_numeric(arg) ? (char)getvar_i(arg) : *getvar_s(arg)); } else if (c == 's') { s1 = getvar_s(arg); b = qrealloc(b, incr+i+strlen(s1), &bsize); i += sprintf(b+i, s, s1); } else { i += fmt_num(b+i, incr, s, getvar_i(arg), FALSE); } *f = c1; /* if there was an error while sprintf, return value is negative */ if (i < j) i = j; } free(fmt); // nvfree(tmpvar, 1); #undef TMPVAR b = xrealloc(b, i + 1); b[i] = '\0'; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS if (len) *len = i; #endif return b; } /* Common substitution routine. * Replace (nm)'th substring of (src) that matches (rn) with (repl), * store result into (dest), return number of substitutions. * If nm = 0, replace all matches. * If src or dst is NULL, use $0. * If subexp != 0, enable subexpression matching (\1-\9). 
*/ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int subexp) { char *resbuf; const char *sp; int match_no, residx, replen, resbufsize; int regexec_flags; regmatch_t pmatch[10]; regex_t sreg, *regex; resbuf = NULL; residx = 0; match_no = 0; regexec_flags = 0; regex = as_regex(rn, &sreg); sp = getvar_s(src ? src : intvar[F0]); replen = strlen(repl); while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) { int so = pmatch[0].rm_so; int eo = pmatch[0].rm_eo; //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp); resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize); memcpy(resbuf + residx, sp, eo); residx += eo; if (++match_no >= nm) { const char *s; int nbs; /* replace */ residx -= (eo - so); nbs = 0; for (s = repl; *s; s++) { char c = resbuf[residx++] = *s; if (c == '\\') { nbs++; continue; } if (c == '&' || (subexp && c >= '0' && c <= '9')) { int j; residx -= ((nbs + 3) >> 1); j = 0; if (c != '&') { j = c - '0'; nbs++; } if (nbs % 2) { resbuf[residx++] = c; } else { int n = pmatch[j].rm_eo - pmatch[j].rm_so; resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize); memcpy(resbuf + residx, sp + pmatch[j].rm_so, n); residx += n; } } nbs = 0; } } regexec_flags = REG_NOTBOL; sp += eo; if (match_no == nm) break; if (eo == so) { /* Empty match (e.g. "b*" will match anywhere). * Advance by one char. */ //BUG (bug 1333): //gsub(/\info; op = op->l.n; av[2] = av[3] = NULL; for (i = 0; i < 4 && op; i++) { an[i] = nextarg(&op); if (isr & 0x09000000) av[i] = evaluate(an[i], TMPVAR(i)); if (isr & 0x08000000) as[i] = getvar_s(av[i]); isr >>= 1; } nargs = i; if ((uint32_t)nargs < (info >> 30)) syntax_error(EMSG_TOO_FEW_ARGS); info &= OPNMASK; switch (info) { case B_a2: if (ENABLE_FEATURE_AWK_LIBM) setvar_i(res, atan2(getvar_i(av[0]), getvar_i(av[1]))); else syntax_error(EMSG_NO_MATH); break; case B_sp: { char *s, *s1; if (nargs > 2) { spl = (an[2]->info & OPCLSMASK) == OC_REGEXP ? 
an[2] : mk_splitter(getvar_s(evaluate(an[2], TMPVAR2)), &tspl); } else { spl = &fsplitter.n; } n = awk_split(as[0], spl, &s); s1 = s; clear_array(iamarray(av[1])); for (i = 1; i <= n; i++) setari_u(av[1], i, nextword(&s)); free(s1); setvar_i(res, n); break; } case B_ss: { char *s; l = strlen(as[0]); i = getvar_i(av[1]) - 1; if (i > l) i = l; if (i < 0) i = 0; n = (nargs > 2) ? getvar_i(av[2]) : l-i; if (n < 0) n = 0; s = xstrndup(as[0]+i, n); setvar_p(res, s); break; } /* Bitwise ops must assume that operands are unsigned. GNU Awk 3.1.5: * awk '{ print or(-1,1) }' gives "4.29497e+09", not "-2.xxxe+09" */ case B_an: setvar_i(res, getvar_i_int(av[0]) & getvar_i_int(av[1])); break; case B_co: setvar_i(res, ~getvar_i_int(av[0])); break; case B_ls: setvar_i(res, getvar_i_int(av[0]) << getvar_i_int(av[1])); break; case B_or: setvar_i(res, getvar_i_int(av[0]) | getvar_i_int(av[1])); break; case B_rs: setvar_i(res, getvar_i_int(av[0]) >> getvar_i_int(av[1])); break; case B_xo: setvar_i(res, getvar_i_int(av[0]) ^ getvar_i_int(av[1])); break; case B_lo: case B_up: { char *s, *s1; s1 = s = xstrdup(as[0]); while (*s1) { //*s1 = (info == B_up) ? toupper(*s1) : tolower(*s1); if ((unsigned char)((*s1 | 0x20) - 'a') <= ('z' - 'a')) *s1 = (info == B_up) ? (*s1 & 0xdf) : (*s1 | 0x20); s1++; } setvar_p(res, s); break; } case B_ix: n = 0; ll = strlen(as[1]); l = strlen(as[0]) - ll; if (ll > 0 && l >= 0) { if (!icase) { char *s = strstr(as[0], as[1]); if (s) n = (s - as[0]) + 1; } else { /* this piece of code is terribly slow and * really should be rewritten */ for (i = 0; i <= l; i++) { if (strncasecmp(as[0]+i, as[1], ll) == 0) { n = i+1; break; } } } } setvar_i(res, n); break; case B_ti: if (nargs > 1) tt = getvar_i(av[1]); else time(&tt); //s = (nargs > 0) ? as[0] : "%a %b %d %H:%M:%S %Z %Y"; i = strftime(g_buf, MAXVARFMT, ((nargs > 0) ? 
as[0] : "%a %b %d %H:%M:%S %Z %Y"), localtime(&tt)); g_buf[i] = '\0'; setvar_s(res, g_buf); break; case B_mt: setvar_i(res, do_mktime(as[0])); break; case B_ma: re = as_regex(an[1], &sreg); n = regexec(re, as[0], 1, pmatch, 0); if (n == 0) { pmatch[0].rm_so++; pmatch[0].rm_eo++; } else { pmatch[0].rm_so = 0; pmatch[0].rm_eo = -1; } setvar_i(newvar("RSTART"), pmatch[0].rm_so); setvar_i(newvar("RLENGTH"), pmatch[0].rm_eo - pmatch[0].rm_so); setvar_i(res, pmatch[0].rm_so); if (re == &sreg) regfree(re); break; case B_ge: awk_sub(an[0], as[1], getvar_i(av[2]), av[3], res, TRUE); break; case B_gs: setvar_i(res, awk_sub(an[0], as[1], 0, av[2], av[2], FALSE)); break; case B_su: setvar_i(res, awk_sub(an[0], as[1], 1, av[2], av[2], FALSE)); break; } nvfree(tmpvars, 4); #undef TMPVAR0 #undef TMPVAR1 #undef TMPVAR2 #undef TMPVAR3 #undef TMPVAR return res; #undef tspl } /* * Evaluate node - the heart of the program. Supplied with subtree * and place where to store result. Returns ptr to result. */ #define XC(n) ((n) >> 8) static var *evaluate(node *op, var *res) { /* This procedure is recursive so we should count every byte */ #define fnargs (G.evaluate__fnargs) /* seed is initialized to 1 */ #define seed (G.evaluate__seed) #define sreg (G.evaluate__sreg) var *tmpvars; #define TMPVAR0 (tmpvars) #define TMPVAR1 (tmpvars + 1) if (!op) return setvar_s(res, NULL); debug_printf_eval("entered %s()\n", __func__); tmpvars = nvalloc(2); while (op) { struct { var *v; const char *s; } L = L; /* for compiler */ struct { var *v; const char *s; } R = R; double L_d = L_d; uint32_t opinfo; int opn; node *op1; opinfo = op->info; opn = (opinfo & OPNMASK); g_lineno = op->lineno; op1 = op->l.n; debug_printf_eval("opinfo:%08x opn:%08x\n", opinfo, opn); /* "delete" is special: * "delete array[var--]" must evaluate index expr only once, * must not evaluate it in "execute inevitable things" part. 
*/ if (XC(opinfo & OPCLSMASK) == XC(OC_DELETE)) { uint32_t info = op1->info & OPCLSMASK; var *v; debug_printf_eval("DELETE\n"); if (info == OC_VAR) { v = op1->l.v; } else if (info == OC_FNARG) { v = &fnargs[op1->l.aidx]; } else { syntax_error(EMSG_NOT_ARRAY); } if (op1->r.n) { /* array ref? */ const char *s; s = getvar_s(evaluate(op1->r.n, TMPVAR0)); hash_remove(iamarray(v), s); } else { clear_array(iamarray(v)); } goto next; } /* execute inevitable things */ if (opinfo & OF_RES1) L.v = evaluate(op1, TMPVAR0); if (opinfo & OF_STR1) { L.s = getvar_s(L.v); debug_printf_eval("L.s:'%s'\n", L.s); } if (opinfo & OF_NUM1) { L_d = getvar_i(L.v); debug_printf_eval("L_d:%f\n", L_d); } /* NB: Must get string/numeric values of L (done above) * _before_ evaluate()'ing R.v: if both L and R are $NNNs, * and right one is large, then L.v points to Fields[NNN1], * second evaluate() reallocates and moves (!) Fields[], * R.v points to Fields[NNN2] but L.v now points to freed mem! * (Seen trying to evaluate "$444 $44444") */ if (opinfo & OF_RES2) { R.v = evaluate(op->r.n, TMPVAR1); //TODO: L.v may be invalid now, set L.v to NULL to catch bugs? //L.v = NULL; } if (opinfo & OF_STR2) { R.s = getvar_s(R.v); debug_printf_eval("R.s:'%s'\n", R.s); } debug_printf_eval("switch(0x%x)\n", XC(opinfo & OPCLSMASK)); switch (XC(opinfo & OPCLSMASK)) { /* -- iterative node type -- */ /* test pattern */ case XC( OC_TEST ): debug_printf_eval("TEST\n"); if ((op1->info & OPCLSMASK) == OC_COMMA) { /* it's range pattern */ if ((opinfo & OF_CHECKED) || ptest(op1->l.n)) { op->info |= OF_CHECKED; if (ptest(op1->r.n)) op->info &= ~OF_CHECKED; op = op->a.n; } else { op = op->r.n; } } else { op = ptest(op1) ? op->a.n : op->r.n; } break; /* just evaluate an expression, also used as unconditional jump */ case XC( OC_EXEC ): debug_printf_eval("EXEC\n"); break; /* branch, used in if-else and various loops */ case XC( OC_BR ): debug_printf_eval("BR\n"); op = istrue(L.v) ? 
op->a.n : op->r.n; break; /* initialize for-in loop */ case XC( OC_WALKINIT ): debug_printf_eval("WALKINIT\n"); hashwalk_init(L.v, iamarray(R.v)); break; /* get next array item */ case XC( OC_WALKNEXT ): debug_printf_eval("WALKNEXT\n"); op = hashwalk_next(L.v) ? op->a.n : op->r.n; break; case XC( OC_PRINT ): debug_printf_eval("PRINT /\n"); case XC( OC_PRINTF ): debug_printf_eval("PRINTF\n"); { FILE *F = stdout; IF_FEATURE_AWK_GNU_EXTENSIONS(int len;) if (op->r.n) { rstream *rsm = newfile(R.s); if (!rsm->F) { if (opn == '|') { rsm->F = popen(R.s, "w"); if (rsm->F == NULL) bb_simple_perror_msg_and_die("popen"); rsm->is_pipe = 1; } else { rsm->F = xfopen(R.s, opn=='w' ? "w" : "a"); } } F = rsm->F; } if ((opinfo & OPCLSMASK) == OC_PRINT) { if (!op1) { fputs(getvar_s(intvar[F0]), F); } else { for (;;) { var *v = evaluate(nextarg(&op1), TMPVAR0); if (v->type & VF_NUMBER) { fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[OFMT]), getvar_i(v), TRUE); fputs(g_buf, F); } else { fputs(getvar_s(v), F); } if (!op1) break; fputs(getvar_s(intvar[OFS]), F); } } fputs(getvar_s(intvar[ORS]), F); } else { /* OC_PRINTF */ char *s = awk_printf(op1, &len); #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS fwrite(s, len, 1, F); #else fputs(s, F); #endif free(s); } fflush(F); break; } /* case XC( OC_DELETE ): - moved to happen before arg evaluation */ case XC( OC_NEWSOURCE ): debug_printf_eval("NEWSOURCE\n"); g_progname = op->l.new_progname; break; case XC( OC_RETURN ): debug_printf_eval("RETURN\n"); copyvar(res, L.v); break; case XC( OC_NEXTFILE ): debug_printf_eval("NEXTFILE\n"); nextfile = TRUE; case XC( OC_NEXT ): debug_printf_eval("NEXT\n"); nextrec = TRUE; case XC( OC_DONE ): debug_printf_eval("DONE\n"); clrvar(res); break; case XC( OC_EXIT ): debug_printf_eval("EXIT\n"); awk_exit(L_d); /* -- recursive node type -- */ case XC( OC_VAR ): debug_printf_eval("VAR\n"); L.v = op->l.v; if (L.v == intvar[NF]) split_f0(); goto v_cont; case XC( OC_FNARG ): debug_printf_eval("FNARG[%d]\n", op->l.aidx); L.v = 
&fnargs[op->l.aidx]; v_cont: res = op->r.n ? findvar(iamarray(L.v), R.s) : L.v; break; case XC( OC_IN ): debug_printf_eval("IN\n"); setvar_i(res, hash_search(iamarray(R.v), L.s) ? 1 : 0); break; case XC( OC_REGEXP ): debug_printf_eval("REGEXP\n"); op1 = op; L.s = getvar_s(intvar[F0]); goto re_cont; case XC( OC_MATCH ): debug_printf_eval("MATCH\n"); op1 = op->r.n; re_cont: { regex_t *re = as_regex(op1, &sreg); int i = regexec(re, L.s, 0, NULL, 0); if (re == &sreg) regfree(re); setvar_i(res, (i == 0) ^ (opn == '!')); } break; case XC( OC_MOVE ): debug_printf_eval("MOVE\n"); /* if source is a temporary string, jusk relink it to dest */ //Disabled: if R.v is numeric but happens to have cached R.v->string, //then L.v ends up being a string, which is wrong // if (R.v == TMPVAR1 && R.v->string) { // res = setvar_p(L.v, R.v->string); // R.v->string = NULL; // } else { res = copyvar(L.v, R.v); // } break; case XC( OC_TERNARY ): debug_printf_eval("TERNARY\n"); if ((op->r.n->info & OPCLSMASK) != OC_COLON) syntax_error(EMSG_POSSIBLE_ERROR); res = evaluate(istrue(L.v) ? op->r.n->l.n : op->r.n->r.n, res); break; case XC( OC_FUNC ): { var *argvars, *sv_fnargs; const char *sv_progname; int nargs, i; debug_printf_eval("FUNC\n"); if (!op->r.f->defined) syntax_error(EMSG_UNDEF_FUNC); /* The body might be empty, still has to eval the args */ nargs = op->r.f->nargs; argvars = nvalloc(nargs); i = 0; while (op1) { var *arg = evaluate(nextarg(&op1), TMPVAR0); if (i == nargs) { /* call with more arguments than function takes. * (gawk warns: "warning: function 'f' called with more arguments than declared"). 
* They are still evaluated, but discarded: */ clrvar(arg); continue; } copyvar(&argvars[i], arg); argvars[i].type |= VF_CHILD; argvars[i].x.parent = arg; i++; } sv_fnargs = fnargs; sv_progname = g_progname; fnargs = argvars; res = evaluate(op->r.f->body.first, res); nvfree(argvars, nargs); g_progname = sv_progname; fnargs = sv_fnargs; break; } case XC( OC_GETLINE ): debug_printf_eval("GETLINE /\n"); case XC( OC_PGETLINE ): debug_printf_eval("PGETLINE\n"); { rstream *rsm; int i; if (op1) { rsm = newfile(L.s); if (!rsm->F) { if ((opinfo & OPCLSMASK) == OC_PGETLINE) { rsm->F = popen(L.s, "r"); rsm->is_pipe = TRUE; } else { rsm->F = fopen_for_read(L.s); /* not xfopen! */ } } } else { if (!iF) iF = next_input_file(); rsm = iF; } if (!rsm || !rsm->F) { setvar_i(intvar[ERRNO], errno); setvar_i(res, -1); break; } if (!op->r.n) R.v = intvar[F0]; i = awk_getline(rsm, R.v); if (i > 0 && !op1) { incvar(intvar[FNR]); incvar(intvar[NR]); } setvar_i(res, i); break; } /* simple builtins */ case XC( OC_FBLTIN ): { double R_d = R_d; /* for compiler */ debug_printf_eval("FBLTIN\n"); switch (opn) { case F_in: R_d = (long long)L_d; break; case F_rn: R_d = (double)rand() / (double)RAND_MAX; break; case F_co: if (ENABLE_FEATURE_AWK_LIBM) { R_d = cos(L_d); break; } case F_ex: if (ENABLE_FEATURE_AWK_LIBM) { R_d = exp(L_d); break; } case F_lg: if (ENABLE_FEATURE_AWK_LIBM) { R_d = log(L_d); break; } case F_si: if (ENABLE_FEATURE_AWK_LIBM) { R_d = sin(L_d); break; } case F_sq: if (ENABLE_FEATURE_AWK_LIBM) { R_d = sqrt(L_d); break; } syntax_error(EMSG_NO_MATH); break; case F_sr: R_d = (double)seed; seed = op1 ? 
(unsigned)L_d : (unsigned)time(NULL); srand(seed); break; case F_ti: R_d = time(NULL); break; case F_le: debug_printf_eval("length: L.s:'%s'\n", L.s); if (!op1) { L.s = getvar_s(intvar[F0]); debug_printf_eval("length: L.s='%s'\n", L.s); } else if (L.v->type & VF_ARRAY) { R_d = L.v->x.array->nel; debug_printf_eval("length: array_len:%d\n", L.v->x.array->nel); break; } R_d = strlen(L.s); break; case F_sy: fflush_all(); R_d = (ENABLE_FEATURE_ALLOW_EXEC && L.s && *L.s) ? (system(L.s) >> 8) : 0; break; case F_ff: if (!op1) { fflush(stdout); } else if (L.s && *L.s) { rstream *rsm = newfile(L.s); fflush(rsm->F); } else { fflush_all(); } break; case F_cl: { rstream *rsm; int err = 0; rsm = (rstream *)hash_search(fdhash, L.s); debug_printf_eval("OC_FBLTIN F_cl rsm:%p\n", rsm); if (rsm) { debug_printf_eval("OC_FBLTIN F_cl " "rsm->is_pipe:%d, ->F:%p\n", rsm->is_pipe, rsm->F); /* Can be NULL if open failed. Example: * getline line <"doesnt_exist"; * close("doesnt_exist"); <--- here rsm->F is NULL */ if (rsm->F) err = rsm->is_pipe ? 
pclose(rsm->F) : fclose(rsm->F); free(rsm->buffer); hash_remove(fdhash, L.s); } if (err) setvar_i(intvar[ERRNO], errno); R_d = (double)err; break; } } /* switch */ setvar_i(res, R_d); break; } case XC( OC_BUILTIN ): debug_printf_eval("BUILTIN\n"); res = exec_builtin(op, res); break; case XC( OC_SPRINTF ): debug_printf_eval("SPRINTF\n"); setvar_p(res, awk_printf(op1, NULL)); break; case XC( OC_UNARY ): debug_printf_eval("UNARY\n"); { double Ld, R_d; Ld = R_d = getvar_i(R.v); switch (opn) { case 'P': Ld = ++R_d; goto r_op_change; case 'p': R_d++; goto r_op_change; case 'M': Ld = --R_d; goto r_op_change; case 'm': R_d--; r_op_change: setvar_i(R.v, R_d); break; case '!': Ld = !istrue(R.v); break; case '-': Ld = -R_d; break; } setvar_i(res, Ld); break; } case XC( OC_FIELD ): debug_printf_eval("FIELD\n"); { int i = (int)getvar_i(R.v); if (i < 0) syntax_error(EMSG_NEGATIVE_FIELD); if (i == 0) { res = intvar[F0]; } else { split_f0(); if (i > nfields) fsrealloc(i); res = &Fields[i - 1]; } break; } /* concatenation (" ") and index joining (",") */ case XC( OC_CONCAT ): debug_printf_eval("CONCAT /\n"); case XC( OC_COMMA ): { const char *sep = ""; debug_printf_eval("COMMA\n"); if ((opinfo & OPCLSMASK) == OC_COMMA) sep = getvar_s(intvar[SUBSEP]); setvar_p(res, xasprintf("%s%s%s", L.s, sep, R.s)); break; } case XC( OC_LAND ): debug_printf_eval("LAND\n"); setvar_i(res, istrue(L.v) ? ptest(op->r.n) : 0); break; case XC( OC_LOR ): debug_printf_eval("LOR\n"); setvar_i(res, istrue(L.v) ? 
1 : ptest(op->r.n)); break; case XC( OC_BINARY ): debug_printf_eval("BINARY /\n"); case XC( OC_REPLACE ): debug_printf_eval("REPLACE\n"); { double R_d = getvar_i(R.v); debug_printf_eval("R_d:%f opn:%c\n", R_d, opn); switch (opn) { case '+': L_d += R_d; break; case '-': L_d -= R_d; break; case '*': L_d *= R_d; break; case '/': if (R_d == 0) syntax_error(EMSG_DIV_BY_ZERO); L_d /= R_d; break; case '&': if (ENABLE_FEATURE_AWK_LIBM) L_d = pow(L_d, R_d); else syntax_error(EMSG_NO_MATH); break; case '%': if (R_d == 0) syntax_error(EMSG_DIV_BY_ZERO); L_d -= (long long)(L_d / R_d) * R_d; break; } debug_printf_eval("BINARY/REPLACE result:%f\n", L_d); res = setvar_i(((opinfo & OPCLSMASK) == OC_BINARY) ? res : L.v, L_d); break; } case XC( OC_COMPARE ): { int i = i; /* for compiler */ double Ld; debug_printf_eval("COMPARE\n"); if (is_numeric(L.v) && is_numeric(R.v)) { Ld = getvar_i(L.v) - getvar_i(R.v); } else { const char *l = getvar_s(L.v); const char *r = getvar_s(R.v); Ld = icase ? strcasecmp(l, r) : strcmp(l, r); } switch (opn & 0xfe) { case 0: i = (Ld > 0); break; case 2: i = (Ld >= 0); break; case 4: i = (Ld == 0); break; } setvar_i(res, (i == 0) ^ (opn & 1)); break; } default: syntax_error(EMSG_POSSIBLE_ERROR); } /* switch */ next: if ((opinfo & OPCLSMASK) <= SHIFT_TIL_THIS) op = op->a.n; if ((opinfo & OPCLSMASK) >= RECUR_FROM_THIS) break; if (nextrec) break; } /* while (op) */ nvfree(tmpvars, 2); #undef TMPVAR0 #undef TMPVAR1 debug_printf_eval("returning from %s(): %p\n", __func__, res); return res; #undef fnargs #undef seed #undef sreg } /* -------- main & co. 
-------- */ static int awk_exit(int r) { unsigned i; if (!exiting) { var tv; exiting = TRUE; nextrec = FALSE; zero_out_var(&tv); evaluate(endseq.first, &tv); } /* waiting for children */ for (i = 0; i < fdhash->csize; i++) { hash_item *hi; hi = fdhash->items[i]; while (hi) { if (hi->data.rs.F && hi->data.rs.is_pipe) pclose(hi->data.rs.F); hi = hi->next; } } exit(r); } /* if expr looks like "var=value", perform assignment and return 1, * otherwise return 0 */ static int is_assignment(const char *expr) { char *exprc, *val; if (!isalnum_(*expr) || (val = strchr(expr, '=')) == NULL) { return FALSE; } exprc = xstrdup(expr); val = exprc + (val - expr); *val++ = '\0'; unescape_string_in_place(val); setvar_u(newvar(exprc), val); free(exprc); return TRUE; } /* switch to next input file */ static rstream *next_input_file(void) { #define rsm (G.next_input_file__rsm) #define files_happen (G.next_input_file__files_happen) FILE *F; const char *fname, *ind; if (rsm.F) fclose(rsm.F); rsm.F = NULL; rsm.pos = rsm.adv = 0; for (;;) { if (getvar_i(intvar[ARGIND])+1 >= getvar_i(intvar[ARGC])) { if (files_happen) return NULL; fname = "-"; F = stdin; break; } ind = getvar_s(incvar(intvar[ARGIND])); fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind)); if (fname && *fname && !is_assignment(fname)) { F = xfopen_stdin(fname); break; } } files_happen = TRUE; setvar_s(intvar[FILENAME], fname); rsm.F = F; return &rsm; #undef rsm #undef files_happen } int awk_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int awk_main(int argc UNUSED_PARAM, char **argv) { unsigned opt; char *opt_F; llist_t *list_v = NULL; llist_t *list_f = NULL; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS llist_t *list_e = NULL; #endif int i; var tv; INIT_G(); /* Undo busybox.c, or else strtod may eat ','! 
This breaks parsing: * $1,$2 == '$1,' '$2', NOT '$1' ',' '$2' */ if (ENABLE_LOCALE_SUPPORT) setlocale(LC_NUMERIC, "C"); /* initialize variables */ vhash = hash_init(); { char *vnames = (char *)vNames; /* cheat */ char *vvalues = (char *)vValues; for (i = 0; *vnames; i++) { var *v; intvar[i] = v = newvar(nextword(&vnames)); if (*vvalues != '\377') setvar_s(v, nextword(&vvalues)); else setvar_i(v, 0); if (*vnames == '*') { v->type |= VF_SPECIAL; vnames++; } } } handle_special(intvar[FS]); handle_special(intvar[RS]); /* Huh, people report that sometimes environ is NULL. Oh well. */ if (environ) { char **envp; for (envp = environ; *envp; envp++) { /* environ is writable, thus we don't strdup it needlessly */ char *s = *envp; char *s1 = strchr(s, '='); if (s1) { *s1 = '\0'; /* Both findvar and setvar_u take const char* * as 2nd arg -> environment is not trashed */ setvar_u(findvar(iamarray(intvar[ENVIRON]), s), s1 + 1); *s1 = '='; } } } opt = getopt32(argv, OPTSTR_AWK, &opt_F, &list_v, &list_f, IF_FEATURE_AWK_GNU_EXTENSIONS(&list_e,) NULL); argv += optind; //argc -= optind; if (opt & OPT_W) bb_simple_error_msg("warning: option -W is ignored"); if (opt & OPT_F) { unescape_string_in_place(opt_F); setvar_s(intvar[FS], opt_F); } while (list_v) { if (!is_assignment(llist_pop(&list_v))) bb_show_usage(); } /* Parse all supplied programs */ fnhash = hash_init(); ahash = hash_init(); while (list_f) { int fd; char *s; g_progname = llist_pop(&list_f); fd = xopen_stdin(g_progname); s = xmalloc_read(fd, NULL); /* it's NUL-terminated */ close(fd); parse_program(s); free(s); } g_progname = "cmd. 
line"; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS while (list_e) { parse_program(llist_pop(&list_e)); } #endif if (!(opt & (OPT_f | OPT_e))) { if (!*argv) bb_show_usage(); parse_program(*argv++); } /* Free unused parse structures */ //hash_free(fnhash); // ~250 bytes when empty, used only for function names //^^^^^^^^^^^^^^^^^ does not work, hash_clear() inside SEGVs // (IOW: hash_clear() assumes it's a hash of variables. fnhash is not). free(fnhash->items); free(fnhash); fnhash = NULL; // debug //hash_free(ahash); // empty after parsing, will reuse as fdhash instead of freeing /* Parsing done, on to executing */ /* fill in ARGV array */ setari_u(intvar[ARGV], 0, "awk"); i = 0; while (*argv) setari_u(intvar[ARGV], ++i, *argv++); setvar_i(intvar[ARGC], i + 1); //fdhash = ahash; // done via define newfile("/dev/stdin")->F = stdin; newfile("/dev/stdout")->F = stdout; newfile("/dev/stderr")->F = stderr; zero_out_var(&tv); evaluate(beginseq.first, &tv); if (!mainseq.first && !endseq.first) awk_exit(EXIT_SUCCESS); /* input file could already be opened in BEGIN block */ if (!iF) iF = next_input_file(); /* passing through input files */ while (iF) { nextfile = FALSE; setvar_i(intvar[FNR], 0); while ((i = awk_getline(iF, intvar[F0])) > 0) { nextrec = FALSE; incvar(intvar[NR]); incvar(intvar[FNR]); evaluate(mainseq.first, &tv); if (nextfile) break; } if (i < 0) syntax_error(strerror(errno)); iF = next_input_file(); } awk_exit(EXIT_SUCCESS); /*return 0;*/ }