From fa24499735018e2cb81a6a76aa8b5e72695fb8ad Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Mon, 27 May 2024 16:56:43 -0400 Subject: Implement the remaining regex types Closes: https://github.com/tavianator/bfs/issues/21 --- src/parse.c | 32 +++++++++++++++++----- src/xregex.c | 41 +++++++++++++++++++++++++++++ src/xregex.h | 4 +++ tests/gnu/regextype_awk.out | 2 ++ tests/gnu/regextype_awk.sh | 3 +++ tests/gnu/regextype_egrep.out | 0 tests/gnu/regextype_egrep.sh | 3 +++ tests/gnu/regextype_findutils_default.out | 3 +++ tests/gnu/regextype_findutils_default.sh | 3 +++ tests/gnu/regextype_gnu_awk.out | 2 ++ tests/gnu/regextype_gnu_awk.sh | 3 +++ tests/gnu/regextype_posix_awk.out | 2 ++ tests/gnu/regextype_posix_awk.sh | 3 +++ tests/gnu/regextype_posix_minimal_basic.out | 1 + tests/gnu/regextype_posix_minimal_basic.sh | 2 ++ 15 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 tests/gnu/regextype_awk.out create mode 100644 tests/gnu/regextype_awk.sh create mode 100644 tests/gnu/regextype_egrep.out create mode 100644 tests/gnu/regextype_egrep.sh create mode 100644 tests/gnu/regextype_findutils_default.out create mode 100644 tests/gnu/regextype_findutils_default.sh create mode 100644 tests/gnu/regextype_gnu_awk.out create mode 100644 tests/gnu/regextype_gnu_awk.sh create mode 100644 tests/gnu/regextype_posix_awk.out create mode 100644 tests/gnu/regextype_posix_awk.sh create mode 100644 tests/gnu/regextype_posix_minimal_basic.out create mode 100644 tests/gnu/regextype_posix_minimal_basic.sh diff --git a/src/parse.c b/src/parse.c index 539aa05..58a0209 100644 --- a/src/parse.c +++ b/src/parse.c @@ -2253,16 +2253,27 @@ static struct bfs_expr *parse_regextype(struct bfs_parser *parser, int arg1, int // See https://www.gnu.org/software/gnulib/manual/html_node/Predefined-Syntaxes.html const char *type = expr->argv[1]; if (strcmp(type, "posix-basic") == 0 + || strcmp(type, "posix-minimal-basic") == 0 || strcmp(type, "ed") == 0 || strcmp(type, "sed") == 0) { parser->regex_type = BFS_REGEX_POSIX_BASIC; } else if (strcmp(type, "posix-extended") == 0) { parser->regex_type = BFS_REGEX_POSIX_EXTENDED; #if BFS_WITH_ONIGURUMA + } else if (strcmp(type, "awk") == 0 + || strcmp(type, "posix-awk") == 0) { + parser->regex_type = BFS_REGEX_AWK; + } else if (strcmp(type, "gnu-awk") == 0) { + parser->regex_type = BFS_REGEX_GNU_AWK; } else if (strcmp(type, "emacs") == 0) { parser->regex_type = BFS_REGEX_EMACS; } else if (strcmp(type, "grep") == 0) { parser->regex_type = BFS_REGEX_GREP; + } else if (strcmp(type, "egrep") == 0 + || strcmp(type, "posix-egrep") == 0) { + parser->regex_type = BFS_REGEX_EGREP; + } else if (strcmp(type, "findutils-default") == 0) { + parser->regex_type = BFS_REGEX_GNU_FIND; #endif } else if (strcmp(type, "help") == 0) { parser->just_info = true; @@ -2277,14 +2288,23 @@ static struct bfs_expr *parse_regextype(struct bfs_parser *parser, int arg1, int list_types: cfprintf(cfile, "Supported types are:\n\n"); - cfprintf(cfile, " ${bld}posix-basic${rs}: POSIX basic regular expressions (BRE)\n"); - cfprintf(cfile, " ${bld}posix-extended${rs}: POSIX extended regular expressions (ERE)\n"); - cfprintf(cfile, " ${bld}ed${rs}: Like ${grn}ed${rs} (same as ${bld}posix-basic${rs})\n"); + cfprintf(cfile, " ${bld}posix-basic${rs}: POSIX basic regular expressions (BRE)\n"); + cfprintf(cfile, " ${bld}ed${rs}: Like ${grn}ed${rs} (same as ${bld}posix-basic${rs})\n"); + cfprintf(cfile, " ${bld}sed${rs}: Like ${grn}sed${rs} (same as ${bld}posix-basic${rs})\n\n"); + + cfprintf(cfile, " ${bld}posix-extended${rs}: POSIX extended regular expressions (ERE)\n\n"); + #if BFS_WITH_ONIGURUMA - cfprintf(cfile, " ${bld}emacs${rs}: Like ${grn}emacs${rs}\n"); - cfprintf(cfile, " ${bld}grep${rs}: Like ${grn}grep${rs}\n"); + cfprintf(cfile, " [${bld}posix-${rs}]${bld}awk${rs}: Like ${grn}awk${rs}\n"); + cfprintf(cfile, " ${bld}gnu-awk${rs}: Like GNU ${grn}awk${rs}\n\n"); + + cfprintf(cfile, " ${bld}emacs${rs}: Like ${grn}emacs${rs}\n\n"); + + cfprintf(cfile, " ${bld}grep${rs}: Like ${grn}grep${rs}\n"); + cfprintf(cfile, " [${bld}posix-${rs}]${bld}egrep${rs}: Like ${grn}grep${rs} ${cyn}-E${rs}\n\n"); + + cfprintf(cfile, " ${bld}findutils-default${rs}: Like GNU ${grn}find${rs}\n"); #endif - cfprintf(cfile, " ${bld}sed${rs}: Like ${grn}sed${rs} (same as ${bld}posix-basic${rs})\n"); return NULL; } diff --git a/src/xregex.c b/src/xregex.c index 414c366..2d089b2 100644 --- a/src/xregex.c +++ b/src/xregex.c @@ -37,7 +37,11 @@ struct bfs_regex { static int bfs_onig_status; static OnigEncoding bfs_onig_enc; +static OnigSyntaxType bfs_onig_syntax_awk; +static OnigSyntaxType bfs_onig_syntax_gnu_awk; static OnigSyntaxType bfs_onig_syntax_emacs; +static OnigSyntaxType bfs_onig_syntax_egrep; +static OnigSyntaxType bfs_onig_syntax_gnu_find; /** pthread_once() callback. */ static void bfs_onig_once(void) { @@ -106,9 +110,34 @@ static void bfs_onig_once(void) { bfs_onig_enc = NULL; } + // Compute the GNU extensions + OnigSyntaxType *ere = ONIG_SYNTAX_POSIX_EXTENDED; + OnigSyntaxType *gnu = ONIG_SYNTAX_GNU_REGEX; + unsigned int gnu_op = gnu->op & ~ere->op; + unsigned int gnu_op2 = gnu->op2 & ~ere->op2; + unsigned int gnu_behavior = gnu->behavior & ~ere->behavior; + + onig_copy_syntax(&bfs_onig_syntax_awk, ONIG_SYNTAX_POSIX_EXTENDED); + bfs_onig_syntax_awk.behavior |= ONIG_SYN_ALLOW_INVALID_INTERVAL; + bfs_onig_syntax_awk.behavior |= ONIG_SYN_BACKSLASH_ESCAPE_IN_CC; + + onig_copy_syntax(&bfs_onig_syntax_gnu_awk, &bfs_onig_syntax_awk); + bfs_onig_syntax_gnu_awk.op |= gnu_op; + bfs_onig_syntax_gnu_awk.op2 |= gnu_op2; + bfs_onig_syntax_gnu_awk.behavior |= gnu_behavior; + bfs_onig_syntax_gnu_awk.behavior &= ~ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS; + bfs_onig_syntax_gnu_awk.behavior &= ~ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS; + // https://github.com/kkos/oniguruma/issues/296 onig_copy_syntax(&bfs_onig_syntax_emacs, ONIG_SYNTAX_EMACS); bfs_onig_syntax_emacs.op2 |= ONIG_SYN_OP2_QMARK_GROUP_EFFECT; + + onig_copy_syntax(&bfs_onig_syntax_egrep, ONIG_SYNTAX_POSIX_EXTENDED); + bfs_onig_syntax_egrep.behavior |= ONIG_SYN_ALLOW_INVALID_INTERVAL; + bfs_onig_syntax_egrep.behavior &= ~ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS; + + onig_copy_syntax(&bfs_onig_syntax_gnu_find, &bfs_onig_syntax_emacs); + bfs_onig_syntax_gnu_find.options |= ONIG_OPTION_MULTILINE; } /** Initialize Oniguruma. */ @@ -149,12 +178,24 @@ int bfs_regcomp(struct bfs_regex **preg, const char *pattern, enum bfs_regex_typ case BFS_REGEX_POSIX_EXTENDED: syntax = ONIG_SYNTAX_POSIX_EXTENDED; break; + case BFS_REGEX_AWK: + syntax = &bfs_onig_syntax_awk; + break; + case BFS_REGEX_GNU_AWK: + syntax = &bfs_onig_syntax_gnu_awk; + break; case BFS_REGEX_EMACS: syntax = &bfs_onig_syntax_emacs; break; case BFS_REGEX_GREP: syntax = ONIG_SYNTAX_GREP; break; + case BFS_REGEX_EGREP: + syntax = &bfs_onig_syntax_egrep; + break; + case BFS_REGEX_GNU_FIND: + syntax = &bfs_onig_syntax_gnu_find; + break; } bfs_assert(syntax, "Invalid regex type"); diff --git a/src/xregex.h b/src/xregex.h index 998a2b0..750db24 100644 --- a/src/xregex.h +++ b/src/xregex.h @@ -15,8 +15,12 @@ struct bfs_regex; enum bfs_regex_type { BFS_REGEX_POSIX_BASIC, BFS_REGEX_POSIX_EXTENDED, + BFS_REGEX_AWK, + BFS_REGEX_GNU_AWK, BFS_REGEX_EMACS, BFS_REGEX_GREP, + BFS_REGEX_EGREP, + BFS_REGEX_GNU_FIND, }; /** diff --git a/tests/gnu/regextype_awk.out b/tests/gnu/regextype_awk.out new file mode 100644 index 0000000..0f32fc4 --- /dev/null +++ b/tests/gnu/regextype_awk.out @@ -0,0 +1,2 @@ +weirdnames/*/m +weirdnames/[/k diff --git a/tests/gnu/regextype_awk.sh b/tests/gnu/regextype_awk.sh new file mode 100644 index 0000000..3718473 --- /dev/null +++ b/tests/gnu/regextype_awk.sh @@ -0,0 +1,3 @@ +invoke_bfs -regextype awk -quit || skip + +bfs_diff weirdnames -regextype awk -regex '.*/[\[\*]/.*' diff --git a/tests/gnu/regextype_egrep.out b/tests/gnu/regextype_egrep.out new file mode 100644 index 0000000..e69de29 diff --git a/tests/gnu/regextype_egrep.sh b/tests/gnu/regextype_egrep.sh new file mode 100644 index 0000000..281d9c0 --- /dev/null +++ b/tests/gnu/regextype_egrep.sh @@ -0,0 +1,3 @@ +invoke_bfs -regextype egrep -quit || skip + +bfs_diff weirdnames -regextype egrep -regex '*.*/{l' diff --git a/tests/gnu/regextype_findutils_default.out b/tests/gnu/regextype_findutils_default.out new file mode 100644 index 0000000..709a7ba --- /dev/null +++ b/tests/gnu/regextype_findutils_default.out @@ -0,0 +1,3 @@ +/n +weirdnames/ +weirdnames/*/m diff --git a/tests/gnu/regextype_findutils_default.sh b/tests/gnu/regextype_findutils_default.sh new file mode 100644 index 0000000..c870312 --- /dev/null +++ b/tests/gnu/regextype_findutils_default.sh @@ -0,0 +1,3 @@ +invoke_bfs -regextype findutils-default -quit || skip + +bfs_diff weirdnames -regextype findutils-default -regex '.*/./\(m\|n\)' diff --git a/tests/gnu/regextype_gnu_awk.out b/tests/gnu/regextype_gnu_awk.out new file mode 100644 index 0000000..0f32fc4 --- /dev/null +++ b/tests/gnu/regextype_gnu_awk.out @@ -0,0 +1,2 @@ +weirdnames/*/m +weirdnames/[/k diff --git a/tests/gnu/regextype_gnu_awk.sh b/tests/gnu/regextype_gnu_awk.sh new file mode 100644 index 0000000..6b66496 --- /dev/null +++ b/tests/gnu/regextype_gnu_awk.sh @@ -0,0 +1,3 @@ +invoke_bfs -regextype gnu-awk -quit || skip + +bfs_diff weirdnames -regextype gnu-awk -regex '.*/[\[\*]/(\<.\>)' diff --git a/tests/gnu/regextype_posix_awk.out b/tests/gnu/regextype_posix_awk.out new file mode 100644 index 0000000..0f32fc4 --- /dev/null +++ b/tests/gnu/regextype_posix_awk.out @@ -0,0 +1,2 @@ +weirdnames/*/m +weirdnames/[/k diff --git a/tests/gnu/regextype_posix_awk.sh b/tests/gnu/regextype_posix_awk.sh new file mode 100644 index 0000000..86377d7 --- /dev/null +++ b/tests/gnu/regextype_posix_awk.sh @@ -0,0 +1,3 @@ +invoke_bfs -regextype posix-awk -quit || skip + +bfs_diff weirdnames -regextype posix-awk -regex '.*/[\[\*]/.*' diff --git a/tests/gnu/regextype_posix_minimal_basic.out b/tests/gnu/regextype_posix_minimal_basic.out new file mode 100644 index 0000000..0f0971e --- /dev/null +++ b/tests/gnu/regextype_posix_minimal_basic.out @@ -0,0 +1 @@ +./( diff --git a/tests/gnu/regextype_posix_minimal_basic.sh b/tests/gnu/regextype_posix_minimal_basic.sh new file mode 100644 index 0000000..ee324f3 --- /dev/null +++ b/tests/gnu/regextype_posix_minimal_basic.sh @@ -0,0 +1,2 @@ +cd weirdnames +bfs_diff -regextype posix-minimal-basic -regex '\./\((\)' -- cgit v1.2.3