summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- regex.c 91
-rwxr-xr-x tests.sh 12
-rw-r--r-- tests/test_regex_invalid_utf8.out 1
3 files changed, 94 insertions, 10 deletions
diff --git a/regex.c b/regex.c
index 642e357..0852792 100644
--- a/regex.c
+++ b/regex.c
@@ -21,6 +21,7 @@
#include <string.h>
#if BFS_WITH_ONIGURUMA
+# include <langinfo.h>
# include <oniguruma.h>
#else
# include <regex.h>
@@ -34,19 +35,84 @@ struct bfs_regex {
#endif
};
-struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum bfs_regcomp_flags flags, int *err) {
#if BFS_WITH_ONIGURUMA
- static bool onig_initialized = false;
- if (!onig_initialized) {
- OnigEncoding encs[] = {ONIG_ENCODING_UTF8};
- *err = onig_initialize(encs, sizeof(encs)/sizeof(encs[0]));
- if (*err != ONIG_NORMAL) {
- return NULL;
- }
- onig_initialized = true;
+/**
+ * Get (and initialize) the appropriate Oniguruma encoding for the current locale.
+ *
+ * The chosen encoding is cached in a function-local static, so the charmap
+ * lookup and onig_initialize() are not repeated once a call has succeeded.
+ *
+ * @param err
+ *         Receives the return value of onig_initialize() whenever
+ *         initialization is attempted.  It is left untouched when the cached
+ *         encoding is returned.
+ * @return
+ *         The selected encoding, or NULL if onig_initialize() failed (the
+ *         cache is cleared in that case, so the next call tries again).
+ */
+static OnigEncoding bfs_onig_encoding(int *err) {
+	static OnigEncoding enc = NULL;
+	if (enc) {
+		return enc;
 	}
+
+	// Fall back to ASCII by default
+	enc = ONIG_ENCODING_ASCII;
+
+	// Oniguruma has no locale support, so try to guess the right encoding
+	// from the current locale.
+	const char *charmap = nl_langinfo(CODESET);
+	if (charmap) {
+		// Select `value` if the locale's charmap is exactly `name`
+#define BFS_MAP_ENCODING(name, value) \
+		do { \
+			if (strcmp(charmap, name) == 0) { \
+				enc = value; \
+			} \
+		} while (0)
+		// Same, for charmaps that are spelled two different ways
+#define BFS_MAP_ENCODING2(name1, name2, value) \
+		do { \
+			BFS_MAP_ENCODING(name1, value); \
+			BFS_MAP_ENCODING(name2, value); \
+		} while (0)
+
+		// These names were found with locale -m on Linux and FreeBSD
+#define BFS_MAP_ISO_8859(n) \
+		BFS_MAP_ENCODING2("ISO-8859-" #n, "ISO8859-" #n, ONIG_ENCODING_ISO_8859_ ## n)
+
+		BFS_MAP_ISO_8859(1);
+		BFS_MAP_ISO_8859(2);
+		BFS_MAP_ISO_8859(3);
+		BFS_MAP_ISO_8859(4);
+		BFS_MAP_ISO_8859(5);
+		BFS_MAP_ISO_8859(6);
+		BFS_MAP_ISO_8859(7);
+		BFS_MAP_ISO_8859(8);
+		BFS_MAP_ISO_8859(9);
+		BFS_MAP_ISO_8859(10);
+		BFS_MAP_ISO_8859(11);
+		// BFS_MAP_ISO_8859(12);
+		BFS_MAP_ISO_8859(13);
+		BFS_MAP_ISO_8859(14);
+		BFS_MAP_ISO_8859(15);
+		BFS_MAP_ISO_8859(16);
+
+		BFS_MAP_ENCODING("UTF-8", ONIG_ENCODING_UTF8);
+
+#define BFS_MAP_EUC(name) \
+		BFS_MAP_ENCODING2("EUC-" #name, "euc" #name, ONIG_ENCODING_EUC_ ## name)
+
+		BFS_MAP_EUC(JP);
+		BFS_MAP_EUC(TW);
+		BFS_MAP_EUC(KR);
+		BFS_MAP_EUC(CN);
+
+		BFS_MAP_ENCODING2("SHIFT_JIS", "SJIS", ONIG_ENCODING_SJIS);
+
+		// BFS_MAP_ENCODING("KOI-8", ONIG_ENCODING_KOI8);
+		BFS_MAP_ENCODING("KOI8-R", ONIG_ENCODING_KOI8_R);
+
+		BFS_MAP_ENCODING("CP1251", ONIG_ENCODING_CP1251);
+
+		// GB18030 must use its own encoding: Big5 is a different,
+		// incompatible multibyte charset
+		BFS_MAP_ENCODING("GB18030", ONIG_ENCODING_GB18030);
+	}
+
+	*err = onig_initialize(&enc, 1);
+	if (*err != ONIG_NORMAL) {
+		// Forget the choice so that a later call retries initialization
+		enc = NULL;
+	}
+
+	return enc;
+}
#endif
+struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum bfs_regcomp_flags flags, int *err) {
struct bfs_regex *regex = malloc(sizeof(*regex));
if (!regex) {
#if BFS_WITH_ONIGURUMA
@@ -80,9 +146,14 @@ struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum b
options |= ONIG_OPTION_IGNORECASE;
}
+ OnigEncoding enc = bfs_onig_encoding(err);
+ if (!enc) {
+ goto fail;
+ }
+
const unsigned char *uexpr = (const unsigned char *)expr;
const unsigned char *end = uexpr + strlen(expr);
- *err = onig_new(&regex->impl, uexpr, end, options, ONIG_ENCODING_UTF8, syntax, NULL);
+ *err = onig_new(&regex->impl, uexpr, end, options, enc, syntax, NULL);
if (*err != ONIG_NORMAL) {
goto fail;
}
diff --git a/tests.sh b/tests.sh
index d7d9947..9003efd 100755
--- a/tests.sh
+++ b/tests.sh
@@ -625,6 +625,7 @@ gnu_tests=(
test_regex
test_regex_parens
test_regex_error
+ test_regex_invalid_utf8
test_regextype_posix_basic
test_regextype_posix_extended
@@ -2146,6 +2147,17 @@ function test_regex_error() {
fail quiet invoke_bfs basic -regex '['
}
+# Check -regex against file names that are not valid UTF-8.
+function test_regex_invalid_utf8() {
+ # Start from an empty scratch directory
+ rm -rf scratch/*
+
+ # Incomplete UTF-8 sequences: the names are 1, 2 and 3 bytes long, each a
+ # multi-byte sequence cut short before its final byte.
+ # NOTE(review): skip_if presumably skips the test when the file system
+ # refuses to create such names -- confirm against the skip_if helper.
+ skip_if fail quiet touch scratch/$'\xC3'
+ skip_if fail quiet touch scratch/$'\xE2\x84'
+ skip_if fail quiet touch scratch/$'\xF0\x9F\x92'
+
+ # '..' asks for exactly two characters after "scratch/"; the expected
+ # output (tests/test_regex_invalid_utf8.out) lists a single entry, which
+ # appears to be the two-byte name.
+ bfs_diff scratch -regex 'scratch/..'
+}
+
function test_E() {
cd weirdnames
bfs_diff -E . -regex '\./(\()'
diff --git a/tests/test_regex_invalid_utf8.out b/tests/test_regex_invalid_utf8.out
new file mode 100644
index 0000000..03f3f58
--- /dev/null
+++ b/tests/test_regex_invalid_utf8.out
@@ -0,0 +1 @@
+scratch/â„