summary refs log tree commit diff stats
path: root/regex.c
diff options
context:
space:
mode:
Diffstat (limited to 'regex.c')
-rw-r--r-- regex.c 91
1 files changed, 81 insertions, 10 deletions
diff --git a/regex.c b/regex.c
index 642e357..0852792 100644
--- a/regex.c
+++ b/regex.c
@@ -21,6 +21,7 @@
#include <string.h>
#if BFS_WITH_ONIGURUMA
+# include <langinfo.h>
# include <oniguruma.h>
#else
# include <regex.h>
@@ -34,19 +35,84 @@ struct bfs_regex {
#endif
};
-struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum bfs_regcomp_flags flags, int *err) {
#if BFS_WITH_ONIGURUMA
- static bool onig_initialized = false;
- if (!onig_initialized) {
- OnigEncoding encs[] = {ONIG_ENCODING_UTF8};
- *err = onig_initialize(encs, sizeof(encs)/sizeof(encs[0]));
- if (*err != ONIG_NORMAL) {
- return NULL;
- }
- onig_initialized = true;
+/**
+ * Get (and initialize) the appropriate Oniguruma encoding for the current
+ * locale.
+ *
+ * The charmap name reported by nl_langinfo(CODESET) is compared against a
+ * table of known names; if none match (or no name is reported), ASCII is
+ * used.  The chosen encoding is passed to onig_initialize() and then cached
+ * in a function-local static, so later calls return immediately.
+ *
+ * @param err
+ *         Receives the result of onig_initialize().  It is only written on
+ *         calls that actually attempt initialization, not on cached returns.
+ * @return
+ *         The selected encoding, or NULL if onig_initialize() failed.  On
+ *         failure the cache is cleared, so the next call tries again.
+ */
+static OnigEncoding bfs_onig_encoding(int *err) {
+	static OnigEncoding enc = NULL;
+	if (enc) {
+		return enc;
 	}
+
+	// Fall back to ASCII by default
+	enc = ONIG_ENCODING_ASCII;
+
+	// Oniguruma has no locale support, so try to guess the right encoding
+	// from the current locale.
+	const char *charmap = nl_langinfo(CODESET);
+	if (charmap) {
+#define BFS_MAP_ENCODING(name, value) \
+		do { \
+			if (strcmp(charmap, name) == 0) { \
+				enc = value; \
+			} \
+		} while (0)
+#define BFS_MAP_ENCODING2(name1, name2, value) \
+		do { \
+			BFS_MAP_ENCODING(name1, value); \
+			BFS_MAP_ENCODING(name2, value); \
+		} while (0)
+
+		// These names were found with locale -m on Linux and FreeBSD
+#define BFS_MAP_ISO_8859(n) \
+		BFS_MAP_ENCODING2("ISO-8859-" #n, "ISO8859-" #n, ONIG_ENCODING_ISO_8859_ ## n)
+
+		BFS_MAP_ISO_8859(1);
+		BFS_MAP_ISO_8859(2);
+		BFS_MAP_ISO_8859(3);
+		BFS_MAP_ISO_8859(4);
+		BFS_MAP_ISO_8859(5);
+		BFS_MAP_ISO_8859(6);
+		BFS_MAP_ISO_8859(7);
+		BFS_MAP_ISO_8859(8);
+		BFS_MAP_ISO_8859(9);
+		BFS_MAP_ISO_8859(10);
+		BFS_MAP_ISO_8859(11);
+		// BFS_MAP_ISO_8859(12);
+		BFS_MAP_ISO_8859(13);
+		BFS_MAP_ISO_8859(14);
+		BFS_MAP_ISO_8859(15);
+		BFS_MAP_ISO_8859(16);
+
+		BFS_MAP_ENCODING("UTF-8", ONIG_ENCODING_UTF8);
+
+#define BFS_MAP_EUC(name) \
+		BFS_MAP_ENCODING2("EUC-" #name, "euc" #name, ONIG_ENCODING_EUC_ ## name)
+
+		BFS_MAP_EUC(JP);
+		BFS_MAP_EUC(TW);
+		BFS_MAP_EUC(KR);
+		BFS_MAP_EUC(CN);
+
+		BFS_MAP_ENCODING2("SHIFT_JIS", "SJIS", ONIG_ENCODING_SJIS);
+
+		// BFS_MAP_ENCODING("KOI-8", ONIG_ENCODING_KOI8);
+		BFS_MAP_ENCODING("KOI8-R", ONIG_ENCODING_KOI8_R);
+
+		BFS_MAP_ENCODING("CP1251", ONIG_ENCODING_CP1251);
+
+		// GB18030 has its own Oniguruma encoding; it is not Big5
+		BFS_MAP_ENCODING("GB18030", ONIG_ENCODING_GB18030);
+	}
+
+	*err = onig_initialize(&enc, 1);
+	if (*err != ONIG_NORMAL) {
+		// Forget the choice so a later call retries initialization
+		enc = NULL;
+	}
+
+	return enc;
+}
#endif
+struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum bfs_regcomp_flags flags, int *err) {
struct bfs_regex *regex = malloc(sizeof(*regex));
if (!regex) {
#if BFS_WITH_ONIGURUMA
@@ -80,9 +146,14 @@ struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum b
options |= ONIG_OPTION_IGNORECASE;
}
+ OnigEncoding enc = bfs_onig_encoding(err);
+ if (!enc) {
+ goto fail;
+ }
+
const unsigned char *uexpr = (const unsigned char *)expr;
const unsigned char *end = uexpr + strlen(expr);
- *err = onig_new(&regex->impl, uexpr, end, options, ONIG_ENCODING_UTF8, syntax, NULL);
+ *err = onig_new(&regex->impl, uexpr, end, options, enc, syntax, NULL);
if (*err != ONIG_NORMAL) {
goto fail;
}