Skip to content

Commit 0ac676f

Browse files
committed
feature: added the "U" regex option to the ngx.re API, which enables the UTF-8 matching mode but disables the UTF-8 validity check on the subject strings. thanks Lance Li for the patch in openresty#227.
1 parent 766563f commit 0ac676f

File tree

9 files changed

+590
-18
lines changed

9 files changed

+590
-18
lines changed

src/ngx_http_lua_regex.c

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#define NGX_LUA_RE_MODE_DFA (1<<1)
3939
#define NGX_LUA_RE_MODE_JIT (1<<2)
4040
#define NGX_LUA_RE_MODE_DUPNAMES (1<<3)
41+
#define NGX_LUA_RE_NO_UTF8_CHECK (1<<4)
4142

4243
#define NGX_LUA_RE_DFA_MODE_WORKSPACE_COUNT (100)
4344

@@ -93,14 +94,14 @@ static void ngx_http_lua_re_collect_named_captures(lua_State *L,
9394
unsigned flags, ngx_str_t *subj);
9495

9596

96-
#define ngx_http_lua_regex_exec(re, e, s, start, captures, size) \
97-
pcre_exec(re, e, (const char *) (s)->data, (s)->len, start, 0, \
97+
#define ngx_http_lua_regex_exec(re, e, s, start, captures, size, opts) \
98+
pcre_exec(re, e, (const char *) (s)->data, (s)->len, start, opts, \
9899
captures, size)
99100

100101

101102
#define ngx_http_lua_regex_dfa_exec(re, e, s, start, captures, size, ws, \
102-
wscount) \
103-
pcre_dfa_exec(re, e, (const char *) (s)->data, (s)->len, start, 0, \
103+
wscount, opts) \
104+
pcre_dfa_exec(re, e, (const char *) (s)->data, (s)->len, start, opts, \
104105
captures, size, ws, wscount)
105106

106107

@@ -128,6 +129,7 @@ ngx_http_lua_ngx_re_match(lua_State *L)
128129
pcre_extra *sd = NULL;
129130
int name_entry_size, name_count;
130131
u_char *name_table;
132+
int exec_opts;
131133

132134
ngx_http_lua_regex_compile_t re_comp;
133135

@@ -429,14 +431,21 @@ ngx_http_lua_ngx_re_match(lua_State *L)
429431
}
430432
}
431433

434+
if (flags & NGX_LUA_RE_NO_UTF8_CHECK) {
435+
exec_opts = PCRE_NO_UTF8_CHECK;
436+
437+
} else {
438+
exec_opts = 0;
439+
}
440+
432441
if (flags & NGX_LUA_RE_MODE_DFA) {
433442

434443
#if LUA_HAVE_PCRE_DFA
435444

436445
int ws[NGX_LUA_RE_DFA_MODE_WORKSPACE_COUNT];
437446
rc = ngx_http_lua_regex_dfa_exec(re_comp.regex, sd, &subj,
438447
(int) pos, cap, ovecsize, ws,
439-
sizeof(ws)/sizeof(ws[0]));
448+
sizeof(ws)/sizeof(ws[0]), exec_opts);
440449

441450
#else /* LUA_HAVE_PCRE_DFA */
442451

@@ -447,7 +456,7 @@ ngx_http_lua_ngx_re_match(lua_State *L)
447456

448457
} else {
449458
rc = ngx_http_lua_regex_exec(re_comp.regex, sd, &subj, (int) pos, cap,
450-
ovecsize);
459+
ovecsize, exec_opts);
451460
}
452461

453462
if (rc == NGX_REGEX_NO_MATCHED) {
@@ -894,6 +903,7 @@ ngx_http_lua_ngx_re_gmatch_iterator(lua_State *L)
894903
const char *msg = NULL;
895904
int name_entry_size, name_count;
896905
u_char *name_table;
906+
int exec_opts;
897907

898908
/* upvalues in order: subj ctx offset */
899909

@@ -949,6 +959,13 @@ ngx_http_lua_ngx_re_gmatch_iterator(lua_State *L)
949959
}
950960
}
951961

962+
if (ctx->flags & NGX_LUA_RE_NO_UTF8_CHECK) {
963+
exec_opts = PCRE_NO_UTF8_CHECK;
964+
965+
} else {
966+
exec_opts = 0;
967+
}
968+
952969
if (ctx->flags & NGX_LUA_RE_MODE_DFA) {
953970

954971
#if LUA_HAVE_PCRE_DFA
@@ -957,7 +974,7 @@ ngx_http_lua_ngx_re_gmatch_iterator(lua_State *L)
957974

958975
rc = ngx_http_lua_regex_dfa_exec(ctx->regex, ctx->regex_sd, &subj,
959976
offset, cap, ctx->captures_len, ws,
960-
sizeof(ws)/sizeof(ws[0]));
977+
sizeof(ws)/sizeof(ws[0]), exec_opts);
961978

962979
#else /* LUA_HAVE_PCRE_DFA */
963980
msg = "at least pcre 6.0 is required for the DFA mode";
@@ -967,7 +984,8 @@ ngx_http_lua_ngx_re_gmatch_iterator(lua_State *L)
967984

968985
} else {
969986
rc = ngx_http_lua_regex_exec(ctx->regex, ctx->regex_sd, &subj,
970-
offset, cap, ctx->captures_len);
987+
offset, cap, ctx->captures_len,
988+
exec_opts);
971989
}
972990

973991
if (rc == NGX_REGEX_NO_MATCHED) {
@@ -1099,6 +1117,11 @@ ngx_http_lua_ngx_re_parse_opts(lua_State *L, ngx_http_lua_regex_compile_t *re,
10991117
re->options |= PCRE_UTF8;
11001118
break;
11011119

1120+
case 'U':
1121+
re->options |= PCRE_UTF8;
1122+
flags |= NGX_LUA_RE_NO_UTF8_CHECK;
1123+
break;
1124+
11021125
case 'x':
11031126
re->options |= PCRE_EXTENDED;
11041127
break;
@@ -1193,6 +1216,7 @@ ngx_http_lua_ngx_re_sub_helper(lua_State *L, unsigned global)
11931216
pcre_extra *sd = NULL;
11941217
int name_entry_size, name_count;
11951218
u_char *name_table;
1219+
int exec_opts;
11961220

11971221
ngx_http_lua_regex_compile_t re_comp;
11981222
ngx_http_lua_complex_value_t *ctpl = NULL;
@@ -1572,6 +1596,13 @@ ngx_http_lua_ngx_re_sub_helper(lua_State *L, unsigned global)
15721596
}
15731597
}
15741598

1599+
if (flags & NGX_LUA_RE_NO_UTF8_CHECK) {
1600+
exec_opts = PCRE_NO_UTF8_CHECK;
1601+
1602+
} else {
1603+
exec_opts = 0;
1604+
}
1605+
15751606
for (;;) {
15761607
if (flags & NGX_LUA_RE_MODE_DFA) {
15771608

@@ -1580,7 +1611,8 @@ ngx_http_lua_ngx_re_sub_helper(lua_State *L, unsigned global)
15801611
int ws[NGX_LUA_RE_DFA_MODE_WORKSPACE_COUNT];
15811612
rc = ngx_http_lua_regex_dfa_exec(re_comp.regex, sd, &subj,
15821613
offset, cap, ovecsize, ws,
1583-
sizeof(ws)/sizeof(ws[0]));
1614+
sizeof(ws)/sizeof(ws[0]),
1615+
exec_opts);
15841616

15851617
#else /* LUA_HAVE_PCRE_DFA */
15861618

@@ -1591,7 +1623,7 @@ ngx_http_lua_ngx_re_sub_helper(lua_State *L, unsigned global)
15911623

15921624
} else {
15931625
rc = ngx_http_lua_regex_exec(re_comp.regex, sd, &subj, offset, cap,
1594-
ovecsize);
1626+
ovecsize, exec_opts);
15951627
}
15961628

15971629
if (rc == NGX_REGEX_NO_MATCHED) {

t/034-match.t

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use Test::Nginx::Socket;
99

1010
repeat_each(2);
1111

12-
plan tests => repeat_each() * (blocks() * 2 + 10);
12+
plan tests => repeat_each() * (blocks() * 2 + 14);
1313

1414
#no_diff();
1515
no_long_string();
@@ -945,3 +945,71 @@ error: pcre_exec\(\) failed: -10 on "你.*?" using "你好"
945945
--- no_error_log
946946
[error]
947947
948+
949+
950+
=== TEST 43: UTF-8 mode without UTF-8 sequence checks
951+
--- config
952+
location /re {
953+
content_by_lua '
954+
local m = ngx.re.match("你好", ".", "U")
955+
if m then
956+
ngx.say(m[0])
957+
else
958+
ngx.say("not matched!")
959+
end
960+
';
961+
}
962+
--- stap
963+
probe process("$LIBPCRE_PATH").function("pcre_compile") {
964+
printf("compile opts: %x\n", $options)
965+
}
966+
967+
probe process("$LIBPCRE_PATH").function("pcre_exec") {
968+
printf("exec opts: %x\n", $options)
969+
}
970+
971+
--- stap_out
972+
compile opts: 800
973+
exec opts: 2000
974+
975+
--- request
976+
GET /re
977+
--- response_body
978+
979+
--- no_error_log
980+
[error]
981+
982+
983+
984+
=== TEST 44: UTF-8 mode with UTF-8 sequence checks
985+
--- config
986+
location /re {
987+
content_by_lua '
988+
local m = ngx.re.match("你好", ".", "u")
989+
if m then
990+
ngx.say(m[0])
991+
else
992+
ngx.say("not matched!")
993+
end
994+
';
995+
}
996+
--- stap
997+
probe process("$LIBPCRE_PATH").function("pcre_compile") {
998+
printf("compile opts: %x\n", $options)
999+
}
1000+
1001+
probe process("$LIBPCRE_PATH").function("pcre_exec") {
1002+
printf("exec opts: %x\n", $options)
1003+
}
1004+
1005+
--- stap_out
1006+
compile opts: 800
1007+
exec opts: 0
1008+
1009+
--- request
1010+
GET /re
1011+
--- response_body
1012+
1013+
--- no_error_log
1014+
[error]
1015+

t/035-gmatch.t

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use Test::Nginx::Socket;
99

1010
repeat_each(5);
1111

12-
plan tests => repeat_each() * (blocks() * 2 + 3);
12+
plan tests => repeat_each() * (blocks() * 2 + 7);
1313

1414
our $HtmlDir = html_dir;
1515

@@ -741,3 +741,73 @@ error: pcre_exec\(\) failed: -10 on "你.*?"
741741
--- no_error_log
742742
[error]
743743
744+
745+
746+
=== TEST 28: UTF-8 mode without UTF-8 sequence checks
747+
--- config
748+
location /re {
749+
content_by_lua '
750+
local it = ngx.re.gmatch("你好", ".", "U")
751+
local m = it()
752+
if m then
753+
ngx.say(m[0])
754+
else
755+
ngx.say("not matched!")
756+
end
757+
';
758+
}
759+
--- stap
760+
probe process("$LIBPCRE_PATH").function("pcre_compile") {
761+
printf("compile opts: %x\n", $options)
762+
}
763+
764+
probe process("$LIBPCRE_PATH").function("pcre_exec") {
765+
printf("exec opts: %x\n", $options)
766+
}
767+
768+
--- stap_out
769+
compile opts: 800
770+
exec opts: 2000
771+
772+
--- request
773+
GET /re
774+
--- response_body
775+
776+
--- no_error_log
777+
[error]
778+
779+
780+
781+
=== TEST 29: UTF-8 mode with UTF-8 sequence checks
782+
--- config
783+
location /re {
784+
content_by_lua '
785+
local it = ngx.re.gmatch("你好", ".", "u")
786+
local m = it()
787+
if m then
788+
ngx.say(m[0])
789+
else
790+
ngx.say("not matched!")
791+
end
792+
';
793+
}
794+
--- stap
795+
probe process("$LIBPCRE_PATH").function("pcre_compile") {
796+
printf("compile opts: %x\n", $options)
797+
}
798+
799+
probe process("$LIBPCRE_PATH").function("pcre_exec") {
800+
printf("exec opts: %x\n", $options)
801+
}
802+
803+
--- stap_out
804+
compile opts: 800
805+
exec opts: 0
806+
807+
--- request
808+
GET /re
809+
--- response_body
810+
811+
--- no_error_log
812+
[error]
813+

0 commit comments

Comments
 (0)