mirror of
https://github.com/vlang/v.git
synced 2025-09-11 00:20:26 -04:00
This commit is contained in:
parent
8c249366c4
commit
ab45d80fa5
@ -103,6 +103,9 @@ A meta-char can match different types of characters.
|
|||||||
- `\a` matches only a lowercase char `[a-z]`
|
- `\a` matches only a lowercase char `[a-z]`
|
||||||
- `\A` matches only an uppercase char `[A-Z]`
|
- `\A` matches only an uppercase char `[A-Z]`
|
||||||
|
|
||||||
|
- `\x41` matches a byte of value 0x41, `A` in ASCII code
|
||||||
|
- `\X414C` matches two consecutive bytes of value 0x414C, `AL` in ASCII code
|
||||||
|
|
||||||
### Quantifier
|
### Quantifier
|
||||||
|
|
||||||
Each token can have a quantifier, that specifies how many times the character
|
Each token can have a quantifier, that specifies how many times the character
|
||||||
|
@ -17,9 +17,7 @@ module regex
|
|||||||
import strings
|
import strings
|
||||||
|
|
||||||
pub const v_regex_version = '1.0 alpha' // regex module version
|
pub const v_regex_version = '1.0 alpha' // regex module version
|
||||||
|
|
||||||
pub const max_code_len = 256 // default small base code len for the regex programs
|
pub const max_code_len = 256 // default small base code len for the regex programs
|
||||||
|
|
||||||
pub const max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
|
pub const max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
|
||||||
|
|
||||||
// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
|
// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
|
||||||
@ -32,29 +30,17 @@ pub const no_match_found = -1
|
|||||||
|
|
||||||
// Errors
|
// Errors
|
||||||
pub const compile_ok = 0 // the regex string compiled, all ok
|
pub const compile_ok = 0 // the regex string compiled, all ok
|
||||||
|
|
||||||
pub const err_char_unknown = -2 // the char used is unknow to the system
|
pub const err_char_unknown = -2 // the char used is unknow to the system
|
||||||
|
|
||||||
pub const err_undefined = -3 // the compiler symbol is undefined
|
pub const err_undefined = -3 // the compiler symbol is undefined
|
||||||
|
|
||||||
pub const err_internal_error = -4 // Bug in the regex system!!
|
pub const err_internal_error = -4 // Bug in the regex system!!
|
||||||
|
|
||||||
pub const err_cc_alloc_overflow = -5 // memory for char class full!!
|
pub const err_cc_alloc_overflow = -5 // memory for char class full!!
|
||||||
|
|
||||||
pub const err_syntax_error = -6 // syntax error in regex compiling
|
pub const err_syntax_error = -6 // syntax error in regex compiling
|
||||||
|
|
||||||
pub const err_groups_overflow = -7 // max number of groups reached
|
pub const err_groups_overflow = -7 // max number of groups reached
|
||||||
|
|
||||||
pub const err_groups_max_nested = -8 // max number of nested group reached
|
pub const err_groups_max_nested = -8 // max number of nested group reached
|
||||||
|
|
||||||
pub const err_group_not_balanced = -9 // group not balanced
|
pub const err_group_not_balanced = -9 // group not balanced
|
||||||
|
|
||||||
pub const err_group_qm_notation = -10 // group invalid notation
|
pub const err_group_qm_notation = -10 // group invalid notation
|
||||||
|
|
||||||
pub const err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
|
pub const err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
|
||||||
|
|
||||||
pub const err_neg_group_quantifier = -12 // negation groups can not have quantifier
|
pub const err_neg_group_quantifier = -12 // negation groups can not have quantifier
|
||||||
|
|
||||||
pub const err_consecutive_dots = -13
|
pub const err_consecutive_dots = -13
|
||||||
|
|
||||||
//*************************************
|
//*************************************
|
||||||
@ -66,9 +52,7 @@ const ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit avail
|
|||||||
// AA = 00 regular class
|
// AA = 00 regular class
|
||||||
// AA = 01 Negated class ^ char
|
// AA = 01 Negated class ^ char
|
||||||
const ist_char_class = u32(0xD1000000) // MASK
|
const ist_char_class = u32(0xD1000000) // MASK
|
||||||
|
|
||||||
const ist_char_class_pos = u32(0xD0000000) // char class normal [abc]
|
const ist_char_class_pos = u32(0xD0000000) // char class normal [abc]
|
||||||
|
|
||||||
const ist_char_class_neg = u32(0xD1000000) // char class negate [^abc]
|
const ist_char_class_neg = u32(0xD1000000) // char class negate [^abc]
|
||||||
|
|
||||||
// dot char 10 0110 xx xxxxxxxx
|
// dot char 10 0110 xx xxxxxxxx
|
||||||
@ -82,7 +66,6 @@ const ist_or_branch = u32(0x91000000) // OR case
|
|||||||
|
|
||||||
// groups 10 010Y xx xxxxxxxx
|
// groups 10 010Y xx xxxxxxxx
|
||||||
const ist_group_start = u32(0x92000000) // group start (
|
const ist_group_start = u32(0x92000000) // group start (
|
||||||
|
|
||||||
const ist_group_end = u32(0x94000000) // group end )
|
const ist_group_end = u32(0x94000000) // group end )
|
||||||
|
|
||||||
// control instructions
|
// control instructions
|
||||||
@ -257,6 +240,7 @@ mut:
|
|||||||
// char
|
// char
|
||||||
ch rune // char of the token if any
|
ch rune // char of the token if any
|
||||||
ch_len u8 // char len
|
ch_len u8 // char len
|
||||||
|
flag u8 // flag for general usage
|
||||||
// Quantifiers / branch
|
// Quantifiers / branch
|
||||||
rep_min int // used also for jump next in the OR branch [no match] pc jump
|
rep_min int // used also for jump next in the OR branch [no match] pc jump
|
||||||
rep_max int // used also for jump next in the OR branch [ match] pc jump
|
rep_max int // used also for jump next in the OR branch [ match] pc jump
|
||||||
@ -294,13 +278,9 @@ fn (mut tok Token) reset() {
|
|||||||
*
|
*
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
pub const f_nl = 0x00000001 // end the match when find a new line symbol
|
pub const f_nl = 0x00000001 // end the match when find a new line symbol
|
||||||
|
|
||||||
pub const f_ms = 0x00000002 // match true only if the match is at the start of the string
|
pub const f_ms = 0x00000002 // match true only if the match is at the start of the string
|
||||||
|
|
||||||
pub const f_me = 0x00000004 // match true only if the match is at the end of the string
|
pub const f_me = 0x00000004 // match true only if the match is at the end of the string
|
||||||
|
|
||||||
pub const f_efm = 0x00000100 // exit on first token matched, used by search
|
pub const f_efm = 0x00000100 // exit on first token matched, used by search
|
||||||
|
|
||||||
pub const f_bin = 0x00000200 // work only on bytes, ignore utf-8
|
pub const f_bin = 0x00000200 // work only on bytes, ignore utf-8
|
||||||
|
|
||||||
// behaviour modifier flags
|
// behaviour modifier flags
|
||||||
@ -419,17 +399,22 @@ enum BSLS_parse_state {
|
|||||||
bsls_found
|
bsls_found
|
||||||
bsls_char
|
bsls_char
|
||||||
normal_char
|
normal_char
|
||||||
|
hex_char
|
||||||
}
|
}
|
||||||
|
|
||||||
// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present
|
// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present
|
||||||
fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) {
|
fn (re RE) parse_bsls(in_txt string, in_i int) (int, int, u32) {
|
||||||
mut status := BSLS_parse_state.start
|
mut status := BSLS_parse_state.start
|
||||||
mut i := in_i
|
mut i := in_i
|
||||||
|
mut hex_max_len := 2
|
||||||
|
mut hex_res := u32(0)
|
||||||
|
mut hex_count := 0
|
||||||
|
|
||||||
for i < in_txt.len {
|
for i < in_txt.len {
|
||||||
// get our char
|
// get our char
|
||||||
char_tmp, char_len := re.get_char(in_txt, i)
|
char_tmp, char_len := re.get_char(in_txt, i)
|
||||||
ch := u8(char_tmp)
|
ch := u8(char_tmp)
|
||||||
|
// println("ch [${ch:c}]")
|
||||||
|
|
||||||
if status == .start && ch == `\\` {
|
if status == .start && ch == `\\` {
|
||||||
status = .bsls_found
|
status = .bsls_found
|
||||||
@ -441,26 +426,86 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) {
|
|||||||
if status == .bsls_found {
|
if status == .bsls_found {
|
||||||
for c, x in regex.bsls_validator_array {
|
for c, x in regex.bsls_validator_array {
|
||||||
if x.ch == ch {
|
if x.ch == ch {
|
||||||
return c, i - in_i + 1
|
return c, i - in_i + 1, hex_res
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// check for \x00 hex 8bit
|
||||||
|
if ch == `x` {
|
||||||
|
status = .hex_char
|
||||||
|
hex_max_len = 2
|
||||||
|
i += char_len
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// check for \X0000 hex 16bit
|
||||||
|
if ch == `X` {
|
||||||
|
status = .hex_char
|
||||||
|
hex_max_len = 4
|
||||||
|
i += char_len
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
status = .normal_char
|
status = .normal_char
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// manage hex byte
|
||||||
|
if status == .hex_char {
|
||||||
|
if ch >= `0` && ch <= `9` {
|
||||||
|
hex_count++
|
||||||
|
hex_res <<= 4
|
||||||
|
hex_res += u32(ch - `0`)
|
||||||
|
i += char_len
|
||||||
|
} else if ch >= `A` && ch <= `F` {
|
||||||
|
hex_count++
|
||||||
|
hex_res <<= 4
|
||||||
|
hex_res += u32(ch - `A` + 10)
|
||||||
|
i += char_len
|
||||||
|
} else if ch >= `a` && ch <= `f` {
|
||||||
|
hex_count++
|
||||||
|
hex_res <<= 4
|
||||||
|
hex_res += u32(ch - `a` + 10)
|
||||||
|
i += char_len
|
||||||
|
} else {
|
||||||
|
return regex.err_syntax_error, i, hex_res
|
||||||
|
}
|
||||||
|
|
||||||
|
// println("hex_res: ${hex_res:08x} hex_count: ${hex_count}")
|
||||||
|
|
||||||
|
// look for more hex digits
|
||||||
|
if hex_count < hex_max_len {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// more hex digits than hex_max_len allows, error
|
||||||
|
if hex_count > hex_max_len {
|
||||||
|
return regex.err_syntax_error, i - in_i, hex_res
|
||||||
|
}
|
||||||
|
|
||||||
|
if hex_count == hex_max_len {
|
||||||
|
// we have a good result
|
||||||
|
// println("RESULT hex_res: ${hex_res:08x} hex_count: ${hex_count}")
|
||||||
|
return -2, i - in_i, hex_res
|
||||||
|
}
|
||||||
|
|
||||||
|
// MUST NOT BE HERE!
|
||||||
|
return regex.err_syntax_error, i, hex_res
|
||||||
|
}
|
||||||
|
|
||||||
// no BSLS validator, manage as normal escape char char
|
// no BSLS validator, manage as normal escape char char
|
||||||
if status == .normal_char {
|
if status == .normal_char {
|
||||||
if ch in regex.bsls_escape_list {
|
if ch in regex.bsls_escape_list {
|
||||||
return regex.no_match_found, i - in_i + 1
|
return regex.no_match_found, i - in_i + 1, hex_res
|
||||||
}
|
}
|
||||||
return regex.err_syntax_error, i - in_i + 1
|
return regex.err_syntax_error, i - in_i + 1, hex_res
|
||||||
}
|
}
|
||||||
|
|
||||||
// at the present time we manage only one char after the \
|
// at the present time we manage only one char after the \
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
// not our bsls return KO
|
// not our bsls return KO
|
||||||
return regex.err_syntax_error, i
|
return regex.err_syntax_error, i, hex_res
|
||||||
}
|
}
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
@ -1200,8 +1245,10 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
|||||||
// ist_bsls_char
|
// ist_bsls_char
|
||||||
if char_len == 1 && pc >= 0 {
|
if char_len == 1 && pc >= 0 {
|
||||||
if u8(char_tmp) == `\\` {
|
if u8(char_tmp) == `\\` {
|
||||||
bsls_index, tmp := re.parse_bsls(in_txt, i)
|
// if the index is negative:
|
||||||
// println("index: $bsls_index str:${in_txt[i..i+tmp]}")
|
// -1 (no_match_found) plain escaped char, other negative values are errors
|
||||||
|
// -2 hex byte code BSLS
|
||||||
|
bsls_index, tmp, hex_res := re.parse_bsls(in_txt, i)
|
||||||
if bsls_index >= 0 {
|
if bsls_index >= 0 {
|
||||||
i = i + tmp
|
i = i + tmp
|
||||||
re.prog[pc].ist = u32(0) | regex.ist_bsls_char
|
re.prog[pc].ist = u32(0) | regex.ist_bsls_char
|
||||||
@ -1212,6 +1259,34 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
|||||||
pc = pc + 1
|
pc = pc + 1
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// hex char
|
||||||
|
// this code can manage hex values up to 32 bit
|
||||||
|
// at the present time only 8/16 bit are used
|
||||||
|
else if bsls_index == -2 {
|
||||||
|
mut value := hex_res
|
||||||
|
mut value_list := []u32{cap: 4}
|
||||||
|
mut count := 0
|
||||||
|
for value > 0 {
|
||||||
|
value_list << value & 0xFF
|
||||||
|
value = value >> 8
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
|
||||||
|
count--
|
||||||
|
for count >= 0 {
|
||||||
|
re.prog[pc].ist = regex.ist_simple_char
|
||||||
|
re.prog[pc].ch = value_list[count]
|
||||||
|
re.prog[pc].ch_len = u8(char_len)
|
||||||
|
re.prog[pc].rep_min = 1
|
||||||
|
re.prog[pc].rep_max = 1
|
||||||
|
re.prog[pc].flag = 1 // state a byte char
|
||||||
|
// println("char: ${char_tmp:c}")
|
||||||
|
pc = pc + 1
|
||||||
|
count--
|
||||||
|
}
|
||||||
|
i = i + tmp
|
||||||
|
continue
|
||||||
|
}
|
||||||
// this is an escape char, skip the bsls and continue as a normal char
|
// this is an escape char, skip the bsls and continue as a normal char
|
||||||
else if bsls_index == regex.no_match_found {
|
else if bsls_index == regex.no_match_found {
|
||||||
i += char_len
|
i += char_len
|
||||||
@ -1514,7 +1589,11 @@ pub fn (re RE) get_code() string {
|
|||||||
} else if ist == regex.ist_group_end {
|
} else if ist == regex.ist_group_end {
|
||||||
res.write_string(') GROUP_END #:${tk.group_id}')
|
res.write_string(') GROUP_END #:${tk.group_id}')
|
||||||
} else if ist == regex.ist_simple_char {
|
} else if ist == regex.ist_simple_char {
|
||||||
res.write_string('[${tk.ch:1c}] query_ch')
|
if tk.flag == 0 {
|
||||||
|
res.write_string('[${tk.ch:1c}] query_ch')
|
||||||
|
} else {
|
||||||
|
res.write_string('[0x${tk.ch:02X}]HEXquery_ch')
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if tk.rep_max == regex.max_quantifier {
|
if tk.rep_max == regex.max_quantifier {
|
||||||
@ -1615,10 +1694,14 @@ pub fn (re RE) get_query() string {
|
|||||||
|
|
||||||
// char alone
|
// char alone
|
||||||
if ch == regex.ist_simple_char {
|
if ch == regex.ist_simple_char {
|
||||||
if u8(ch) in regex.bsls_escape_list {
|
if tk.flag == 0 {
|
||||||
res.write_string('\\')
|
if u8(ch) in regex.bsls_escape_list {
|
||||||
|
res.write_string('\\')
|
||||||
|
}
|
||||||
|
res.write_string('${tk.ch:c}')
|
||||||
|
} else {
|
||||||
|
res.write_string('\\x${tk.ch:02x}')
|
||||||
}
|
}
|
||||||
res.write_string('${tk.ch:c}')
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// quantifier
|
// quantifier
|
||||||
@ -1799,7 +1882,11 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
|
|||||||
buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")
|
buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")
|
||||||
|
|
||||||
if ist == regex.ist_simple_char {
|
if ist == regex.ist_simple_char {
|
||||||
buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
|
if re.prog[state.pc].flag == 0 {
|
||||||
|
buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
|
||||||
|
} else {
|
||||||
|
buf2.write_string('query_ch: [0x${re.prog[state.pc].ch:02X}]')
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
if ist == regex.ist_bsls_char {
|
if ist == regex.ist_bsls_char {
|
||||||
buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
|
buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
|
||||||
|
@ -193,6 +193,18 @@ match_test_suite = [
|
|||||||
// test has `\0` chars
|
// test has `\0` chars
|
||||||
TestItem{"abcxyz", "^abc\0xyz$", -1,3},
|
TestItem{"abcxyz", "^abc\0xyz$", -1,3},
|
||||||
TestItem{"abc\0xyz", "^abc\0xyz$", 0,7},
|
TestItem{"abc\0xyz", "^abc\0xyz$", 0,7},
|
||||||
|
|
||||||
|
// test hex byte chars
|
||||||
|
TestItem{"abc_xyz", r"abc\x5Fxyz", 0,7},
|
||||||
|
TestItem{"abc_xyz", r"^abc\x5fxyz$", 0,7},
|
||||||
|
TestItem{"abcAxyz", r"^abc\x41xyz$", 0,7},
|
||||||
|
TestItem{"abcAAxyz", r"^abc\x41+xyz$", 0,8},
|
||||||
|
TestItem{"abcALxyz", r"^abc\x41\x4Cxyz$", 0,8},
|
||||||
|
TestItem{"abcAAxyz", r"^abc\X4141xyz$", 0,8},
|
||||||
|
TestItem{"abcALxyz", r"^abc\X414cxyz$", 0,8},
|
||||||
|
TestItem{"abcALxyz", r"^abc\X414Cxyz$", 0,8},
|
||||||
|
TestItem{"abcBxyz", r"^abc\x41+xyz$", -1,3},
|
||||||
|
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user