breaking,time: rewrite parse_rfc3339/1 to improve performance; reject partial timestamps that lack date info, like 22:47:08Z (#22585)

This commit is contained in:
Hitalo Souza 2024-10-21 04:24:37 -04:00 committed by GitHub
parent de46d9d395
commit c55a75f412
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 357 additions and 67 deletions

View File

@ -5,6 +5,127 @@ module time
import strconv
const date_format_buffer = [u8(`0`), `0`, `0`, `0`, `-`, `0`, `0`, `-`, `0`, `0`]!
const time_format_buffer = [u8(`0`), `0`, `:`, `0`, `0`, `:`, `0`, `0`]!
// validate_time_bounds returns an error when one of the given time components
// is outside its valid range: hour 0..23, minute 0..59, second 0..59 and
// nanosecond 0..999_999_999. It returns nothing when all of them are valid.
fn validate_time_bounds(hour int, minute int, second int, nanosecond int) ! {
	if hour < 0 || hour > 23 {
		return error('invalid hour: ${hour}')
	}
	if minute < 0 || minute > 59 {
		return error('invalid minute: ${minute}')
	}
	if second < 0 || second > 59 {
		return error('invalid second: ${second}')
	}
	// 1_000_000_000 ns is already a whole second, so the largest valid
	// fractional part is 999_999_999 ns.
	if nanosecond < 0 || nanosecond > 999_999_999 {
		return error('invalid nanosecond: ${nanosecond}')
	}
}
// check_and_extract_time parses the time part of an RFC 3339 timestamp:
// `HH:MM:SS`, optionally followed by `.` and fractional second digits, and
// then by a timezone designator. It returns hour, minute, second, nanosecond.
// A `Z`/`z` designator is checked to be the last character of `s`; the
// content of a `+`/`-` offset is not inspected here. Fraction digits after
// the 9th are ignored, and shorter fractions are scaled up to nanoseconds.
// It returns an error when `s` is too short, when a digit or a separator is
// not where the format expects it, or when a component is out of range.
fn check_and_extract_time(s string) !(int, int, int, int) {
	// The loop below reads s[0] .. s[7] unconditionally.
	if s.len < time_format_buffer.len {
		return error('datetime string is too short')
	}
	mut hour_ := 0
	mut minute_ := 0
	mut second_ := 0
	mut nanosecond_ := 0
	// Check that the string starts with the format "HH:MM:SS"
	for i := 0; i < time_format_buffer.len; i++ {
		if time_format_buffer[i] == u8(`0`) {
			// a byte is a digit only when it is inside `0`..`9`
			if s[i] < u8(`0`) || s[i] > u8(`9`) {
				return error('`HH:MM:SS` match error: expected digit, not `${[s[i]].bytestr()}` in position ${i}')
			}
			if i < 2 {
				hour_ = hour_ * 10 + (s[i] - u8(`0`))
			} else if i < 5 {
				minute_ = minute_ * 10 + (s[i] - u8(`0`))
			} else {
				second_ = second_ * 10 + (s[i] - u8(`0`))
			}
		} else if time_format_buffer[i] != s[i] {
			return error('time separator error: expected `:`, not `${[s[i]].bytestr()}` in position ${i}')
		}
	}
	// Exactly one character after "HH:MM:SS": it can only be the UTC designator
	if s.len == time_format_buffer.len + 1 {
		if s[time_format_buffer.len] !in [u8(`Z`), `z`] {
			return error('timezone error: expected "Z" or "z" at the end of the string')
		}
		validate_time_bounds(hour_, minute_, second_, nanosecond_)!
		return hour_, minute_, second_, nanosecond_
	}
	// Nothing after "HH:MM:SS": the mandatory timezone part is missing
	if s.len < time_format_buffer.len + 1 {
		return error('datetime string is too short')
	}
	if s[time_format_buffer.len] == u8(`.`) {
		// Read the fractional seconds, in the format ".NNNNNNNNN"
		mut nanosecond_digits := 0
		for i := time_format_buffer.len + 1; i < s.len; i++ {
			if s[i] < u8(`0`) || s[i] > u8(`9`) {
				if s[i] in [u8(`Z`), `z`] {
					if i != s.len - 1 {
						return error('timezone error: "Z" or "z" can only be at the end of the string')
					}
					break
				} else if s[i] in [u8(`+`), `-`] {
					// start of a numeric offset, the fraction ends here
					break
				}
				return error('nanoseconds error: expected digit, not `${[s[i]].bytestr()}` in position ${i}')
			}
			// nanoseconds limit is 9 digits, the following ones are skipped
			if i < time_format_buffer.len + 1 + 9 {
				nanosecond_ = nanosecond_ * 10 + (s[i] - u8(`0`))
				nanosecond_digits++
			}
		}
		// Scale a shorter fraction, so that for example ".5" becomes 500_000_000 ns
		for _ in nanosecond_digits .. 9 {
			nanosecond_ *= 10
		}
	}
	validate_time_bounds(hour_, minute_, second_, nanosecond_)!
	return hour_, minute_, second_, nanosecond_
}
// check_and_extract_date parses the `YYYY-MM-DD` prefix of `s` and returns
// the year, month and day. Anything after the first 10 bytes is ignored.
// It returns an error when `s` is shorter than 10 bytes, when a digit or a
// `-` separator is not where the format expects it, when the month is not
// in 1..12, or when the day is not in 1..31. The day is not checked against
// the length of the given month.
fn check_and_extract_date(s string) !(int, int, int) {
	// The loop below reads s[0] .. s[9] unconditionally.
	if s.len < date_format_buffer.len {
		return error('datetime string is too short')
	}
	mut year := 0
	mut month := 0
	mut day := 0
	// Check that the string starts with the format "YYYY-MM-DD"
	for i := 0; i < date_format_buffer.len; i++ {
		if date_format_buffer[i] == u8(`0`) {
			// a byte is a digit only when it is inside `0`..`9`
			if s[i] < u8(`0`) || s[i] > u8(`9`) {
				return error('`YYYY-MM-DD` match error: expected digit, not `${[s[i]].bytestr()}` in position ${i}')
			}
			if i < 4 {
				year = year * 10 + (s[i] - u8(`0`))
			} else if i < 7 {
				month = month * 10 + (s[i] - u8(`0`))
			} else {
				day = day * 10 + (s[i] - u8(`0`))
			}
		} else if date_format_buffer[i] != s[i] {
			return error('date separator error: expected "${[date_format_buffer[i]].bytestr()}", not `${[s[i]].bytestr()}` in position ${i}')
		}
	}
	if month < 1 || month > 12 {
		return error('date error: invalid month ${month}')
	}
	if day < 1 || day > 31 {
		return error('date error: invalid day ${day}')
	}
	return year, month, day
}
// parse_rfc3339 returns the time from a date string in RFC 3339 datetime format.
// See also https://ijmacd.github.io/rfc3339-iso8601/ for a visual reference of
// the differences between ISO-8601 and RFC 3339.
@ -12,48 +133,152 @@ pub fn parse_rfc3339(s string) !Time {
if s == '' {
return error_invalid_time(0, 'datetime string is empty')
}
// Normalize the input before parsing. Good since iso8601 doesn't permit lower case `t` and `z`.
sn := s.replace_each(['t', 'T', 'z', 'Z'])
mut t := parse_iso8601(sn) or { Time{} }
// If parse_iso8601 DID NOT result in default values (i.e. date was parsed correctly)
if t != Time{} {
return t
if s.len < time_format_buffer.len {
return error('string is too short to parse')
}
t_i := sn.index('T') or { -1 }
parts := if t_i != -1 { [sn[..t_i], sn[t_i + 1..]] } else { sn.split(' ') }
mut year, mut month, mut day := 0, 0, 0
mut hour_, mut minute_, mut second_, mut nanosecond_ := 0, 0, 0, 0
// Check if sn is date only
if !parts[0].contains_any(' Z') && parts[0].contains('-') {
year, month, day := parse_iso8601_date(sn)!
t = new(Time{
year: year
month: month
day: day
})
return t
is_time := if s.len >= time_format_buffer.len {
s[2] == u8(`:`) && s[5] == u8(`:`)
} else {
false
}
// Check if sn is time only
if !parts[0].contains('-') && parts[0].contains(':') {
mut hour_, mut minute_, mut second_, mut microsecond_, mut nanosecond_, mut unix_offset, mut is_local_time := 0, 0, 0, 0, 0, i64(0), true
hour_, minute_, second_, microsecond_, nanosecond_, unix_offset, is_local_time = parse_iso8601_time(parts[0])!
t = new(Time{
hour: hour_
minute: minute_
second: second_
nanosecond: nanosecond_
})
if is_local_time {
return t // Time is already local time
if is_time {
return error('missing date part of RFC 3339')
}
is_date := if s.len >= date_format_buffer.len {
s[4] == u8(`-`) && s[7] == u8(`-`)
} else {
false
}
if is_date {
year, month, day = check_and_extract_date(s)!
if s.len == date_format_buffer.len {
return new(Time{
year: year
month: month
day: day
is_local: false
})
}
mut unix_time := t.unix
if unix_offset < 0 {
unix_time -= (-unix_offset)
} else if unix_offset > 0 {
unix_time += unix_offset
}
is_datetime := if s.len >= date_format_buffer.len + 1 + time_format_buffer.len + 1 {
is_date && s[10] == u8(`T`)
} else {
false
}
if is_datetime {
// year, month, day := check_and_extract_date(s)!
hour_, minute_, second_, nanosecond_ = check_and_extract_time(s[date_format_buffer.len + 1..])!
}
// Find where the timezone designator starts. Right after "HH:MM:SS" there is
// either the designator itself, or a `.` followed by the fractional seconds.
mut timezone_start_position := 0
if is_datetime || is_time {
	timezone_start_position = date_format_buffer.len + 1 + time_format_buffer.len
	if s[timezone_start_position] == u8(`.`) {
		timezone_start_position++
		// Skip the fraction. The bound is checked before every read, so that a
		// string that ends without a timezone is reported, not read past its end.
		for timezone_start_position < s.len
			&& s[timezone_start_position] !in [u8(`Z`), `z`, `+`, `-`] {
			timezone_start_position++
		}
		if timezone_start_position == s.len {
			return error('timezone error: expected "Z" or "z" or "+" or "-" in position ${timezone_start_position}, but the string ended')
		}
	}
}
pos := date_format_buffer.len + time_format_buffer.len + 1
if pos >= s.len {
return error('timezone error: datetime string is too short')
}
if s[date_format_buffer.len + time_format_buffer.len + 1] !in [u8(`Z`), `z`, `+`, `-`, `.`] {
// RFC 3339 needs a timezone
return error('timezone error: expected "Z" or "z" or "+" or "-" in position ${
date_format_buffer.len + time_format_buffer.len + 1}, not "${[
s[date_format_buffer.len + time_format_buffer.len + 1],
].bytestr()}"')
} else {
if s[s.len - 1] in [u8(`Z`), `z`] {
return new(Time{
year: year
month: month
day: day
hour: hour_
minute: minute_
second: second_
nanosecond: nanosecond_
is_local: false
})
} else {
// Check if the string contains the timezone part after the time part +00:00
if s.len < date_format_buffer.len + 1 + time_format_buffer.len + 6 {
return error('datetime string is too short')
}
if s[s.len - 3] != u8(`:`) {
return error('timezone separator error: expected ":", not `${[
s[date_format_buffer.len + time_format_buffer.len + 3],
].bytestr()}` in position ${date_format_buffer.len + time_format_buffer.len + 3}')
}
// Check if it is UTC time
if unsafe { vmemcmp(s.str + s.len - 5, '00:00'.str, 5) == 0 } {
return new(Time{
year: year
month: month
day: day
hour: hour_
minute: minute_
second: second_
nanosecond: nanosecond_
is_local: false
})
}
// The string ends with the numeric offset "+HH:MM" or "-HH:MM"
is_negative := s[s.len - 6] == u8(`-`)
// To local time using the offset to add_seconds
mut offset_in_minutes := 0
mut offset_in_hours := 0
// offset hours: accumulate both digits, so that "+10:00" is 10 hours, not 0
for i := 0; i < 2; i++ {
	offset_in_hours = offset_in_hours * 10 + (s[s.len - 5 + i] - u8(`0`))
}
// offset minutes
for i := 0; i < 2; i++ {
	offset_in_minutes = offset_in_minutes * 10 + (s[s.len - 2 + i] - u8(`0`))
}
offset_in_minutes += offset_in_hours * 60
// A positive offset means that the given time is ahead of UTC, so it has to
// be subtracted to get the UTC time; a negative offset has to be added.
if !is_negative {
	offset_in_minutes *= -1
}
mut time_to_be_returned := new(Time{
year: year
month: month
day: day
hour: hour_
minute: minute_
second: second_
nanosecond: nanosecond_
is_local: false
})
time_to_be_returned = time_to_be_returned.add_seconds(offset_in_minutes * 60)
return time_to_be_returned
}
t = unix_nanosecond(i64(unix_time), t.nanosecond)
return t
}
return error_invalid_time(9, 'malformed date')
@ -310,6 +535,6 @@ fn parse_iso8601_time(s string) !(int, int, int, int, int, i64, bool) {
if plus_min_z == `+` {
unix_offset *= -1
}
// eprintln('parse_iso8601_time s: $s | hour_: $hour_ | minute_: $minute_ | second_: $second_ | microsecond_: $microsecond_ | nanosecond_: $nanosecond_ | unix_offset: $unix_offset | is_local_time: $is_local_time')
// eprintln('parse_iso8601_time s: $s | hour_: $hour_ | minute_: $minute_ | second_: $second_ | microsecond_: $microsecond_ | nanosecond_: $nanosecond_ | unix_offset: $unix_offset | is_local: $is_local_time')
return hour_, minute_, second_, microsecond_, nanosecond_, unix_offset, is_local_time
}

View File

@ -3,8 +3,7 @@ import time
fn test_parse() {
s := '2018-01-27 12:48:34'
t := time.parse(s) or {
eprintln('> failing format: ${s} | err: ${err}')
assert false
assert false, '> failing format: ${s} | err: ${err}'
return
}
assert t.year == 2018 && t.month == 1 && t.day == 27 && t.hour == 12 && t.minute == 48
@ -27,8 +26,7 @@ fn test_parse_invalid() {
fn test_parse_rfc2822() {
s1 := 'Thu, 12 Dec 2019 06:07:45 GMT'
t1 := time.parse_rfc2822(s1) or {
eprintln('> failing format: ${s1} | err: ${err}')
assert false
assert false, '> failing format: ${s1} | err: ${err}'
return
}
assert t1.year == 2019 && t1.month == 12 && t1.day == 12 && t1.hour == 6 && t1.minute == 7
@ -36,8 +34,7 @@ fn test_parse_rfc2822() {
assert t1.unix() == 1576130865
s2 := 'Thu 12 Dec 2019 06:07:45 +0800'
t2 := time.parse_rfc2822(s2) or {
eprintln('> failing format: ${s2} | err: ${err}')
assert false
assert false, '> failing format: ${s2} | err: ${err}'
return
}
assert t2.year == 2019 && t2.month == 12 && t2.day == 12 && t2.hour == 6 && t2.minute == 7
@ -73,8 +70,7 @@ fn test_parse_iso8601() {
]
for i, format in formats {
t := time.parse_iso8601(format) or {
eprintln('>>> failing format: ${format} | err: ${err}')
assert false
assert false, '>>> failing format: ${format} | err: ${err}'
continue
}
year := times[i][0]
@ -97,8 +93,7 @@ fn test_parse_iso8601() {
fn test_parse_iso8601_local() {
format := '2020-06-05T15:38:06.015959'
t := time.parse_iso8601(format) or {
eprintln('> failing format: ${format} | err: ${err}')
assert false
assert false, '> failing format: ${format} | err: ${err}'
return
}
assert t.year == 2020
@ -135,8 +130,7 @@ fn test_parse_iso8601_invalid() {
fn test_parse_iso8601_date_only() {
format := '2020-06-05'
t := time.parse_iso8601(format) or {
eprintln('> failing format: ${format} | err: ${err}')
assert false
assert false, '> failing format: ${format} | err: ${err}'
return
}
assert t.year == 2020
@ -150,12 +144,21 @@ fn test_parse_iso8601_date_only() {
fn check_invalid_date(s string) {
if date := time.parse(s) {
eprintln('invalid date: "${s}" => "${date}"')
assert false
assert false, 'invalid date: "${s}" => "${date}"'
}
assert true
}
// invalid_rfc3339 expects `s` to be rejected by time.parse_rfc3339, and
// returns the text of the resulting error. When `s` is parsed successfully,
// it fails an assertion that shows both the input and the parsed time.
fn invalid_rfc3339(s string) string {
	parsed := time.parse_rfc3339(s) or {
		assert true
		return err.str()
	}
	assert false, 'invalid date: "${s}" => "${parsed}"'
	return ''
}
fn test_invalid_dates_should_error_during_parse() {
check_invalid_date('-99999-12-20 00:00:00')
check_invalid_date('99999-12-20 00:00:00')
@ -175,17 +178,48 @@ fn test_parse_rfc3339() {
pairs := [
['2015-01-06T15:47:32.080254511Z', '2015-01-06 15:47:32.080254'],
['2015-01-06T15:47:32.072697474Z', '2015-01-06 15:47:32.072697'],
['2015-01-06T15:47:32.1234Z', '2015-01-06 15:47:32.123400'],
['2015-01-06T15:47:32.001234Z', '2015-01-06 15:47:32.001234'],
['2015-01-06T15:47:32Z', '2015-01-06 15:47:32.000000'],
['2015-01-06T15:47:32+00:00', '2015-01-06 15:47:32.000000'],
['2015-01-06T15:47:32-00:00', '2015-01-06 15:47:32.000000'],
['2015-01-06T15:47:32-01:00', '2015-01-06 16:47:32.000000'],
['2015-01-06T15:47:32+01:00', '2015-01-06 14:47:32.000000'],
['2015-01-06T15:47:32-01:10', '2015-01-06 16:57:32.000000'],
['2015-01-06T15:47:32+01:10', '2015-01-06 14:37:32.000000'],
['2015-01-06T15:47:32.1234-00:00', '2015-01-06 15:47:32.123400'],
['2015-01-06T15:47:32.1234+01:00', '2015-01-06 14:47:32.123400'],
['2015-01-06T15:47:32.1234-01:00', '2015-01-06 16:47:32.123400'],
['2015-01-06T22:59:59-00:10', '2015-01-06 23:09:59.000000'],
['1979-05-27T07:32:00-08:00', '1979-05-27 15:32:00.000000'],
['2024-10-19T22:47:08-00:00', '2024-10-19 22:47:08.000000'],
['2024-10-19T22:47:08.9+00:00', '2024-10-19 22:47:08.900000'],
['2024-10-20T01:47:08+03:00', '2024-10-19 22:47:08.000000'],
['2024-10-20T01:47:08.981+03:00', '2024-10-19 22:47:08.981000'],
]
for pair in pairs {
input, expected := pair[0], pair[1]
res := time.parse_rfc3339(input) or {
eprintln('>>> failing input: ${input} | err: ${err}')
assert false
assert false, '>>> failing input: ${input} | err: ${err}'
return
}
output := res.format_ss_micro()
assert expected == output
}
assert invalid_rfc3339('22:47:08Z') == 'missing date part of RFC 3339'
assert invalid_rfc3339('01:47:08.981+03:00') == 'missing date part of RFC 3339'
assert invalid_rfc3339('2006-01-00') == 'date error: invalid day 0'
assert invalid_rfc3339('2006-01-32') == 'date error: invalid day 32'
assert invalid_rfc3339('2006-01-88') == 'date error: invalid day 88'
assert invalid_rfc3339('2006-00-01') == 'date error: invalid month 0'
assert invalid_rfc3339('2006-13-01') == 'date error: invalid month 13'
assert invalid_rfc3339('2006-77-01') == 'date error: invalid month 77'
assert invalid_rfc3339('2006-01-01T24:47:08Z') == 'invalid hour: 24'
assert invalid_rfc3339('2006-01-01T99:47:08Z') == 'invalid hour: 99'
assert invalid_rfc3339('2006-01-01T23:60:08Z') == 'invalid minute: 60'
assert invalid_rfc3339('2006-01-01T23:99:08Z') == 'invalid minute: 99'
assert invalid_rfc3339('2006-01-01T23:59:60Z') == 'invalid second: 60'
assert invalid_rfc3339('2006-01-01T23:59:99Z') == 'invalid second: 99'
}
fn test_ad_second_to_parse_result_in_2001() {
@ -205,8 +239,7 @@ fn test_ad_second_to_parse_result_pre_2001() {
fn test_parse_format() {
mut s := '2018-01-27 12:48:34'
mut t := time.parse_format(s, 'YYYY-MM-DD HH:mm:ss') or {
eprintln('> failing format: ${s} | err: ${err}')
assert false
assert false, '> failing format: ${s} | err: ${err}'
return
}
assert t.year == 2018 && t.month == 1 && t.day == 27 && t.hour == 12 && t.minute == 48
@ -214,8 +247,7 @@ fn test_parse_format() {
s = '2018-November-27 12:48:20'
t = time.parse_format(s, 'YYYY-MMMM-DD HH:mm:ss') or {
eprintln('> failing format: ${s} | err: ${err}')
assert false
assert false, '> failing format: ${s} | err: ${err}'
return
}
assert t.year == 2018 && t.month == 11 && t.day == 27 && t.hour == 12 && t.minute == 48
@ -223,8 +255,7 @@ fn test_parse_format() {
s = '18-1-2 0:8:2'
t = time.parse_format(s, 'YY-M-D H:m:s') or {
eprintln('> failing format: ${s} | err: ${err}')
assert false
assert false, '> failing format: ${s} | err: ${err}'
return
}
assert t.year == 2018 && t.month == 1 && t.day == 2 && t.hour == 0 && t.minute == 8
@ -233,6 +264,6 @@ fn test_parse_format() {
// This should always fail, because we test if M and D allow for a 01 value which they shouldn't
s = '2018-01-02 1:8:2'
t = time.parse_format(s, 'YYYY-M-D H:m:s') or { return }
eprintln('> failing for datetime: ${s}, the datetime string should not have passed the format "YYYY-M-D H:m:s"')
assert false
assert false, '> failing for datetime: ${s}, the datetime string should not have passed the format "YYYY-M-D H:m:s"'
}

View File

@ -17,6 +17,14 @@ pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`,
// utf8_max is the largest inclusive value of the Unicodes scalar value ranges.
const utf8_max = 0x10FFFF
// toml_parse_time parses `s` with time.parse_rfc3339. A string that is longer
// than 3 bytes and has `:` as its third byte is treated as a time without a
// date; it is prefixed with the arbitrary date `0001-01-01` before parsing,
// since time.parse_rfc3339 is given a full datetime in that case.
fn toml_parse_time(s string) !time.Time {
	is_time_only := s.len > 3 && s[2] == `:`
	complete := if is_time_only { '0001-01-01T' + s } else { s }
	return time.parse_rfc3339(complete)!
}
// Checker checks a tree of TOML `ast.Value`'s for common errors.
pub struct Checker {
pub:
@ -318,8 +326,21 @@ fn (c &Checker) check_date_time(dt ast.DateTime) ! {
col: dt.pos.col + split[0].len
}
})!
// Use V's builtin functionality to validate the string
time.parse_rfc3339(lit) or {
// Append a simulated time offset when it is missing, so that the string can still be checked: TOML supports local times, but RFC 3339 does not.
mut has_time_offset := false
for ch in lit#[19..] {
if ch in [u8(`-`), `+`, `Z`] {
has_time_offset = true
break
}
}
mut lit_with_offset := lit
if !has_time_offset {
lit_with_offset += 'Z'
}
toml_parse_time(lit_with_offset) or {
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' "${lit}" is not a valid RFC 3339 Date-Time format string "${err}". In ...${c.excerpt(dt.pos)}...')
}
@ -352,8 +373,7 @@ fn (c &Checker) check_date(date ast.Date) ! {
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' "${lit}" does not have a valid RFC 3339 day indication in ...${c.excerpt(date.pos)}...')
}
// Use V's builtin functionality to validate the string
time.parse_rfc3339(lit) or {
toml_parse_time(lit) or {
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' "${lit}" is not a valid RFC 3339 Date format string "${err}". In ...${c.excerpt(date.pos)}...')
}
@ -380,8 +400,22 @@ fn (c &Checker) check_time(t ast.Time) ! {
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' "${lit}" is not a valid RFC 3339 Time format string in ...${c.excerpt(t.pos)}...')
}
// Use V's builtin functionality to validate the time string
time.parse_rfc3339(parts[0]) or {
// Append a simulated time offset when it is missing, so that the string can still be checked: TOML supports local times, but RFC 3339 does not.
mut has_time_offset := false
for ch in parts[0]#[8..] {
if ch in [u8(`-`), `+`, `Z`] {
has_time_offset = true
break
}
}
mut part_with_offset := parts[0]
if !has_time_offset {
part_with_offset += 'Z'
}
toml_parse_time(part_with_offset) or {
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' "${lit}" is not a valid RFC 3339 Time format string "${err}". In ...${c.excerpt(t.pos)}...')
}